Skip to content

Commit

Permalink
API: default empty DataFrame to dtype=object to prevent certain class…
Browse files Browse the repository at this point in the history
… of TypeError, e.g. out of empty SQL query. closes #1783
  • Loading branch information
wesm committed Sep 18, 2012
1 parent 5b033ce commit 852a994
Show file tree
Hide file tree
Showing 6 changed files with 55 additions and 21 deletions.
6 changes: 6 additions & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ pandas 0.9.0
transposed. Legacy files will still be readable by HDFStore (#1834, #1824)
- Legacy cruft removed: pandas.stats.misc.quantileTS
- Use ISO8601 format for Period repr: monthly, daily, and on down (#1776)
- Empty DataFrame columns are now created as object dtype. This will prevent
  a class of TypeErrors that was occurring in code where the dtype of a
  column would depend on whether any data was present (e.g. a SQL query
  returning results or not) (#1783)

**Bug fixes**

Expand Down Expand Up @@ -184,6 +188,8 @@ pandas 0.9.0
datetime.tzinfo without .zone and ._utcoffset attributes (#1922)
- Fix DataFrame formatting of small, non-zero FP numbers (#1911)
- Various fixes by upcasting of date -> datetime (#1395)
- Raise a better exception when passing multiple functions with the same name,
  such as lambdas, to GroupBy.aggregate

pandas 0.8.1
============
Expand Down
7 changes: 6 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4994,7 +4994,12 @@ def _homogenize(data, index, columns, dtype=None):
if dtype is not None and issubclass(dtype.type, np.integer):
continue

v = np.empty(len(index), dtype=dtype)
if dtype is None:
# #1783
v = np.empty(len(index), dtype=object)
else:
v = np.empty(len(index), dtype=dtype)

v.fill(nan)
else:
v = data[k]
Expand Down
19 changes: 14 additions & 5 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@
class GroupByError(Exception):
pass

class DataError(GroupByError):
pass

class SpecificationError(GroupByError):
pass

def _groupby_function(name, alias, npfunc):
def f(self):
Expand Down Expand Up @@ -290,7 +295,7 @@ def mean(self):
"""
try:
return self._cython_agg_general('mean')
except GroupByError:
except DataError:
raise
except Exception: # pragma: no cover
f = lambda x: x.mean(axis=self.axis)
Expand All @@ -304,7 +309,7 @@ def median(self):
"""
try:
return self._cython_agg_general('median')
except GroupByError:
except DataError:
raise
except Exception: # pragma: no cover
f = lambda x: x.median(axis=self.axis)
Expand Down Expand Up @@ -375,7 +380,7 @@ def _cython_agg_general(self, how):
output[name] = result

if len(output) == 0:
raise GroupByError('No numeric types to aggregate')
raise DataError('No numeric types to aggregate')

return self._wrap_aggregated_output(output, names)

Expand Down Expand Up @@ -1270,6 +1275,10 @@ def _aggregate_multiple_funcs(self, arg):
results = {}

for name, func in arg:
if name in results:
raise SpecificationError('Function names must be unique, '
'found multiple named %s' % name)

results[name] = self.aggregate(func)

return DataFrame(results, columns=columns)
Expand Down Expand Up @@ -1415,7 +1424,7 @@ def _cython_agg_blocks(self, how):
new_blocks.append(newb)

if len(new_blocks) == 0:
raise GroupByError('No numeric types to aggregate')
raise DataError('No numeric types to aggregate')

return new_blocks

Expand Down Expand Up @@ -1542,7 +1551,7 @@ def _aggregate_multiple_funcs(self, arg):
grouper=self.grouper)
results.append(colg.aggregate(arg))
keys.append(col)
except (TypeError, GroupByError):
except (TypeError, DataError):
pass

result = concat(results, keys=keys, axis=1)
Expand Down
7 changes: 5 additions & 2 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -951,7 +951,7 @@ def reindex_axis(self, new_axis, method=None, axis=0, copy=True):
result.axes[axis] = new_axis

if axis == 0:
# patch ref_items
# patch ref_items, #1823
for blk in result.blocks:
blk.ref_items = new_axis

Expand Down Expand Up @@ -1290,7 +1290,10 @@ def form_blocks(data, axes):

if len(extra_items):
shape = (len(extra_items),) + tuple(len(x) for x in axes[1:])
block_values = np.empty(shape, dtype=float)

# empty items -> dtype object
block_values = np.empty(shape, dtype=object)

block_values.fill(nan)

na_block = make_block(block_values, extra_items, items,
Expand Down
16 changes: 11 additions & 5 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1593,12 +1593,12 @@ def test_constructor_dict(self):
tm.assert_dict_equal(self.ts2, frame['col2'], compare_keys=False)

frame = DataFrame({'col1' : self.ts1,
'col2' : self.ts2},
'col2' : self.ts2},
columns=['col2', 'col3', 'col4'])

self.assertEqual(len(frame), len(self.ts2))
self.assert_('col1' not in frame)
self.assert_(np.isnan(frame['col3']).all())
self.assert_(isnull(frame['col3']).all())

# Corner cases
self.assertEqual(len(DataFrame({})), 0)
Expand Down Expand Up @@ -1888,7 +1888,11 @@ def test_constructor_corner(self):

# does not error but ends up float
df = DataFrame(index=range(10), columns=['a','b'], dtype=int)
self.assert_(df.values.dtype == np.float64)
self.assert_(df.values.dtype == np.object_)

# #1783 empty dtype object
df = DataFrame({}, columns=['foo', 'bar'])
self.assert_(df.values.dtype == np.object_)

def test_constructor_scalar_inference(self):
data = {'int' : 1, 'bool' : True,
Expand Down Expand Up @@ -3305,7 +3309,9 @@ def test_to_csv_multiindex(self):
recons = DataFrame.from_csv(path)
exp = tsframe[:0]
exp.index = []
assert_frame_equal(recons, exp)

self.assert_(recons.columns.equals(exp.columns))
self.assert_(len(recons) == 0)

def test_to_csv_float32_nanrep(self):
df = DataFrame(np.random.randn(1, 4).astype(np.float32))
Expand Down Expand Up @@ -6632,7 +6638,7 @@ def test_boolean_indexing(self):

def test_sum_bools(self):
df = DataFrame(index=range(1), columns=range(10))
bools = np.isnan(df)
bools = isnull(df)
self.assert_(bools.sum(axis=1)[0] == 10)

def test_fillna_col_reordering(self):
Expand Down
21 changes: 13 additions & 8 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from pandas.core.index import Index, MultiIndex
from pandas.core.common import rands
from pandas.core.api import Categorical, DataFrame
from pandas.core.groupby import GroupByError
from pandas.core.groupby import GroupByError, SpecificationError, DataError
from pandas.core.series import Series
from pandas.util.testing import (assert_panel_equal, assert_frame_equal,
assert_series_equal, assert_almost_equal)
Expand Down Expand Up @@ -252,11 +252,10 @@ def test_agg_apply_corner(self):

# DataFrame
grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan)
assert_frame_equal(grouped.sum(),
DataFrame(columns=self.tsframe.columns))
assert_frame_equal(grouped.agg(np.sum),
DataFrame(columns=self.tsframe.columns))
assert_frame_equal(grouped.apply(np.sum), DataFrame({}))
exp_df = DataFrame(columns=self.tsframe.columns, dtype=float)
assert_frame_equal(grouped.sum(), exp_df)
assert_frame_equal(grouped.agg(np.sum), exp_df)
assert_frame_equal(grouped.apply(np.sum), DataFrame({}, dtype=float))

def test_agg_grouping_is_list_tuple(self):
from pandas.core.groupby import Grouping
Expand Down Expand Up @@ -1078,11 +1077,11 @@ def test_cython_agg_boolean(self):
def test_cython_agg_nothing_to_agg(self):
frame = DataFrame({'a': np.random.randint(0, 5, 50),
'b': ['foo', 'bar'] * 25})
self.assertRaises(GroupByError, frame.groupby('a')['b'].mean)
self.assertRaises(DataError, frame.groupby('a')['b'].mean)

frame = DataFrame({'a': np.random.randint(0, 5, 50),
'b': ['foo', 'bar'] * 25})
self.assertRaises(GroupByError, frame[['b']].groupby(frame['a']).mean)
self.assertRaises(DataError, frame[['b']].groupby(frame['a']).mean)

def test_wrap_aggregated_output_multindex(self):
df = self.mframe.T
Expand Down Expand Up @@ -1847,6 +1846,12 @@ def test_multiple_functions_tuples_and_non_tuples(self):
expected = self.df.groupby('A').agg(ex_funcs)
assert_frame_equal(result, expected)

def test_agg_multiple_functions_too_many_lambdas(self):
grouped = self.df.groupby('A')
funcs = ['mean', lambda x: x.mean(), lambda x: x.std()]

self.assertRaises(SpecificationError, grouped.agg, funcs)

def test_more_flexible_frame_multi_function(self):
from pandas import concat

Expand Down

0 comments on commit 852a994

Please sign in to comment.