Skip to content

Commit

Permalink
BUG: Fix initialization of DataFrame from dict with NaN as key
Browse files Browse the repository at this point in the history
closes #18455
  • Loading branch information
toobaz committed Dec 2, 2017
1 parent d163de7 commit f7447b3
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 38 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -244,5 +244,7 @@ Other
- Improved error message when attempting to use a Python keyword as an identifier in a numexpr query (:issue:`18221`)
- Fixed a bug where creating a Series from an array that contains both tz-naive and tz-aware values will result in a Series whose dtype is tz-aware instead of object (:issue:`16406`)
- Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`)
- Fixed construction of a :class:`DataFrame` from a ``dict`` containing ``NaN`` as key (:issue:`18455`)
- Constructing a :class:`DataFrame` from a ``dict`` containing scalar values no longer raises an error when the corresponding keys are not included in the passed ``columns`` (such unused items are simply ignored)
- Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`)
-
49 changes: 17 additions & 32 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,44 +416,29 @@ def _init_dict(self, data, index, columns, dtype=None):
Needs to handle a lot of exceptional cases.
"""
if columns is not None:
columns = _ensure_index(columns)
arrays = Series(data, index=columns, dtype=object)
data_names = arrays.index

# GH10856
# raise ValueError if only scalars in dict
missing = arrays.isnull()
if index is None:
extract_index(list(data.values()))

# prefilter if columns passed
data = {k: v for k, v in compat.iteritems(data) if k in columns}

if index is None:
index = extract_index(list(data.values()))

# GH10856
# raise ValueError if only scalars in dict
index = extract_index(arrays[~missing].tolist())
else:
index = _ensure_index(index)

arrays = []
data_names = []
for k in columns:
if k not in data:
# no obvious "empty" int column
if dtype is not None and issubclass(dtype.type,
np.integer):
continue

if dtype is None:
# 1783
v = np.empty(len(index), dtype=object)
elif np.issubdtype(dtype, np.flexible):
v = np.empty(len(index), dtype=object)
else:
v = np.empty(len(index), dtype=dtype)

v.fill(np.nan)
# no obvious "empty" int column
if missing.any() and not (dtype is not None and
issubclass(dtype.type, np.integer)):
if dtype is None or np.issubdtype(dtype, np.flexible):
# 1783
nan_dtype = object
else:
v = data[k]
data_names.append(k)
arrays.append(v)
nan_dtype = dtype
v = np.empty(len(index), dtype=nan_dtype)
v.fill(np.nan)
arrays.loc[missing] = [v] * missing.sum()
arrays = arrays.tolist()

else:
keys = list(data.keys())
Expand Down
1 change: 0 additions & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6185,7 +6185,6 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
if not is_bool_dtype(dt):
raise ValueError(msg.format(dtype=dt))

cond = cond.astype(bool, copy=False)
cond = -cond if inplace else cond

# try to align with other
Expand Down
50 changes: 46 additions & 4 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,8 +275,50 @@ def test_constructor_dict(self):
with tm.assert_raises_regex(ValueError, msg):
DataFrame({'a': 0.7}, columns=['a'])

with tm.assert_raises_regex(ValueError, msg):
DataFrame({'a': 0.7}, columns=['b'])
# No reason to raise if item is not used:
result = DataFrame({'a': 0.7}, columns=['b'])
expected = DataFrame(columns=['b'])
tm.assert_frame_equal(result, expected)

# Regression test for GH 18455: building a DataFrame from a dict whose keys
# include a missing-value label. Runs once per ``value``; ``2`` is an ordinary
# (non-missing) label, the others are np.nan, None and float('nan').
@pytest.mark.parametrize("value", [2, np.nan, None, float('nan')])
def test_constructor_dict_nan_key(self, value):
# GH 18455
# ``value`` is used both as a column label and as a row label.
cols = [1, value, 3]
idx = ['a', value]
# values[c] holds the two entries of column c, i.e. [c, c + 3].
values = [[0, 3], [1, 4], [2, 5]]
# Map each column label (including ``value``) to a Series indexed by idx.
data = {cols[c]: pd.Series(values[c], index=idx) for c in range(3)}
# Case 1: neither index nor columns passed. Rows are sorted by column 1 and
# columns by row 'a' before comparing -- presumably so the check does not
# depend on the label order chosen by the constructor (TODO confirm).
result = pd.DataFrame(data).sort_values(1).sort_values('a', axis=1)
# Expected frame: rows [0, 1, 2] and [3, 4, 5], labelled by idx / cols.
expected = pd.DataFrame(np.arange(6).reshape(2, 3),
index=idx, columns=cols)
tm.assert_frame_equal(result, expected)

# Case 2: index passed explicitly, so only the columns are sorted.
result = pd.DataFrame(data, index=idx).sort_values('a', axis=1)
tm.assert_frame_equal(result, expected)

# Case 3: both index and columns passed explicitly; no sorting applied.
result = pd.DataFrame(data, index=idx, columns=cols)
tm.assert_frame_equal(result, expected)

# Tuple-key variant of the GH 18455 regression test: the dict keys and the row
# labels are tuples that contain a missing value. Marked xfail with GH 18485
# as the stated reason ("MI" presumably stands for MultiIndex).
# NOTE(review): the reason string below ends with an unmatched ')'.
@pytest.mark.xfail(reason='GH 18485 comparison fails on MI with NaNs)')
@pytest.mark.parametrize("value", [np.nan, None, float('nan')])
def test_constructor_dict_nan_tuple_key(self, value):
# GH 18455
# ``value`` appears inside both the column tuples and the row tuples.
cols = Index([(11, 21), (value, 22), (13, value)])
idx = Index([('a', value), (value, 2)])
# values[c] holds the two entries of column c, i.e. [c, c + 3].
values = [[0, 3], [1, 4], [2, 5]]
# Map each column tuple to a Series indexed by idx.
data = {cols[c]: pd.Series(values[c], index=idx) for c in range(3)}
# Case 1: neither index nor columns passed. Rows are sorted by column
# (11, 21) and columns by row ('a', value) before comparing.
result = (DataFrame(data)
.sort_values((11, 21))
.sort_values(('a', value), axis=1))
# Expected frame: rows [0, 1, 2] and [3, 4, 5], labelled by idx / cols.
expected = pd.DataFrame(np.arange(6).reshape(2, 3),
index=idx, columns=cols)
tm.assert_frame_equal(result, expected)

# Case 2: index passed explicitly, so only the columns are sorted.
result = pd.DataFrame(data, index=idx).sort_values(('a', value),
axis=1)
tm.assert_frame_equal(result, expected)

# Case 3: both index and columns passed explicitly; no sorting applied.
result = pd.DataFrame(data, index=idx, columns=cols)
tm.assert_frame_equal(result, expected)

def test_constructor_multi_index(self):
# GH 4078
Expand Down Expand Up @@ -723,15 +765,15 @@ def test_constructor_corner(self):

# does not error but ends up float
df = DataFrame(index=lrange(10), columns=['a', 'b'], dtype=int)
assert df.values.dtype == np.object_
assert df.values.dtype == np.dtype('float64')

# #1783 empty dtype object
df = DataFrame({}, columns=['foo', 'bar'])
assert df.values.dtype == np.object_

df = DataFrame({'b': 1}, index=lrange(10), columns=list('abc'),
dtype=int)
assert df.values.dtype == np.object_
assert df.values.dtype == np.dtype('float64')

def test_constructor_scalar_inference(self):
data = {'int': 1, 'bool': True,
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/test_excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,7 @@ def test_read_one_empty_col_with_header(self):
)
expected_header_none = DataFrame(pd.Series([0], dtype='int64'))
tm.assert_frame_equal(actual_header_none, expected_header_none)
expected_header_zero = DataFrame(columns=[0], dtype='int64')
expected_header_zero = DataFrame(columns=[0])
tm.assert_frame_equal(actual_header_zero, expected_header_zero)

def test_set_column_names_in_parameter(self):
Expand Down

0 comments on commit f7447b3

Please sign in to comment.