-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
PERF: SparseDataFrame._init_dict uses intermediary dict, not DataFrame #16883
Changes from 3 commits
e785961
caf3a36
31d9b28
b55b1a2
7053de5
83d8140
e0b468f
f41b490
0a98ac9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,7 +12,10 @@ | |
|
||
from pandas.core.dtypes.missing import isnull, notnull | ||
from pandas.core.dtypes.cast import maybe_upcast, find_common_type | ||
from pandas.core.dtypes.common import _ensure_platform_int, is_scipy_sparse | ||
from pandas.core.dtypes.common import ( | ||
_ensure_platform_int, is_scipy_sparse, | ||
is_float, | ||
) | ||
|
||
from pandas.core.common import _try_sort | ||
from pandas.compat.numpy import function as nv | ||
|
@@ -143,7 +146,7 @@ def _init_dict(self, data, index, columns, dtype=None): | |
sp_maker = lambda x: SparseArray(x, kind=self._default_kind, | ||
fill_value=self._default_fill_value, | ||
copy=True, dtype=dtype) | ||
sdict = DataFrame() | ||
sdict = {} | ||
for k, v in compat.iteritems(data): | ||
if isinstance(v, Series): | ||
# Force alignment, no copy necessary | ||
|
@@ -159,15 +162,12 @@ def _init_dict(self, data, index, columns, dtype=None): | |
v = [v.get(i, nan) for i in index] | ||
|
||
v = sp_maker(v) | ||
sdict[k] = v | ||
sdict[_nan_to_np_nan(k)] = v | ||
|
||
# TODO: figure out how to handle this case, all nan's? | ||
# add in any other columns we want to have (completeness) | ||
nan_vec = np.empty(len(index)) | ||
nan_vec.fill(nan) | ||
for c in columns: | ||
if c not in sdict: | ||
sdict[c] = sp_maker(nan_vec) | ||
nan_arr = sp_maker(np.full(len(index), np.nan)) | ||
sdict.update((c, nan_arr) for c in columns if c not in sdict) | ||
|
||
return to_manager(sdict, columns, index) | ||
|
||
|
@@ -846,6 +846,13 @@ def applymap(self, func): | |
return self.apply(lambda x: lmap(func, x)) | ||
|
||
|
||
def _nan_to_np_nan(value): | ||
"""Normalize nan values to singleton np.NaN object so that when NaNs are | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. use isnull There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I had wrongly assumed raw numpy was, as is often the case, faster. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. no, why do you need this at all? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Because one of the tests ( There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. have a look through how DataFrame handles this in the init_dict routines — we don't want to be reinventing the wheel here There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Incorrectly in the sense that: >>> df = pd.DataFrame({np.nan: [1, 2]})
>>> df[np.nan] # Arguably expectedly, nan matches nan
0 1
1 2
Name: nan, dtype: int64
>>> df = pd.DataFrame({np.nan: [1, 2], 2: [2, 3]}, columns=[np.nan, 2])
>>> df # nan from dict didn't match the nan from ensured Float64Index
NaN 2.0
0 NaN 2
1 NaN 3 NaNs are tricky because it generally holds that `nan != nan`. How can I improve on the above singleton approach? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. can you point to the test that is failing. I don't want to address this in this PR. This is non-trivial and needs to be common code. ok with xfailing those tests (and making an issue) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The existing failing test for sparse is |
||
used as dict keys, getitem works. | ||
""" | ||
return np.nan if is_float(value) and isnull(value) else value | ||
|
||
|
||
def to_manager(sdf, columns, index): | ||
""" create and return the block manager from a dataframe of series, | ||
columns, index | ||
|
@@ -855,7 +862,7 @@ def to_manager(sdf, columns, index): | |
axes = [_ensure_index(columns), _ensure_index(index)] | ||
|
||
return create_block_manager_from_arrays( | ||
[sdf[c] for c in columns], columns, axes) | ||
[sdf[_nan_to_np_nan(c)] for c in columns], columns, axes) | ||
|
||
|
||
def stack_sparse_frame(frame): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe I'm being pedantic, but you can't really "fix" performance, only improve it. 😄