-
-
Notifications
You must be signed in to change notification settings - Fork 18.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BUG: astype fill_value for SparseArray.astype #23547
Merged
jreback
merged 11 commits into
pandas-dev:master
from
TomAugspurger:sparse-astype-fill-value
Nov 12, 2018
Merged
Changes from all commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
7b2da4c
BUG: astype fill_value for SparseArray.astype
TomAugspurger 232921b
object type, lint
TomAugspurger 7454e31
text
TomAugspurger 9c3856d
Merge remote-tracking branch 'upstream/master' into sparse-astype-fil…
TomAugspurger 49c90b0
Merge remote-tracking branch 'upstream/master' into sparse-astype-fil…
TomAugspurger 1cc43d6
Moved to astype
TomAugspurger 57d32ae
closing paren
TomAugspurger d93d98f
astype -> update_dtype
TomAugspurger 4f4b3a3
pytest.raises
TomAugspurger 3dfc07e
handle nan
TomAugspurger 173a28a
Merge remote-tracking branch 'upstream/master' into sparse-astype-fil…
TomAugspurger File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -284,6 +284,83 @@ def is_dtype(cls, dtype): | |
return True | ||
return isinstance(dtype, np.dtype) or dtype == 'Sparse' | ||
|
||
def update_dtype(self, dtype): | ||
"""Convert the SparseDtype to a new dtype. | ||
|
||
This takes care of converting the ``fill_value``. | ||
|
||
Parameters | ||
---------- | ||
dtype : Union[str, numpy.dtype, SparseDtype] | ||
The new dtype to use. | ||
|
||
* For a SparseDtype, it is simply returned | ||
* For a NumPy dtype (or str), the current fill value | ||
is converted to the new dtype, and a SparseDtype | ||
with `dtype` and the new fill value is returned. | ||
|
||
Returns | ||
------- | ||
SparseDtype | ||
A new SparseDtype with the correct `dtype` and fill value | |
for that `dtype`. | ||
|
||
Raises | ||
------ | ||
ValueError | ||
When the current fill value cannot be converted to the | ||
new `dtype` (e.g. trying to convert ``np.nan`` to an | ||
integer dtype). | ||
|
||
|
||
Examples | ||
-------- | ||
>>> SparseDtype(int, 0).update_dtype(float) | ||
Sparse[float64, 0.0] | ||
|
||
>>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan)) | ||
Sparse[float64, nan] | ||
""" | ||
cls = type(self) | ||
dtype = pandas_dtype(dtype) | ||
|
||
if not isinstance(dtype, cls): | ||
fill_value = astype_nansafe(np.array(self.fill_value), | ||
dtype).item() | ||
dtype = cls(dtype, fill_value=fill_value) | ||
|
||
return dtype | ||
|
||
@property | ||
def _subtype_with_str(self): | ||
""" | ||
Whether the SparseDtype's subtype should be considered ``str``. | ||
|
||
Typically, pandas will store string data in an object-dtype array. | ||
When converting values to a dtype, e.g. in ``.astype``, we need to | ||
be more specific: we need the actual underlying type. | |
|
||
Returns | ||
------- | ||
|
||
>>> SparseDtype(int, 1)._subtype_with_str | ||
dtype('int64') | ||
|
||
>>> SparseDtype(object, 1)._subtype_with_str | ||
dtype('O') | ||
|
||
>>> dtype = SparseDtype(str, '') | ||
>>> dtype.subtype | ||
dtype('O') | ||
|
||
>>> dtype._subtype_with_str | ||
str | ||
""" | ||
if isinstance(self.fill_value, compat.string_types): | ||
return type(self.fill_value) | ||
return self.subtype | ||
|
||
|
||
# ---------------------------------------------------------------------------- | ||
# Array | ||
|
||
|
@@ -614,7 +691,7 @@ def __array__(self, dtype=None, copy=True): | |
# Can't put pd.NaT in a datetime64[ns] | ||
fill_value = np.datetime64('NaT') | ||
try: | ||
dtype = np.result_type(self.sp_values.dtype, fill_value) | ||
dtype = np.result_type(self.sp_values.dtype, type(fill_value)) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This was having trouble with string fill values. |
||
except TypeError: | ||
dtype = object | ||
|
||
|
@@ -996,7 +1073,7 @@ def _take_with_fill(self, indices, fill_value=None): | |
if len(self) == 0: | ||
# Empty... Allow taking only if all empty | ||
if (indices == -1).all(): | ||
dtype = np.result_type(self.sp_values, fill_value) | ||
dtype = np.result_type(self.sp_values, type(fill_value)) | ||
taken = np.empty_like(indices, dtype=dtype) | ||
taken.fill(fill_value) | ||
return taken | ||
|
@@ -1009,7 +1086,7 @@ def _take_with_fill(self, indices, fill_value=None): | |
if self.sp_index.npoints == 0: | ||
# Avoid taking from the empty self.sp_values | ||
taken = np.full(sp_indexer.shape, fill_value=fill_value, | ||
dtype=np.result_type(fill_value)) | ||
dtype=np.result_type(type(fill_value))) | ||
else: | ||
taken = self.sp_values.take(sp_indexer) | ||
|
||
|
@@ -1030,12 +1107,13 @@ def _take_with_fill(self, indices, fill_value=None): | |
result_type = taken.dtype | ||
|
||
if m0.any(): | ||
result_type = np.result_type(result_type, self.fill_value) | ||
result_type = np.result_type(result_type, | ||
type(self.fill_value)) | ||
taken = taken.astype(result_type) | ||
taken[old_fill_indices] = self.fill_value | ||
|
||
if m1.any(): | ||
result_type = np.result_type(result_type, fill_value) | ||
result_type = np.result_type(result_type, type(fill_value)) | ||
taken = taken.astype(result_type) | ||
taken[new_fill_indices] = fill_value | ||
|
||
|
@@ -1061,7 +1139,7 @@ def _take_without_fill(self, indices): | |
# edge case in take... | ||
# I think just return | ||
out = np.full(indices.shape, self.fill_value, | ||
dtype=np.result_type(self.fill_value)) | ||
dtype=np.result_type(type(self.fill_value))) | ||
arr, sp_index, fill_value = make_sparse(out, | ||
fill_value=self.fill_value) | ||
return type(self)(arr, sparse_index=sp_index, | ||
|
@@ -1073,7 +1151,7 @@ def _take_without_fill(self, indices): | |
|
||
if fillable.any(): | ||
# TODO: may need to coerce array to fill value | ||
result_type = np.result_type(taken, self.fill_value) | ||
result_type = np.result_type(taken, type(self.fill_value)) | ||
taken = taken.astype(result_type) | ||
taken[fillable] = self.fill_value | ||
|
||
|
@@ -1093,7 +1171,9 @@ def _concat_same_type(cls, to_concat): | |
|
||
fill_value = fill_values[0] | ||
|
||
if len(set(fill_values)) > 1: | ||
# np.nan isn't a singleton, so we may end up with multiple | ||
# NaNs here, so we ignore the all NA case too. | |
if not (len(set(fill_values)) == 1 or isna(fill_values).all()): | ||
warnings.warn("Concatenating sparse arrays with multiple fill " | ||
"values: '{}'. Picking the first and " | ||
"converting the rest.".format(fill_values), | ||
|
@@ -1212,13 +1292,10 @@ def astype(self, dtype=None, copy=True): | |
IntIndex | ||
Indices: array([2, 3], dtype=int32) | ||
""" | ||
dtype = pandas_dtype(dtype) | ||
|
||
if not isinstance(dtype, SparseDtype): | ||
dtype = SparseDtype(dtype, fill_value=self.fill_value) | ||
|
||
dtype = self.dtype.update_dtype(dtype) | ||
subtype = dtype._subtype_with_str | ||
sp_values = astype_nansafe(self.sp_values, | ||
dtype.subtype, | ||
subtype, | ||
copy=copy) | ||
if sp_values is self.sp_values and copy: | ||
sp_values = sp_values.copy() | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess this is only for Sparse which is ok