Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: fix concat of Sparse with non-sparse dtypes #34338

Merged
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1063,7 +1063,8 @@ def astype(self, dtype=None, copy=True):
"""
dtype = self.dtype.update_dtype(dtype)
subtype = dtype._subtype_with_str
sp_values = astype_nansafe(self.sp_values, subtype, copy=copy)
# TODO copy=False is broken for astype_nansafe with int -> float
sp_values = astype_nansafe(self.sp_values, subtype, copy=True)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Opened #34456 for this (and added link in the comment)

if sp_values is self.sp_values and copy:
sp_values = sp_values.copy()

Expand Down
8 changes: 7 additions & 1 deletion pandas/core/arrays/sparse/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,13 @@ def _subtype_with_str(self):
return self.subtype

def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
# TODO for now only handle SparseDtypes and numpy dtypes => extend
# with other compatibtle extension dtypes
if any(
isinstance(x, ExtensionDtype) and not isinstance(x, SparseDtype)
for x in dtypes
):
return None

fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)]
fill_value = fill_values[0]
Expand All @@ -371,6 +378,5 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
stacklevel=6,
)

# TODO also handle non-numpy other dtypes
np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value)
6 changes: 6 additions & 0 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,12 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
except ValueError:
return arr.astype(object, copy=False)

if is_sparse(arr.dtype) and not is_sparse(dtype):
# problem case: SparseArray.astype(dtype) doesn't follow the specified
# dtype exactly, but converts this to Sparse[dtype] -> first manually
# convert to dense array
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Opened #34457 for this

return arr.to_dense().astype(dtype, copy=False)

if (
isinstance(arr, np.ndarray)
and arr.dtype.kind in ["m", "M"]
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -642,6 +642,8 @@ def _is_boolean(self) -> bool:
return is_bool_dtype(self.categories)

def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
from pandas.core.arrays.sparse import SparseDtype

# check if we have all categorical dtype with identical categories
if all(isinstance(x, CategoricalDtype) for x in dtypes):
first = dtypes[0]
Expand All @@ -658,6 +660,8 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
elif any(non_init_cats):
return None

# categorical is aware of Sparse -> extract sparse subdtypes
dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
# extract the categories' dtype
non_cat_dtypes = [
x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes
Expand Down
31 changes: 31 additions & 0 deletions pandas/tests/arrays/sparse/test_combine_concat.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pytest

import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray

Expand Down Expand Up @@ -29,3 +30,33 @@ def test_uses_first_kind(self, kind):
expected = np.array([1, 2, 1, 2, 2], dtype="int64")
tm.assert_numpy_array_equal(result.sp_values, expected)
assert result.kind == kind


@pytest.mark.parametrize(
"other, expected_dtype",
[
# compatible dtype -> preserve sparse
(pd.Series([3, 4, 5], dtype="int64"), pd.SparseDtype("int64", 0)),
# (pd.Series([3, 4, 5], dtype="Int64"), pd.SparseDtype("int64", 0)),
# incompatible dtype -> Sparse[common dtype]
(pd.Series([1.5, 2.5, 3.5], dtype="float64"), pd.SparseDtype("float64", 0)),
# incompatible dtype -> Sparse[object] dtype
(pd.Series(["a", "b", "c"], dtype=object), pd.SparseDtype(object, 0)),
# categorical with compatible categories -> dtype of the categories
(pd.Series([3, 4, 5], dtype="category"), np.dtype("int64")),
(pd.Series([1.5, 2.5, 3.5], dtype="category"), np.dtype("float64")),
# categorical with incompatible categories -> object dtype
(pd.Series(["a", "b", "c"], dtype="category"), np.dtype(object)),
],
)
def test_concat_with_non_sparse(other, expected_dtype):
# https://github.com/pandas-dev/pandas/issues/34336
s_sparse = pd.Series([1, 0, 2], dtype=pd.SparseDtype("int64", 0))

result = pd.concat([s_sparse, other], ignore_index=True)
expected = pd.Series(list(s_sparse) + list(other)).astype(expected_dtype)
tm.assert_series_equal(result, expected)

result = pd.concat([other, s_sparse], ignore_index=True)
expected = pd.Series(list(other) + list(s_sparse)).astype(expected_dtype)
tm.assert_series_equal(result, expected)