diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 3cfd92d778823..9b89ec99e8df6 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1063,7 +1063,9 @@ def astype(self, dtype=None, copy=True): """ dtype = self.dtype.update_dtype(dtype) subtype = dtype._subtype_with_str - sp_values = astype_nansafe(self.sp_values, subtype, copy=copy) + # TODO copy=False is broken for astype_nansafe with int -> float, so cannot + # passthrough copy keyword: https://github.com/pandas-dev/pandas/issues/34456 + sp_values = astype_nansafe(self.sp_values, subtype, copy=True) if sp_values is self.sp_values and copy: sp_values = sp_values.copy() diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index 8d17ed412f6b4..b3da9cbeb44af 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -360,6 +360,13 @@ def _subtype_with_str(self): return self.subtype def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: + # TODO for now only handle SparseDtypes and numpy dtypes => extend + # with other compatible extension dtypes + if any( + isinstance(x, ExtensionDtype) and not isinstance(x, SparseDtype) + for x in dtypes + ): + return None fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)] fill_value = fill_values[0] @@ -375,6 +382,5 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: stacklevel=6, ) - # TODO also handle non-numpy other dtypes np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes] return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index ca3a41813f3d3..fb47b33ce9890 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -1,6 +1,7 @@ """ Utility functions related to concat. 
""" +from typing import cast import numpy as np @@ -21,6 +22,7 @@ from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCRangeIndex, ABCSeries from pandas.core.arrays import ExtensionArray +from pandas.core.arrays.sparse import SparseArray from pandas.core.construction import array @@ -81,6 +83,13 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: except ValueError: return arr.astype(object, copy=False) + if is_sparse(arr) and not is_sparse(dtype): + # problem case: SparseArray.astype(dtype) doesn't follow the specified + # dtype exactly, but converts this to Sparse[dtype] -> first manually + # convert to dense array + arr = cast(SparseArray, arr) + return arr.to_dense().astype(dtype, copy=False) + if ( isinstance(arr, np.ndarray) and arr.dtype.kind in ["m", "M"] diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 8aa146d613dc3..ff35876ab2e73 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -642,6 +642,8 @@ def _is_boolean(self) -> bool: return is_bool_dtype(self.categories) def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: + from pandas.core.arrays.sparse import SparseDtype + # check if we have all categorical dtype with identical categories if all(isinstance(x, CategoricalDtype) for x in dtypes): first = dtypes[0] @@ -658,6 +660,8 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: elif any(non_init_cats): return None + # categorical is aware of Sparse -> extract sparse subdtypes + dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes] # extract the categories' dtype non_cat_dtypes = [ x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes diff --git a/pandas/tests/arrays/sparse/test_combine_concat.py b/pandas/tests/arrays/sparse/test_combine_concat.py index f1697dc9ff7ce..0f09af269148b 100644 --- a/pandas/tests/arrays/sparse/test_combine_concat.py +++ 
b/pandas/tests/arrays/sparse/test_combine_concat.py @@ -1,6 +1,7 @@ import numpy as np import pytest +import pandas as pd import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray @@ -29,3 +30,33 @@ def test_uses_first_kind(self, kind): expected = np.array([1, 2, 1, 2, 2], dtype="int64") tm.assert_numpy_array_equal(result.sp_values, expected) assert result.kind == kind + + +@pytest.mark.parametrize( + "other, expected_dtype", + [ + # compatible dtype -> preserve sparse + (pd.Series([3, 4, 5], dtype="int64"), pd.SparseDtype("int64", 0)), + # (pd.Series([3, 4, 5], dtype="Int64"), pd.SparseDtype("int64", 0)), + # incompatible dtype -> Sparse[common dtype] + (pd.Series([1.5, 2.5, 3.5], dtype="float64"), pd.SparseDtype("float64", 0)), + # incompatible dtype -> Sparse[object] dtype + (pd.Series(["a", "b", "c"], dtype=object), pd.SparseDtype(object, 0)), + # categorical with compatible categories -> dtype of the categories + (pd.Series([3, 4, 5], dtype="category"), np.dtype("int64")), + (pd.Series([1.5, 2.5, 3.5], dtype="category"), np.dtype("float64")), + # categorical with incompatible categories -> object dtype + (pd.Series(["a", "b", "c"], dtype="category"), np.dtype(object)), + ], +) +def test_concat_with_non_sparse(other, expected_dtype): + # https://github.com/pandas-dev/pandas/issues/34336 + s_sparse = pd.Series([1, 0, 2], dtype=pd.SparseDtype("int64", 0)) + + result = pd.concat([s_sparse, other], ignore_index=True) + expected = pd.Series(list(s_sparse) + list(other)).astype(expected_dtype) + tm.assert_series_equal(result, expected) + + result = pd.concat([other, s_sparse], ignore_index=True) + expected = pd.Series(list(other) + list(s_sparse)).astype(expected_dtype) + tm.assert_series_equal(result, expected)