diff --git a/doc/redirects.csv b/doc/redirects.csv index a7886779c97d5..a1504f9175480 100644 --- a/doc/redirects.csv +++ b/doc/redirects.csv @@ -503,7 +503,6 @@ generated/pandas.DataFrame.to_parquet,../reference/api/pandas.DataFrame.to_parqu generated/pandas.DataFrame.to_period,../reference/api/pandas.DataFrame.to_period generated/pandas.DataFrame.to_pickle,../reference/api/pandas.DataFrame.to_pickle generated/pandas.DataFrame.to_records,../reference/api/pandas.DataFrame.to_records -generated/pandas.DataFrame.to_sparse,../reference/api/pandas.DataFrame.to_sparse generated/pandas.DataFrame.to_sql,../reference/api/pandas.DataFrame.to_sql generated/pandas.DataFrame.to_stata,../reference/api/pandas.DataFrame.to_stata generated/pandas.DataFrame.to_string,../reference/api/pandas.DataFrame.to_string @@ -1432,7 +1431,6 @@ generated/pandas.Series.to_msgpack,../reference/api/pandas.Series.to_msgpack generated/pandas.Series.to_numpy,../reference/api/pandas.Series.to_numpy generated/pandas.Series.to_period,../reference/api/pandas.Series.to_period generated/pandas.Series.to_pickle,../reference/api/pandas.Series.to_pickle -generated/pandas.Series.to_sparse,../reference/api/pandas.Series.to_sparse generated/pandas.Series.to_sql,../reference/api/pandas.Series.to_sql generated/pandas.Series.to_string,../reference/api/pandas.Series.to_string generated/pandas.Series.to_timestamp,../reference/api/pandas.Series.to_timestamp diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index b1c6172fb1261..4982edeb7f85b 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -356,15 +356,7 @@ Serialization / IO / conversion DataFrame.to_msgpack DataFrame.to_gbq DataFrame.to_records - DataFrame.to_sparse DataFrame.to_dense DataFrame.to_string DataFrame.to_clipboard DataFrame.style - -Sparse -~~~~~~ -.. 
autosummary:: - :toctree: api/ - - SparseDataFrame.to_coo diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 7ba625c141f24..5d825c8092efc 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -576,18 +576,7 @@ Serialization / IO / conversion Series.to_sql Series.to_msgpack Series.to_json - Series.to_sparse Series.to_dense Series.to_string Series.to_clipboard Series.to_latex - - -Sparse ------- - -.. autosummary:: - :toctree: api/ - - SparseSeries.to_coo - SparseSeries.from_coo diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index d5a30f98e07b1..2c8f66dd99e72 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -4641,6 +4641,14 @@ Several caveats. See the `Full Documentation `__. +.. ipython:: python + :suppress: + + import warnings + # This can be removed once building with pyarrow >=0.15.0 + warnings.filterwarnings("ignore", "The Sparse", FutureWarning) + + .. ipython:: python df = pd.DataFrame({'a': list('abc'), diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index 98fd30f67d05b..c258a8840b714 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -6,12 +6,6 @@ Sparse data structures ********************** -.. note:: - - ``SparseSeries`` and ``SparseDataFrame`` have been deprecated. Their purpose - is served equally well by a :class:`Series` or :class:`DataFrame` with - sparse values. See :ref:`sparse.migration` for tips on migrating. - Pandas provides data structures for efficiently storing sparse data. These are not necessarily sparse in the typical "mostly 0". Rather, you can view these objects as being "compressed" where any data matching a specific value (``NaN`` / missing value, though any value @@ -168,6 +162,11 @@ the correct dense result. Migrating --------- +.. note:: + + ``SparseSeries`` and ``SparseDataFrame`` were removed in pandas 1.0.0. 
This migration + guide is present to aid in migrating from previous versions. + In older versions of pandas, the ``SparseSeries`` and ``SparseDataFrame`` classes (documented below) were the preferred way to work with sparse data. With the advent of extension arrays, these subclasses are no longer needed. Their purpose is better served by using a regular Series or DataFrame with @@ -366,12 +365,3 @@ row and columns coordinates of the matrix. Note that this will consume a signifi ss_dense = pd.Series.sparse.from_coo(A, dense_index=True) ss_dense - - -.. _sparse.subclasses: - -Sparse subclasses ------------------ - -The :class:`SparseSeries` and :class:`SparseDataFrame` classes are deprecated. Visit their -API pages for usage. diff --git a/doc/source/whatsnew/v0.16.0.rst b/doc/source/whatsnew/v0.16.0.rst index 42b3b9332ca98..fc638e35ed88b 100644 --- a/doc/source/whatsnew/v0.16.0.rst +++ b/doc/source/whatsnew/v0.16.0.rst @@ -91,8 +91,7 @@ Interaction with scipy.sparse Added :meth:`SparseSeries.to_coo` and :meth:`SparseSeries.from_coo` methods (:issue:`8048`) for converting to and from ``scipy.sparse.coo_matrix`` instances (see :ref:`here <sparse.scipysparse>`). For example, given a SparseSeries with MultiIndex we can convert to a `scipy.sparse.coo_matrix` by specifying the row and column labels as index levels: -.. ipython:: python - :okwarning: +.. code-block:: python s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan]) s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), @@ -121,8 +120,7 @@ Added :meth:`SparseSeries.to_coo` and :meth:`SparseSeries.from_coo` methods (:is The from_coo method is a convenience method for creating a ``SparseSeries`` from a ``scipy.sparse.coo_matrix``: -.. ipython:: python - :okwarning: +.. 
code-block:: python from scipy import sparse A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), diff --git a/doc/source/whatsnew/v0.18.1.rst b/doc/source/whatsnew/v0.18.1.rst index 7e06e5050c5f0..f786ce513f6fe 100644 --- a/doc/source/whatsnew/v0.18.1.rst +++ b/doc/source/whatsnew/v0.18.1.rst @@ -393,8 +393,7 @@ used in the ``pandas`` implementation (:issue:`12644`, :issue:`12638`, :issue:`1 An example of this signature augmentation is illustrated below: -.. ipython:: python - :okwarning: +.. code-block:: python sp = pd.SparseDataFrame([1, 2, 3]) sp @@ -409,8 +408,7 @@ Previous behaviour: New behaviour: -.. ipython:: python - :okwarning: +.. code-block:: python np.cumsum(sp, axis=0) diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index 1dad8769a6b39..61a65415f6b57 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -1235,8 +1235,7 @@ Operators now preserve dtypes - Sparse data structure now can preserve ``dtype`` after arithmetic ops (:issue:`13848`) -.. ipython:: python - :okwarning: +.. code-block:: python s = pd.SparseSeries([0, 2, 0, 1], fill_value=0, dtype=np.int64) s.dtype @@ -1245,8 +1244,7 @@ Operators now preserve dtypes - Sparse data structure now support ``astype`` to convert internal ``dtype`` (:issue:`13900`) -.. ipython:: python - :okwarning: +.. code-block:: python s = pd.SparseSeries([1., 0., 2., 0.], fill_value=0) s diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index 62604dd3edd2d..c7278d5a47ba6 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -338,8 +338,7 @@ See the :ref:`documentation ` for more information. (:issue: All sparse formats are supported, but matrices that are not in :mod:`COOrdinate ` format will be converted, copying data as needed. -.. ipython:: python - :okwarning: +.. 
code-block:: python from scipy.sparse import csr_matrix arr = np.random.random(size=(1000, 5)) @@ -351,7 +350,7 @@ All sparse formats are supported, but matrices that are not in :mod:`COOrdinate To convert a ``SparseDataFrame`` back to sparse SciPy matrix in COO format, you can use: -.. ipython:: python +.. code-block:: python sdf.to_coo() diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index fe1e2d7826d62..2eca1c6c7636c 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -902,8 +902,7 @@ by a ``Series`` or ``DataFrame`` with sparse values. **Previous way** -.. ipython:: python - :okwarning: +.. code-block:: python df = pd.SparseDataFrame({"A": [0, 0, 1, 2]}) df.dtypes diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 8b2b3a09f8c87..432b1de6dca1b 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -91,8 +91,17 @@ Deprecations .. _whatsnew_1000.prior_deprecations: + +Removed SparseSeries and SparseDataFrame +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``SparseSeries`` and ``SparseDataFrame`` have been removed (:issue:`28425`). +We recommend using a ``Series`` or ``DataFrame`` with sparse values instead. +See :ref:`sparse.migration` for help with migrating existing code. + Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + - Removed the previously deprecated :meth:`Series.get_value`, :meth:`Series.set_value`, :meth:`DataFrame.get_value`, :meth:`DataFrame.set_value` (:issue:`17739`) - Changed the the default value of `inplace` in :meth:`DataFrame.set_index` and :meth:`Series.set_axis`. 
It now defaults to False (:issue:`27600`) - :meth:`pandas.Series.str.cat` now defaults to aligning ``others``, using ``join='left'`` (:issue:`27611`) diff --git a/pandas/__init__.py b/pandas/__init__.py index 6351b508fb0e5..59ecc7f609ae9 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -114,12 +114,7 @@ DataFrame, ) -from pandas.core.sparse.api import ( - SparseArray, - SparseDataFrame, - SparseSeries, - SparseDtype, -) +from pandas.core.sparse.api import SparseArray, SparseDtype from pandas.tseries.api import infer_freq from pandas.tseries import offsets @@ -196,8 +191,9 @@ if pandas.compat.PY37: def __getattr__(name): + import warnings + if name == "Panel": - import warnings warnings.warn( "The Panel class is removed from pandas. Accessing it " @@ -211,6 +207,17 @@ class Panel: pass return Panel + elif name in {"SparseSeries", "SparseDataFrame"}: + warnings.warn( + "The {} class is removed from pandas. Accessing it from " + "the top-level namespace will also be removed in the next " + "version".format(name), + FutureWarning, + stacklevel=2, + ) + + return type(name, (), {}) + raise AttributeError("module 'pandas' has no attribute '{}'".format(name)) @@ -219,6 +226,12 @@ class Panel: class Panel: pass + class SparseDataFrame: + pass + + class SparseSeries: + pass + # module level doc-string __doc__ = """ diff --git a/pandas/_typing.py b/pandas/_typing.py index de9fb5b944186..70ed7b4d3b376 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -12,13 +12,10 @@ from pandas.core.dtypes.dtypes import ExtensionDtype # noqa: F401 from pandas.core.indexes.base import Index # noqa: F401 from pandas.core.series import Series # noqa: F401 - from pandas.core.sparse.series import SparseSeries # noqa: F401 from pandas.core.generic import NDFrame # noqa: F401 -AnyArrayLike = TypeVar( - "AnyArrayLike", "ExtensionArray", "Index", "Series", "SparseSeries", np.ndarray -) +AnyArrayLike = TypeVar("AnyArrayLike", "ExtensionArray", "Index", "Series", np.ndarray) ArrayLike 
= TypeVar("ArrayLike", "ExtensionArray", np.ndarray) DatetimeLikeScalar = TypeVar("DatetimeLikeScalar", "Period", "Timestamp", "Timedelta") Dtype = Union[str, np.dtype, "ExtensionDtype"] diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 87240a9f986c3..b3c7b8a7c8b9f 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -5,9 +5,14 @@ import copy import pickle as pkl import sys +from typing import TYPE_CHECKING +import warnings from pandas import Index +if TYPE_CHECKING: + from pandas import Series, DataFrame + def load_reduce(self): stack = self.stack @@ -54,6 +59,41 @@ def load_reduce(self): raise +_sparse_msg = """\ + +Loading a saved '{cls}' as a {new} with sparse values. +'{cls}' is now removed. You should re-save this dataset in its new format. +""" + + +class _LoadSparseSeries: + # To load a SparseSeries as a Series[Sparse] + def __new__(cls) -> "Series": + from pandas import Series + + warnings.warn( + _sparse_msg.format(cls="SparseSeries", new="Series"), + FutureWarning, + stacklevel=6, + ) + + return Series() + + +class _LoadSparseFrame: + # To load a SparseDataFrame as a DataFrame[Sparse] + def __new__(cls) -> "DataFrame": + from pandas import DataFrame + + warnings.warn( + _sparse_msg.format(cls="SparseDataFrame", new="DataFrame"), + FutureWarning, + stacklevel=6, + ) + + return DataFrame() + + # If classes are moved, provide compat here. 
_class_locations_map = { ("pandas.core.sparse.array", "SparseArray"): ("pandas.core.arrays", "SparseArray"), @@ -101,12 +141,12 @@ def load_reduce(self): "SparseArray", ), ("pandas.sparse.series", "SparseSeries"): ( - "pandas.core.sparse.series", - "SparseSeries", + "pandas.compat.pickle_compat", + "_LoadSparseSeries", ), ("pandas.sparse.frame", "SparseDataFrame"): ( - "pandas.core.sparse.frame", - "SparseDataFrame", + "pandas.compat.pickle_compat", + "_LoadSparseFrame", ), ("pandas.indexes.base", "_new_Index"): ("pandas.core.indexes.base", "_new_Index"), ("pandas.indexes.base", "Index"): ("pandas.core.indexes.base", "Index"), @@ -139,6 +179,14 @@ def load_reduce(self): "pandas.core.indexes.numeric", "Float64Index", ), + ("pandas.core.sparse.series", "SparseSeries"): ( + "pandas.compat.pickle_compat", + "_LoadSparseSeries", + ), + ("pandas.core.sparse.frame", "SparseDataFrame"): ( + "pandas.compat.pickle_compat", + "_LoadSparseFrame", + ), } diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 24dafd11ae2b7..c88289c3a4592 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -43,7 +43,6 @@ ABCIndexClass, ABCSeries, ABCSparseArray, - ABCSparseSeries, ) from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna @@ -607,7 +606,7 @@ def __init__( if fill_value is None and isinstance(dtype, SparseDtype): fill_value = dtype.fill_value - if isinstance(data, (type(self), ABCSparseSeries)): + if isinstance(data, type(self)): # disable normal inference on dtype, sparse_index, & fill_value if sparse_index is None: sparse_index = data.sp_index @@ -1969,7 +1968,7 @@ def _delegate_method(self, name, *args, **kwargs): @classmethod def from_coo(cls, A, dense_index=False): """ - Create a SparseSeries from a scipy.sparse.coo_matrix. + Create a Series with sparse values from a scipy.sparse.coo_matrix. 
Parameters ---------- @@ -1982,7 +1981,8 @@ def from_coo(cls, A, dense_index=False): Returns ------- - s : SparseSeries + s : Series + A Series with sparse values. Examples -------- @@ -1996,7 +1996,7 @@ def from_coo(cls, A, dense_index=False): matrix([[ 0., 0., 1., 2.], [ 3., 0., 0., 0.], [ 0., 0., 0., 0.]]) - >>> ss = pd.SparseSeries.from_coo(A) + >>> ss = pd.Series.sparse.from_coo(A) >>> ss 0 2 1 3 2 @@ -2009,14 +2009,14 @@ def from_coo(cls, A, dense_index=False): from pandas.core.sparse.scipy_sparse import _coo_to_sparse_series from pandas import Series - result = _coo_to_sparse_series(A, dense_index=dense_index, sparse_series=False) + result = _coo_to_sparse_series(A, dense_index=dense_index) result = Series(result.array, index=result.index, copy=False) return result def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False): """ - Create a scipy.sparse.coo_matrix from a SparseSeries with MultiIndex. + Create a scipy.sparse.coo_matrix from a Series with MultiIndex. Use row_levels and column_levels to determine the row and column coordinates respectively. row_levels and column_levels are the names @@ -2046,10 +2046,10 @@ def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False): (2, 1, 'b', 0), (2, 1, 'b', 1)], names=['A', 'B', 'C', 'D']) - >>> ss = s.to_sparse() - >>> A, rows, columns = ss.to_coo(row_levels=['A', 'B'], - column_levels=['C', 'D'], - sort_labels=True) + >>> ss = s.astype("Sparse") + >>> A, rows, columns = ss.sparse.to_coo(row_levels=['A', 'B'], + ... column_levels=['C', 'D'], + ... sort_labels=True) >>> A <3x4 sparse matrix of type '' with 3 stored elements in COOrdinate format> diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index cf1a602e9e9f2..4ea649a2a6faf 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -273,8 +273,6 @@ def is_sparse(arr): See Also -------- - DataFrame.to_sparse : Convert DataFrame to a SparseDataFrame. 
- Series.to_sparse : Convert Series to SparseSeries. Series.to_dense : Return dense representation of a Series. Examples @@ -283,7 +281,7 @@ def is_sparse(arr): >>> is_sparse(pd.SparseArray([0, 0, 1, 0])) True - >>> is_sparse(pd.SparseSeries([0, 0, 1, 0])) + >>> is_sparse(pd.Series(pd.SparseArray([0, 0, 1, 0]))) True Returns `False` if the parameter is not sparse. @@ -300,14 +298,6 @@ def is_sparse(arr): False Returns `False` if the parameter has more than one dimension. - - >>> df = pd.SparseDataFrame([389., 24., 80.5, np.nan], - columns=['max_speed'], - index=['falcon', 'parrot', 'lion', 'monkey']) - >>> is_sparse(df) - False - >>> is_sparse(df.max_speed) - True """ from pandas.core.arrays.sparse import SparseDtype @@ -340,8 +330,6 @@ def is_scipy_sparse(arr): True >>> is_scipy_sparse(pd.SparseArray([1, 2, 3])) False - >>> is_scipy_sparse(pd.SparseSeries([1, 2, 3])) - False """ global _is_scipy_sparse @@ -1715,9 +1703,6 @@ def is_extension_type(arr): True >>> is_extension_type(pd.SparseArray([1, 2, 3])) True - >>> is_extension_type(pd.SparseSeries([1, 2, 3])) - True - >>> >>> from scipy.sparse import bsr_matrix >>> is_extension_type(bsr_matrix([1, 2, 3])) False diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index de41644f09b66..2518f330b26a3 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -52,12 +52,7 @@ def _check(cls, inst): ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",)) ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) -ABCSparseDataFrame = create_pandas_abc_type( - "ABCSparseDataFrame", "_subtyp", ("sparse_frame",) -) -ABCSparseSeries = create_pandas_abc_type( - "ABCSparseSeries", "_subtyp", ("sparse_series", "sparse_time_series") -) + ABCSparseArray = create_pandas_abc_type( "ABCSparseArray", "_subtyp", ("sparse_array", "sparse_series") ) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c12208db983e2..aeca7782e3ae5 100644 --- 
a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1943,81 +1943,6 @@ def _from_arrays(cls, arrays, columns, index, dtype=None): mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) return cls(mgr) - def to_sparse(self, fill_value=None, kind="block"): - """ - Convert to SparseDataFrame. - - .. deprecated:: 0.25.0 - - Implement the sparse version of the DataFrame meaning that any data - matching a specific value it's omitted in the representation. - The sparse DataFrame allows for a more efficient storage. - - Parameters - ---------- - fill_value : float, default None - The specific value that should be omitted in the representation. - kind : {'block', 'integer'}, default 'block' - The kind of the SparseIndex tracking where data is not equal to - the fill value: - - - 'block' tracks only the locations and sizes of blocks of data. - - 'integer' keeps an array with all the locations of the data. - - In most cases 'block' is recommended, since it's more memory - efficient. - - Returns - ------- - SparseDataFrame - The sparse representation of the DataFrame. - - See Also - -------- - DataFrame.to_dense : - Converts the DataFrame back to the its dense form. - - Examples - -------- - >>> df = pd.DataFrame([(np.nan, np.nan), - ... (1., np.nan), - ... 
(np.nan, 1.)]) - >>> df - 0 1 - 0 NaN NaN - 1 1.0 NaN - 2 NaN 1.0 - >>> type(df) - - - >>> sdf = df.to_sparse() # doctest: +SKIP - >>> sdf # doctest: +SKIP - 0 1 - 0 NaN NaN - 1 1.0 NaN - 2 NaN 1.0 - >>> type(sdf) # doctest: +SKIP - - """ - warnings.warn( - "DataFrame.to_sparse is deprecated and will be removed " - "in a future version", - FutureWarning, - stacklevel=2, - ) - - from pandas.core.sparse.api import SparseDataFrame - - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", message="SparseDataFrame") - return SparseDataFrame( - self._series, - index=self.index, - columns=self.columns, - default_kind=kind, - default_fill_value=fill_value, - ) - @deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) def to_stata( self, @@ -7238,7 +7163,6 @@ def join(self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False): 4 K4 A4 NaN 5 K5 A5 NaN """ - # For SparseDataFrame's benefit return self._join_compat( other, on=on, how=how, lsuffix=lsuffix, rsuffix=rsuffix, sort=sort ) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 42de85945cbcb..020708f319776 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5573,9 +5573,6 @@ def get_ftype_counts(self): .. deprecated:: 0.23.0 - This is useful for SparseDataFrame or for DataFrames containing - sparse arrays. - Returns ------- dtype : Series @@ -5670,7 +5667,6 @@ def ftypes(self): See Also -------- DataFrame.dtypes: Series with just dtype information. - SparseDataFrame : Container for sparse tabular data. 
Notes ----- @@ -5686,13 +5682,6 @@ def ftypes(self): 2 float64:dense 3 float64:dense dtype: object - - >>> pd.SparseDataFrame(arr).ftypes # doctest: +SKIP - 0 float64:sparse - 1 float64:sparse - 2 float64:sparse - 3 float64:sparse - dtype: object """ warnings.warn( "DataFrame.ftypes is deprecated and will " diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index e731cffea0671..f8f1455561c03 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -58,7 +58,6 @@ import pandas.core.indexes.base as ibase from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series -from pandas.core.sparse.frame import SparseDataFrame from pandas.plotting import boxplot_frame_groupby @@ -258,12 +257,6 @@ def aggregate(self, func, *args, **kwargs): result.columns.levels[0], name=self._selected_obj.columns.name ) - if isinstance(self.obj, SparseDataFrame): - # Backwards compat for groupby.agg() with sparse - # values. concat no longer converts DataFrame[Sparse] - # to SparseDataFrame, so we do it here. 
- result = SparseDataFrame(result._data) - if not self.as_index: self._insert_inaxis_grouper_inplace(result) result.index = np.arange(len(result)) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 017cc8b27942a..ca4f35514f2a5 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -32,7 +32,6 @@ ABCExtensionArray, ABCIndexClass, ABCSeries, - ABCSparseSeries, ABCTimedeltaArray, ABCTimedeltaIndex, ) @@ -1106,78 +1105,6 @@ def f(self, other): # Sparse -def _cast_sparse_series_op(left, right, opname): - """ - For SparseSeries operation, coerce to float64 if the result is expected - to have NaN or inf values - - Parameters - ---------- - left : SparseArray - right : SparseArray - opname : str - - Returns - ------- - left : SparseArray - right : SparseArray - """ - from pandas.core.sparse.api import SparseDtype - - opname = opname.strip("_") - - # TODO: This should be moved to the array? - if is_integer_dtype(left) and is_integer_dtype(right): - # series coerces to float64 if result should have NaN/inf - if opname in ("floordiv", "mod") and (right.to_dense() == 0).any(): - left = left.astype(SparseDtype(np.float64, left.fill_value)) - right = right.astype(SparseDtype(np.float64, right.fill_value)) - elif opname in ("rfloordiv", "rmod") and (left.to_dense() == 0).any(): - left = left.astype(SparseDtype(np.float64, left.fill_value)) - right = right.astype(SparseDtype(np.float64, right.fill_value)) - - return left, right - - -def _arith_method_SPARSE_SERIES(cls, op, special): - """ - Wrapper function for Series arithmetic operations, to avoid - code duplication. 
- """ - op_name = _get_op_name(op, special) - - def wrapper(self, other): - if isinstance(other, ABCDataFrame): - return NotImplemented - elif isinstance(other, ABCSeries): - if not isinstance(other, ABCSparseSeries): - other = other.to_sparse(fill_value=self.fill_value) - return _sparse_series_op(self, other, op, op_name) - elif is_scalar(other): - with np.errstate(all="ignore"): - new_values = op(self.values, other) - return self._constructor(new_values, index=self.index, name=self.name) - else: # pragma: no cover - raise TypeError( - "operation with {other} not supported".format(other=type(other)) - ) - - wrapper.__name__ = op_name - return wrapper - - -def _sparse_series_op(left, right, op, name): - left, right = left.align(right, join="outer", copy=False) - new_index = left.index - new_name = get_op_result_name(left, right) - - from pandas.core.arrays.sparse import _sparse_array_op - - lvalues, rvalues = _cast_sparse_series_op(left.values, right.values, name) - result = _sparse_array_op(lvalues, rvalues, op, name) - return left._constructor(result, index=new_index, name=new_name) - - def maybe_dispatch_ufunc_to_dunder_op( self: ArrayLike, ufunc: Callable, method: str, *inputs: ArrayLike, **kwargs: Any ): diff --git a/pandas/core/ops/methods.py b/pandas/core/ops/methods.py index eba0a797a791f..477c847fb01e6 100644 --- a/pandas/core/ops/methods.py +++ b/pandas/core/ops/methods.py @@ -3,12 +3,7 @@ """ import operator -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCSeries, - ABCSparseArray, - ABCSparseSeries, -) +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCSparseArray from pandas.core.ops.roperator import ( radd, @@ -51,7 +46,6 @@ def _get_method_wrappers(cls): from pandas.core.ops import ( _arith_method_FRAME, _arith_method_SERIES, - _arith_method_SPARSE_SERIES, _bool_method_SERIES, _comp_method_FRAME, _comp_method_SERIES, @@ -59,24 +53,14 @@ def _get_method_wrappers(cls): _flex_method_SERIES, ) - if issubclass(cls, 
ABCSparseSeries): - # Be sure to catch this before ABCSeries and ABCSparseArray, - # as they will both come see SparseSeries as a subclass - arith_flex = _flex_method_SERIES - comp_flex = _flex_method_SERIES - arith_special = _arith_method_SPARSE_SERIES - comp_special = _arith_method_SPARSE_SERIES - bool_special = _bool_method_SERIES - # TODO: I don't think the functions defined by bool_method are tested - elif issubclass(cls, ABCSeries): - # Just Series; SparseSeries is caught above + if issubclass(cls, ABCSeries): + # Just Series arith_flex = _flex_method_SERIES comp_flex = _flex_method_SERIES arith_special = _arith_method_SERIES comp_special = _comp_method_SERIES bool_special = _bool_method_SERIES elif issubclass(cls, ABCDataFrame): - # Same for DataFrame and SparseDataFrame arith_flex = _arith_method_FRAME comp_flex = _flex_comp_method_FRAME arith_special = _arith_method_FRAME @@ -176,7 +160,7 @@ def _create_methods(cls, arith_method, comp_method, bool_method, special): # constructors. have_divmod = issubclass(cls, ABCSeries) - # divmod is available for Series and SparseSeries + # divmod is available for Series # yapf: disable new_methods = dict( diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 4446f27da6be0..60bab817705e3 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -6,8 +6,6 @@ import numpy as np -from pandas.core.dtypes.generic import ABCSparseDataFrame - from pandas import DataFrame, Index, MultiIndex, Series from pandas.core import common as com from pandas.core.arrays.categorical import ( @@ -715,15 +713,13 @@ def _get_series_result_type(result, objs=None): return appropriate class of Series concat input is either dict or array-like """ - from pandas import SparseSeries, SparseDataFrame, DataFrame + # TODO: See if we can just inline with _constructor_expanddim + # now that sparse is removed. 
+ from pandas import DataFrame # concat Series with axis 1 if isinstance(result, dict): - # concat Series with axis 1 - if all(isinstance(c, (SparseSeries, SparseDataFrame)) for c in result.values()): - return SparseDataFrame - else: - return DataFrame + return DataFrame # otherwise it is a SingleBlockManager (axis = 0) return objs[0]._constructor @@ -732,13 +728,6 @@ def _get_series_result_type(result, objs=None): def _get_frame_result_type(result, objs): """ return appropriate class of DataFrame-like concat - if all blocks are sparse, return SparseDataFrame - otherwise, return 1st obj """ - - if result.blocks and (any(isinstance(obj, ABCSparseDataFrame) for obj in objs)): - from pandas.core.sparse.api import SparseDataFrame - - return SparseDataFrame - else: - return next(obj for obj in objs if not isinstance(obj, ABCSparseDataFrame)) + # TODO: just inline this as _constructor. + return objs[0] diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c32ca47c19160..e654685d24d9d 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -57,7 +57,7 @@ class _Unstacker: float and missing values will be set to NaN. constructor : object Pandas ``DataFrame`` or subclass used to create unstacked - response. If None, DataFrame or SparseDataFrame will be used. + response. If None, DataFrame will be used. 
Examples -------- diff --git a/pandas/core/series.py b/pandas/core/series.py index 8458e6f5bbaad..b0616c053df6d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -44,7 +44,6 @@ ABCDatetimeIndex, ABCSeries, ABCSparseArray, - ABCSparseSeries, ) from pandas.core.dtypes.missing import ( isna, @@ -56,7 +55,7 @@ import pandas as pd from pandas.core import algorithms, base, generic, nanops, ops from pandas.core.accessor import CachedAccessor -from pandas.core.arrays import ExtensionArray, SparseArray +from pandas.core.arrays import ExtensionArray from pandas.core.arrays.categorical import Categorical, CategoricalAccessor from pandas.core.arrays.sparse import SparseAccessor import pandas.core.common as com @@ -246,7 +245,7 @@ def __init__( elif isinstance(data, np.ndarray): pass - elif isinstance(data, (ABCSeries, ABCSparseSeries)): + elif isinstance(data, ABCSeries): if name is None: name = data.name if index is None: @@ -385,10 +384,6 @@ def from_array( FutureWarning, stacklevel=2, ) - if isinstance(arr, ABCSparseArray): - from pandas.core.sparse.series import SparseSeries - - cls = SparseSeries return cls( arr, index=index, name=name, dtype=dtype, copy=copy, fastpath=fastpath ) @@ -1772,38 +1767,6 @@ def to_frame(self, name=None): return df - def to_sparse(self, kind="block", fill_value=None): - """ - Convert Series to SparseSeries. - - .. deprecated:: 0.25.0 - - Parameters - ---------- - kind : {'block', 'int'}, default 'block' - fill_value : float, defaults to NaN (missing) - Value to use for filling NaN values. - - Returns - ------- - SparseSeries - Sparse representation of the Series. 
- """ - - warnings.warn( - "Series.to_sparse is deprecated and will be removed in a future version", - FutureWarning, - stacklevel=2, - ) - from pandas.core.sparse.series import SparseSeries - - values = SparseArray(self, kind=kind, fill_value=fill_value) - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", message="SparseSeries") - return SparseSeries(values, index=self.index, name=self.name).__finalize__( - self - ) - def _set_name(self, name, inplace=False): """ Set the Series name. diff --git a/pandas/core/sparse/api.py b/pandas/core/sparse/api.py index 6a00fa570b2ac..e7bf94cdc08ea 100644 --- a/pandas/core/sparse/api.py +++ b/pandas/core/sparse/api.py @@ -1,4 +1,3 @@ -# flake8: noqa from pandas.core.arrays.sparse import SparseArray, SparseDtype -from pandas.core.sparse.frame import SparseDataFrame -from pandas.core.sparse.series import SparseSeries + +__all__ = ["SparseArray", "SparseDtype"] diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py deleted file mode 100644 index aaa99839144b4..0000000000000 --- a/pandas/core/sparse/frame.py +++ /dev/null @@ -1,1055 +0,0 @@ -""" -Data structures for sparse float data. 
Life is made simpler by dealing only -with float64 data -""" -import warnings - -import numpy as np - -from pandas._libs.lib import is_scalar, item_from_zerodim -from pandas._libs.sparse import BlockIndex, get_blocks -from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender - -from pandas.core.dtypes.cast import maybe_upcast -from pandas.core.dtypes.common import ensure_platform_int, is_scipy_sparse -from pandas.core.dtypes.missing import isna, notna - -import pandas.core.algorithms as algos -from pandas.core.arrays.sparse import SparseArray, SparseFrameAccessor -import pandas.core.common as com -from pandas.core.frame import DataFrame -import pandas.core.generic as generic -from pandas.core.index import Index, MultiIndex, ensure_index -from pandas.core.internals import BlockManager, create_block_manager_from_arrays -from pandas.core.internals.construction import extract_index, prep_ndarray -import pandas.core.ops as ops -from pandas.core.series import Series -from pandas.core.sparse.series import SparseSeries - -_shared_doc_kwargs = dict(klass="SparseDataFrame") -depr_msg = """\ -SparseDataFrame is deprecated and will be removed in a future version. -Use a regular DataFrame whose columns are SparseArrays instead. - -See http://pandas.pydata.org/pandas-docs/stable/\ -user_guide/sparse.html#migrating for more. -""" - - -class SparseDataFrame(DataFrame): - """ - DataFrame containing sparse floating point data in the form of SparseSeries - objects - - .. deprecated:: 0.25.0 - - Use a DataFrame with sparse values instead. - - Parameters - ---------- - data : same types as can be passed to DataFrame or scipy.sparse.spmatrix - .. versionchanged:: 0.23.0 - If data is a dict, argument order is maintained for Python 3.6 - and later. - - index : array-like, optional - column : array-like, optional - default_kind : {'block', 'integer'}, default 'block' - Default sparse kind for converting Series to SparseSeries. 
Will not - override SparseSeries passed into constructor - default_fill_value : float - Default fill_value for converting Series to SparseSeries - (default: nan). Will not override SparseSeries passed in. - """ - - _subtyp = "sparse_frame" - - def __init__( - self, - data=None, - index=None, - columns=None, - default_kind=None, - default_fill_value=None, - dtype=None, - copy=False, - ): - if not is_scalar(default_fill_value): - raise ValueError("'default_fill_value' must be a scalar") - - warnings.warn(depr_msg, FutureWarning, stacklevel=2) - # pick up the defaults from the Sparse structures - if isinstance(data, SparseDataFrame): - if index is None: - index = data.index - if columns is None: - columns = data.columns - if default_fill_value is None: - default_fill_value = data.default_fill_value - if default_kind is None: - default_kind = data.default_kind - elif isinstance(data, (SparseSeries, SparseArray)): - if index is None: - index = data.index - if default_fill_value is None: - default_fill_value = data.fill_value - if columns is None and hasattr(data, "name"): - columns = [data.name] - if columns is None: - raise Exception("cannot pass a series w/o a name or columns") - data = {columns[0]: data} - - if default_fill_value is None: - default_fill_value = np.nan - if default_kind is None: - default_kind = "block" - - self._default_kind = default_kind - self._default_fill_value = default_fill_value - - if is_scipy_sparse(data): - mgr = self._init_spmatrix( - data, index, columns, dtype=dtype, fill_value=default_fill_value - ) - elif isinstance(data, dict): - mgr = self._init_dict(data, index, columns, dtype=dtype) - elif isinstance(data, (np.ndarray, list)): - mgr = self._init_matrix(data, index, columns, dtype=dtype) - elif isinstance(data, SparseDataFrame): - mgr = self._init_mgr( - data._data, dict(index=index, columns=columns), dtype=dtype, copy=copy - ) - elif isinstance(data, DataFrame): - mgr = self._init_dict(data, data.index, data.columns, dtype=dtype) 
- elif isinstance(data, Series): - mgr = self._init_dict( - data.to_frame(), data.index, columns=None, dtype=dtype - ) - elif isinstance(data, BlockManager): - mgr = self._init_mgr( - data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy - ) - elif data is None: - data = DataFrame() - - if index is None: - index = Index([]) - else: - index = ensure_index(index) - - if columns is None: - columns = Index([]) - else: - for c in columns: - data[c] = SparseArray( - self._default_fill_value, - index=index, - kind=self._default_kind, - fill_value=self._default_fill_value, - ) - mgr = to_manager(data, columns, index) - if dtype is not None: - mgr = mgr.astype(dtype) - else: - msg = ( - 'SparseDataFrame called with unknown type "{data_type}" ' - "for data argument" - ) - raise TypeError(msg.format(data_type=type(data).__name__)) - - generic.NDFrame.__init__(self, mgr) - - @property - def _constructor(self): - return SparseDataFrame - - _constructor_sliced = SparseSeries - - def _init_dict(self, data, index, columns, dtype=None): - # pre-filter out columns if we passed it - if columns is not None: - columns = ensure_index(columns) - data = {k: v for k, v in data.items() if k in columns} - else: - keys = com.dict_keys_to_ordered_list(data) - columns = Index(keys) - - if index is None: - index = extract_index(list(data.values())) - - def sp_maker(x): - return SparseArray( - x, - kind=self._default_kind, - fill_value=self._default_fill_value, - copy=True, - dtype=dtype, - ) - - sdict = {} - for k, v in data.items(): - if isinstance(v, Series): - # Force alignment, no copy necessary - if not v.index.equals(index): - v = v.reindex(index) - - if not isinstance(v, SparseSeries): - v = sp_maker(v.values) - elif isinstance(v, SparseArray): - v = v.copy() - else: - if isinstance(v, dict): - v = [v.get(i, np.nan) for i in index] - - v = sp_maker(v) - - if index is not None and len(v) != len(index): - msg = "Length of passed values is {}, index implies {}" - raise 
ValueError(msg.format(len(v), len(index))) - sdict[k] = v - - if len(columns.difference(sdict)): - # TODO: figure out how to handle this case, all nan's? - # add in any other columns we want to have (completeness) - nan_arr = np.empty(len(index), dtype="float64") - nan_arr.fill(np.nan) - nan_arr = SparseArray( - nan_arr, - kind=self._default_kind, - fill_value=self._default_fill_value, - copy=False, - ) - sdict.update((c, nan_arr) for c in columns if c not in sdict) - - return to_manager(sdict, columns, index) - - def _init_matrix(self, data, index, columns, dtype=None): - """ - Init self from ndarray or list of lists. - """ - data = prep_ndarray(data, copy=False) - index, columns = SparseFrameAccessor._prep_index(data, index, columns) - data = {idx: data[:, i] for i, idx in enumerate(columns)} - return self._init_dict(data, index, columns, dtype) - - def _init_spmatrix(self, data, index, columns, dtype=None, fill_value=None): - """ - Init self from scipy.sparse matrix. - """ - index, columns = SparseFrameAccessor._prep_index(data, index, columns) - data = data.tocoo() - N = len(index) - - # Construct a dict of SparseSeries - sdict = {} - values = Series(data.data, index=data.row, copy=False) - for col, rowvals in values.groupby(data.col): - # get_blocks expects int32 row indices in sorted order - rowvals = rowvals.sort_index() - rows = rowvals.index.values.astype(np.int32) - blocs, blens = get_blocks(rows) - - sdict[columns[col]] = SparseSeries( - rowvals.values, - index=index, - fill_value=fill_value, - sparse_index=BlockIndex(N, blocs, blens), - ) - - # Add any columns that were empty and thus not grouped on above - sdict.update( - { - column: SparseSeries( - index=index, - fill_value=fill_value, - sparse_index=BlockIndex(N, [], []), - ) - for column in columns - if column not in sdict - } - ) - - return self._init_dict(sdict, index, columns, dtype) - - @Appender(SparseFrameAccessor.to_coo.__doc__) - def to_coo(self): - return SparseFrameAccessor(self).to_coo() 
- - def __repr__(self): - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Sparse") - return super().__repr__() - - def __getstate__(self): - # pickling - return dict( - _typ=self._typ, - _subtyp=self._subtyp, - _data=self._data, - _default_fill_value=self._default_fill_value, - _default_kind=self._default_kind, - ) - - def _unpickle_sparse_frame_compat(self, state): - """ - Original pickle format - """ - series, cols, idx, fv, kind = state - - if not isinstance(cols, Index): # pragma: no cover - from pandas.io.pickle import _unpickle_array - - columns = _unpickle_array(cols) - else: - columns = cols - - if not isinstance(idx, Index): # pragma: no cover - from pandas.io.pickle import _unpickle_array - - index = _unpickle_array(idx) - else: - index = idx - - series_dict = DataFrame() - for col, (sp_index, sp_values) in series.items(): - series_dict[col] = SparseSeries( - sp_values, sparse_index=sp_index, fill_value=fv - ) - - self._data = to_manager(series_dict, columns, index) - self._default_fill_value = fv - self._default_kind = kind - - @Appender(SparseFrameAccessor.to_dense.__doc__) - def to_dense(self): - return SparseFrameAccessor(self).to_dense() - - def _apply_columns(self, func): - """ - Get new SparseDataFrame applying func to each columns - """ - - new_data = {col: func(series) for col, series in self.items()} - - return self._constructor( - data=new_data, - index=self.index, - columns=self.columns, - default_fill_value=self.default_fill_value, - ).__finalize__(self) - - def astype(self, dtype): - return self._apply_columns(lambda x: x.astype(dtype)) - - def copy(self, deep=True): - """ - Make a copy of this SparseDataFrame - """ - result = super().copy(deep=deep) - result._default_fill_value = self._default_fill_value - result._default_kind = self._default_kind - return result - - @property - def default_fill_value(self): - return self._default_fill_value - - @property - def default_kind(self): - return self._default_kind - - 
@property - def density(self): - """ - Ratio of non-sparse points to total (dense) data points - represented in the frame - """ - tot_nonsparse = sum(ser.sp_index.npoints for _, ser in self.items()) - tot = len(self.index) * len(self.columns) - return tot_nonsparse / float(tot) - - def fillna( - self, value=None, method=None, axis=0, inplace=False, limit=None, downcast=None - ): - new_self = super().fillna( - value=value, - method=method, - axis=axis, - inplace=inplace, - limit=limit, - downcast=downcast, - ) - if not inplace: - self = new_self - - # set the fill value if we are filling as a scalar with nothing special - # going on - if value is not None and value == value and method is None and limit is None: - self._default_fill_value = value - - if not inplace: - return self - - # ---------------------------------------------------------------------- - # Support different internal representation of SparseDataFrame - - def _sanitize_column(self, key, value, **kwargs): - """ - Creates a new SparseArray from the input value. 
- - Parameters - ---------- - key : object - value : scalar, Series, or array-like - kwargs : dict - - Returns - ------- - sanitized_column : SparseArray - - """ - - def sp_maker(x, index=None): - return SparseArray( - x, - index=index, - fill_value=self._default_fill_value, - kind=self._default_kind, - ) - - if isinstance(value, SparseSeries): - clean = value.reindex(self.index).as_sparse_array( - fill_value=self._default_fill_value, kind=self._default_kind - ) - - elif isinstance(value, SparseArray): - if len(value) != len(self.index): - raise ValueError("Length of values does not match length of index") - clean = value - - elif hasattr(value, "__iter__"): - if isinstance(value, Series): - clean = value.reindex(self.index) - if not isinstance(value, SparseSeries): - clean = sp_maker(clean) - else: - if len(value) != len(self.index): - raise ValueError("Length of values does not match length of index") - clean = sp_maker(value) - - # Scalar - else: - clean = sp_maker(value, self.index) - - # always return a SparseArray! - return clean - - # ---------------------------------------------------------------------- - # Indexing Methods - - def _get_value(self, index, col, takeable=False): - """ - Quickly retrieve single value at passed column and index - - Please use .at[] or .iat[] accessors. 
- - Parameters - ---------- - index : row label - col : column label - takeable : interpret the index/col as indexers, default False - - Returns - ------- - value : scalar value - """ - if takeable is True: - series = self._iget_item_cache(col) - else: - series = self._get_item_cache(col) - - return series._get_value(index, takeable=takeable) - - def _slice(self, slobj, axis=0, kind=None): - if axis == 0: - new_index = self.index[slobj] - new_columns = self.columns - else: - new_index = self.index - new_columns = self.columns[slobj] - - return self.reindex(index=new_index, columns=new_columns) - - def xs(self, key, axis=0, copy=False): - """ - Returns a row (cross-section) from the SparseDataFrame as a Series - object. - - Parameters - ---------- - key : some index contained in the index - - Returns - ------- - xs : Series - """ - if axis == 1: - data = self[key] - return data - - i = self.index.get_loc(key) - data = self.take([i])._internal_get_values()[0] - return Series(data, index=self.columns) - - def _set_value(self, index, col, value, takeable=False): - """ - Put single value at passed column and index - - Please use .at[] or .iat[] accessors. - - Parameters - ---------- - index : row label - col : column label - value : scalar value - takeable : interpret the index/col as indexers, default False - - Notes - ----- - This method *always* returns a new object. 
It is currently not - particularly efficient (and potentially very expensive) but is provided - for API compatibility with DataFrame - - Returns - ------- - frame : DataFrame - """ - dense = self.to_dense()._set_value(index, col, value, takeable=takeable) - return dense.to_sparse( - kind=self._default_kind, fill_value=self._default_fill_value - ) - - # ---------------------------------------------------------------------- - # Arithmetic-related methods - - def _combine_frame(self, other, func, fill_value=None, level=None): - this, other = self.align(other, join="outer", level=level, copy=False) - this._default_fill_value = self._default_fill_value - - new_data = {} - if fill_value is not None: - # TODO: be a bit more intelligent here - for col in this.columns: - if col in this and col in other: - dleft = this[col].to_dense() - dright = other[col].to_dense() - result = dleft._binop(dright, func, fill_value=fill_value) - result = result.to_sparse(fill_value=this[col].fill_value) - new_data[col] = result - else: - - for col in this.columns: - if col in this and col in other: - new_data[col] = func(this[col], other[col]) - - return this._construct_result(other, new_data, func) - - def _combine_match_index(self, other, func, level=None): - this, other = self.align(other, join="outer", axis=0, level=level, copy=False) - this._default_fill_value = self._default_fill_value - - new_data = {} - for col in this.columns: - new_data[col] = func(this[col], other) - - return this._construct_result(other, new_data, func) - - def _combine_match_columns(self, other, func, level=None): - # patched version of DataFrame._combine_match_columns to account for - # NumPy circumventing __rsub__ with float64 types, e.g.: 3.0 - series, - # where 3.0 is numpy.float64 and series is a SparseSeries. 
Still - # possible for this to happen, which is bothersome - - left, right = self.align(other, join="outer", axis=1, level=level, copy=False) - assert left.columns.equals(right.index) - left._default_fill_value = self._default_fill_value - - new_data = {} - for col in left.columns: - new_data[col] = func(left[col], right[col]) - - # TODO: using this changed some behavior, see GH#28025 - return left._construct_result(other, new_data, func) - - def _combine_const(self, other, func): - return self._apply_columns(lambda x: func(x, other)) - - def _construct_result(self, other, result, func): - """ - Wrap the result of an arithmetic, comparison, or logical operation. - - Parameters - ---------- - other : object - result : SparseDataFrame - func : binary operator - - Returns - ------- - SparseDataFrame - """ - fill_value = self._get_op_result_fill_value(other, func) - - out = self._constructor(result, index=self.index, default_fill_value=fill_value) - out.columns = self.columns - return out.__finalize__(self) - - def _get_op_result_fill_value(self, other, func): - own_default = self.default_fill_value - - if isinstance(other, DataFrame): - # i.e. called from _combine_frame - - other_default = getattr(other, "default_fill_value", np.nan) - - # if the fill values are the same use them? or use a valid one - if own_default == other_default: - # TOOD: won't this evaluate as False if both are np.nan? - fill_value = own_default - elif np.isnan(own_default) and not np.isnan(other_default): - fill_value = other_default - elif not np.isnan(own_default) and np.isnan(other_default): - fill_value = own_default - else: - fill_value = None - - elif isinstance(other, SparseSeries): - # i.e. 
called from _combine_match_index - - # fill_value is a function of our operator - if isna(other.fill_value) or isna(own_default): - fill_value = np.nan - else: - fill_value = func(np.float64(own_default), np.float64(other.fill_value)) - fill_value = item_from_zerodim(fill_value) - - elif isinstance(other, Series): - # reached via _combine_match_columns - fill_value = self.default_fill_value - - else: - raise NotImplementedError(type(other)) - - return fill_value - - def _reindex_index( - self, index, method, copy, level, fill_value=np.nan, limit=None, takeable=False - ): - if level is not None: - raise TypeError("Reindex by level not supported for sparse") - - if self.index.equals(index): - if copy: - return self.copy() - else: - return self - - if len(self.index) == 0: - return self._constructor(index=index, columns=self.columns).__finalize__( - self - ) - - indexer = self.index.get_indexer(index, method, limit=limit) - indexer = ensure_platform_int(indexer) - mask = indexer == -1 - need_mask = mask.any() - - new_series = {} - for col, series in self.items(): - if mask.all(): - continue - - values = series.values - # .take returns SparseArray - new = values.take(indexer) - if need_mask: - new = new.to_dense() - # convert integer to float if necessary. 
need to do a lot - # more than that, handle boolean etc also - new, fill_value = maybe_upcast(new, fill_value=fill_value) - np.putmask(new, mask, fill_value) - - new_series[col] = new - - return self._constructor( - new_series, - index=index, - columns=self.columns, - default_fill_value=self._default_fill_value, - ).__finalize__(self) - - def _reindex_columns( - self, columns, method, copy, level, fill_value=None, limit=None, takeable=False - ): - if level is not None: - raise TypeError("Reindex by level not supported for sparse") - - if notna(fill_value): - raise NotImplementedError("'fill_value' argument is not supported") - - if limit: - raise NotImplementedError("'limit' argument is not supported") - - if method is not None: - raise NotImplementedError("'method' argument is not supported") - - # TODO: fill value handling - sdict = {k: v for k, v in self.items() if k in columns} - return self._constructor( - sdict, - index=self.index, - columns=columns, - default_fill_value=self._default_fill_value, - ).__finalize__(self) - - def _reindex_with_indexers( - self, - reindexers, - method=None, - fill_value=None, - limit=None, - copy=False, - allow_dups=False, - ): - - if method is not None or limit is not None: - raise NotImplementedError( - "cannot reindex with a method or limit with sparse" - ) - - if fill_value is None: - fill_value = np.nan - - reindexers = {self._get_axis_number(a): val for (a, val) in reindexers.items()} - - index, row_indexer = reindexers.get(0, (None, None)) - columns, col_indexer = reindexers.get(1, (None, None)) - - if columns is None: - columns = self.columns - - new_arrays = {} - for col in columns: - if col not in self: - continue - if row_indexer is not None: - new_arrays[col] = algos.take_1d( - self[col]._internal_get_values(), row_indexer, fill_value=fill_value - ) - else: - new_arrays[col] = self[col] - - return self._constructor(new_arrays, index=index, columns=columns).__finalize__( - self - ) - - def _join_compat( - self, other, 
on=None, how="left", lsuffix="", rsuffix="", sort=False - ): - if on is not None: - raise NotImplementedError("'on' keyword parameter is not yet implemented") - return self._join_index(other, how, lsuffix, rsuffix) - - def _join_index(self, other, how, lsuffix, rsuffix): - if isinstance(other, Series): - if other.name is None: - raise ValueError("Other Series must have a name") - - other = SparseDataFrame( - {other.name: other}, default_fill_value=self._default_fill_value - ) - - join_index = self.index.join(other.index, how=how) - - this = self.reindex(join_index) - other = other.reindex(join_index) - - this, other = this._maybe_rename_join(other, lsuffix, rsuffix) - - from pandas import concat - - return concat([this, other], axis=1, verify_integrity=True) - - def _maybe_rename_join(self, other, lsuffix, rsuffix): - to_rename = self.columns.intersection(other.columns) - if len(to_rename) > 0: - if not lsuffix and not rsuffix: - raise ValueError( - "columns overlap but no suffix specified: " - "{to_rename}".format(to_rename=to_rename) - ) - - def lrenamer(x): - if x in to_rename: - return "{x}{lsuffix}".format(x=x, lsuffix=lsuffix) - return x - - def rrenamer(x): - if x in to_rename: - return "{x}{rsuffix}".format(x=x, rsuffix=rsuffix) - return x - - this = self.rename(columns=lrenamer) - other = other.rename(columns=rrenamer) - else: - this = self - - return this, other - - def transpose(self, *args, **kwargs): - """ - Returns a DataFrame with the rows/columns switched. 
- """ - nv.validate_transpose(args, kwargs) - return self._constructor( - self.values.T, - index=self.columns, - columns=self.index, - default_fill_value=self._default_fill_value, - default_kind=self._default_kind, - ).__finalize__(self) - - T = property(transpose) - - @Appender(DataFrame.count.__doc__) - def count(self, axis=0, **kwds): - if axis is None: - axis = self._stat_axis_number - - return self.apply(lambda x: x.count(), axis=axis) - - def cumsum(self, axis=0, *args, **kwargs): - """ - Return SparseDataFrame of cumulative sums over requested axis. - - Parameters - ---------- - axis : {0, 1} - 0 for row-wise, 1 for column-wise - - Returns - ------- - y : SparseDataFrame - """ - nv.validate_cumsum(args, kwargs) - - if axis is None: - axis = self._stat_axis_number - - return self.apply(lambda x: x.cumsum(), axis=axis) - - @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) - def isna(self): - return self._apply_columns(lambda x: x.isna()) - - isnull = isna - - @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs) - def notna(self): - return self._apply_columns(lambda x: x.notna()) - - notnull = notna - - def apply(self, func, axis=0, broadcast=None, reduce=None, result_type=None): - """ - Analogous to DataFrame.apply, for SparseDataFrame - - Parameters - ---------- - func : function - Function to apply to each column - axis : {0, 1, 'index', 'columns'} - broadcast : bool, default False - For aggregation functions, return object of same size with values - propagated - - .. deprecated:: 0.23.0 - This argument will be removed in a future version, replaced - by result_type='broadcast'. - - reduce : boolean or None, default None - Try to apply reduction procedures. If the DataFrame is empty, - apply will use reduce to determine whether the result should be a - Series or a DataFrame. 
If reduce is None (the default), apply's - return value will be guessed by calling func an empty Series (note: - while guessing, exceptions raised by func will be ignored). If - reduce is True a Series will always be returned, and if False a - DataFrame will always be returned. - - .. deprecated:: 0.23.0 - This argument will be removed in a future version, replaced - by result_type='reduce'. - - result_type : {'expand', 'reduce', 'broadcast, None} - These only act when axis=1 {columns}: - - * 'expand' : list-like results will be turned into columns. - * 'reduce' : return a Series if possible rather than expanding - list-like results. This is the opposite to 'expand'. - * 'broadcast' : results will be broadcast to the original shape - of the frame, the original index & columns will be retained. - - The default behaviour (None) depends on the return value of the - applied function: list-like results will be returned as a Series - of those. However if the apply function returns a Series these - are expanded to columns. - - .. versionadded:: 0.23.0 - - Returns - ------- - applied : Series or SparseDataFrame - """ - if not len(self.columns): - return self - axis = self._get_axis_number(axis) - - if isinstance(func, np.ufunc): - new_series = {} - for k, v in self.items(): - applied = func(v) - applied.fill_value = func(v.fill_value) - new_series[k] = applied - return self._constructor( - new_series, - index=self.index, - columns=self.columns, - default_fill_value=self._default_fill_value, - default_kind=self._default_kind, - ).__finalize__(self) - - from pandas.core.apply import frame_apply - - op = frame_apply( - self, - func=func, - axis=axis, - reduce=reduce, - broadcast=broadcast, - result_type=result_type, - ) - return op.get_result() - - def applymap(self, func): - """ - Apply a function to a DataFrame that is intended to operate - elementwise, i.e. 
like doing map(func, series) for each series in the - DataFrame - - Parameters - ---------- - func : function - Python function, returns a single value from a single value - - Returns - ------- - applied : DataFrame - """ - return self.apply(lambda x: [func(y) for y in x]) - - -def to_manager(sdf, columns, index): - """ create and return the block manager from a dataframe of series, - columns, index - """ - - # from BlockManager perspective - axes = [ensure_index(columns), ensure_index(index)] - - return create_block_manager_from_arrays([sdf[c] for c in columns], columns, axes) - - -def stack_sparse_frame(frame): - """ - Only makes sense when fill_value is NaN - """ - lengths = [s.sp_index.npoints for _, s in frame.items()] - nobs = sum(lengths) - - # this is pretty fast - minor_codes = np.repeat(np.arange(len(frame.columns)), lengths) - - inds_to_concat = [] - vals_to_concat = [] - # TODO: Figure out whether this can be reached. - # I think this currently can't be reached because you can't build a - # SparseDataFrame with a non-np.NaN fill value (fails earlier). - for _, series in frame.items(): - if not np.isnan(series.fill_value): - raise TypeError("This routine assumes NaN fill value") - - int_index = series.sp_index.to_int_index() - inds_to_concat.append(int_index.indices) - vals_to_concat.append(series.sp_values) - - major_codes = np.concatenate(inds_to_concat) - stacked_values = np.concatenate(vals_to_concat) - index = MultiIndex( - levels=[frame.index, frame.columns], - codes=[major_codes, minor_codes], - verify_integrity=False, - ) - - lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index, columns=["foo"]) - return lp.sort_index(level=0) - - -def homogenize(series_dict): - """ - Conform a set of SparseSeries (with NaN fill_value) to a common SparseIndex - corresponding to the locations where they all have data - - Parameters - ---------- - series_dict : dict or DataFrame - - Notes - ----- - Using the dumbest algorithm I could think of. 
Should put some more thought - into this - - Returns - ------- - homogenized : dict of SparseSeries - """ - index = None - - need_reindex = False - - for _, series in series_dict.items(): - if not np.isnan(series.fill_value): - raise TypeError("this method is only valid with NaN fill values") - - if index is None: - index = series.sp_index - elif not series.sp_index.equals(index): - need_reindex = True - index = index.intersect(series.sp_index) - - if need_reindex: - output = {} - for name, series in series_dict.items(): - if not series.sp_index.equals(index): - series = series.sparse_reindex(index) - - output[name] = series - else: - output = series_dict - - return output - - -# use unaccelerated ops for sparse objects -ops.add_flex_arithmetic_methods(SparseDataFrame) -ops.add_special_arithmetic_methods(SparseDataFrame) diff --git a/pandas/core/sparse/scipy_sparse.py b/pandas/core/sparse/scipy_sparse.py index e8d8996fdd6ad..11c27451a5801 100644 --- a/pandas/core/sparse/scipy_sparse.py +++ b/pandas/core/sparse/scipy_sparse.py @@ -1,7 +1,7 @@ """ Interaction with scipy.sparse matrices. -Currently only includes SparseSeries.to_coo helpers. +Currently only includes to_coo helpers. """ from collections import OrderedDict @@ -115,7 +115,7 @@ def _sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=F return sparse_matrix, rows, columns -def _coo_to_sparse_series(A, dense_index: bool = False, sparse_series: bool = True): +def _coo_to_sparse_series(A, dense_index: bool = False): """ Convert a scipy.sparse.coo_matrix to a SparseSeries. 
@@ -123,16 +123,14 @@ def _coo_to_sparse_series(A, dense_index: bool = False, sparse_series: bool = Tr ---------- A : scipy.sparse.coo.coo_matrix dense_index : bool, default False - sparse_series : bool, default True Returns ------- - Series or SparseSeries + Series Raises ------ TypeError if A is not a coo_matrix - """ from pandas import SparseDtype @@ -141,13 +139,7 @@ def _coo_to_sparse_series(A, dense_index: bool = False, sparse_series: bool = Tr except AttributeError: raise TypeError("Expected coo_matrix. Got {} instead.".format(type(A).__name__)) s = s.sort_index() - if sparse_series: - # TODO(SparseSeries): remove this and the sparse_series keyword. - # This is just here to avoid a DeprecationWarning when - # _coo_to_sparse_series is called via Series.sparse.from_coo - s = s.to_sparse() # TODO: specify kind? - else: - s = s.astype(SparseDtype(s.dtype)) + s = s.astype(SparseDtype(s.dtype)) if dense_index: # is there a better constructor method to use here? i = range(A.shape[0]) diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py deleted file mode 100644 index 0c417133b0538..0000000000000 --- a/pandas/core/sparse/series.py +++ /dev/null @@ -1,635 +0,0 @@ -""" -Data structures for sparse float data. 
Life is made simpler by dealing only -with float64 data -""" -from collections import abc -import warnings - -import numpy as np - -import pandas._libs.index as libindex -import pandas._libs.sparse as splib -from pandas._libs.sparse import BlockIndex, IntIndex -from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender, Substitution - -from pandas.core.dtypes.common import is_integer, is_scalar -from pandas.core.dtypes.generic import ABCSeries, ABCSparseSeries -from pandas.core.dtypes.missing import isna, notna - -from pandas.core import generic -from pandas.core.arrays import SparseArray -from pandas.core.arrays.sparse import SparseAccessor -from pandas.core.internals import SingleBlockManager -import pandas.core.ops as ops -from pandas.core.series import Series -from pandas.core.sparse.scipy_sparse import _coo_to_sparse_series, _sparse_series_to_coo - -_shared_doc_kwargs = dict( - axes="index", - klass="SparseSeries", - axes_single_arg="{0, 'index'}", - optional_labels="", - optional_axis="", -) - - -depr_msg = """\ -SparseSeries is deprecated and will be removed in a future version. -Use a Series with sparse values instead. - - >>> series = pd.Series(pd.SparseArray(...)) - -See http://pandas.pydata.org/pandas-docs/stable/\ -user_guide/sparse.html#migrating for more. -""" - - -class SparseSeries(Series): - """Data structure for labeled, sparse floating point data - - .. deprecated:: 0.25.0 - - Use a Series with sparse values instead. - - Parameters - ---------- - data : {array-like, Series, SparseSeries, dict} - .. versionchanged:: 0.23.0 - If data is a dict, argument order is maintained for Python 3.6 - and later. - - kind : {'block', 'integer'} - fill_value : float - Code for missing value. Defaults depends on dtype. - 0 for int dtype, False for bool dtype, and NaN for other dtypes - sparse_index : {BlockIndex, IntIndex}, optional - Only if you have one. 
Mainly used internally - - Notes - ----- - SparseSeries objects are immutable via the typical Python means. If you - must change values, convert to dense, make your changes, then convert back - to sparse - """ - - _subtyp = "sparse_series" - - def __init__( - self, - data=None, - index=None, - sparse_index=None, - kind="block", - fill_value=None, - name=None, - dtype=None, - copy=False, - fastpath=False, - ): - warnings.warn(depr_msg, FutureWarning, stacklevel=2) - # TODO: Most of this should be refactored and shared with Series - # 1. BlockManager -> array - # 2. Series.index, Series.name, index, name reconciliation - # 3. Implicit reindexing - # 4. Implicit broadcasting - # 5. Dict construction - if data is None: - data = [] - elif isinstance(data, SingleBlockManager): - index = data.index - data = data.blocks[0].values - elif isinstance(data, (ABCSeries, ABCSparseSeries)): - index = data.index if index is None else index - dtype = data.dtype if dtype is None else dtype - name = data.name if name is None else name - - if index is not None: - data = data.reindex(index) - - elif isinstance(data, abc.Mapping): - data, index = Series()._init_dict(data, index=index) - - elif is_scalar(data) and index is not None: - data = np.full(len(index), fill_value=data) - - if isinstance(data, SingleBlockManager): - # SparseArray doesn't accept SingleBlockManager - index = data.index - data = data.blocks[0].values - - super().__init__( - SparseArray( - data, - sparse_index=sparse_index, - kind=kind, - dtype=dtype, - fill_value=fill_value, - copy=copy, - ), - index=index, - name=name, - copy=False, - fastpath=fastpath, - ) - - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - # avoid infinite recursion for other SparseSeries inputs - inputs = tuple(x.values if isinstance(x, type(self)) else x for x in inputs) - result = self.values.__array_ufunc__(ufunc, method, *inputs, **kwargs) - return self._constructor( - result, - index=self.index, - sparse_index=self.sp_index, 
- fill_value=result.fill_value, - copy=False, - ).__finalize__(self) - - # unary ops - # TODO: See if this can be shared - def __pos__(self): - result = self.values.__pos__() - return self._constructor( - result, - index=self.index, - sparse_index=self.sp_index, - fill_value=result.fill_value, - copy=False, - ).__finalize__(self) - - def __neg__(self): - result = self.values.__neg__() - return self._constructor( - result, - index=self.index, - sparse_index=self.sp_index, - fill_value=result.fill_value, - copy=False, - ).__finalize__(self) - - def __invert__(self): - result = self.values.__invert__() - return self._constructor( - result, - index=self.index, - sparse_index=self.sp_index, - fill_value=result.fill_value, - copy=False, - ).__finalize__(self) - - @property - def block(self): - warnings.warn("SparseSeries.block is deprecated.", FutureWarning, stacklevel=2) - return self._data._block - - @property - def fill_value(self): - return self.values.fill_value - - @fill_value.setter - def fill_value(self, v): - self.values.fill_value = v - - @property - def sp_index(self): - return self.values.sp_index - - @property - def sp_values(self): - return self.values.sp_values - - @property - def npoints(self): - return self.values.npoints - - @classmethod - def from_array( - cls, arr, index=None, name=None, copy=False, fill_value=None, fastpath=False - ): - """Construct SparseSeries from array. - - .. deprecated:: 0.23.0 - Use the pd.SparseSeries(..) constructor instead. - """ - warnings.warn( - "'from_array' is deprecated and will be removed in a " - "future version. Please use the pd.SparseSeries(..) 
" - "constructor instead.", - FutureWarning, - stacklevel=2, - ) - return cls( - arr, - index=index, - name=name, - copy=copy, - fill_value=fill_value, - fastpath=fastpath, - ) - - @property - def _constructor(self): - return SparseSeries - - @property - def _constructor_expanddim(self): - from pandas.core.sparse.api import SparseDataFrame - - return SparseDataFrame - - @property - def kind(self): - if isinstance(self.sp_index, BlockIndex): - return "block" - elif isinstance(self.sp_index, IntIndex): - return "integer" - - def as_sparse_array(self, kind=None, fill_value=None, copy=False): - """ return my self as a sparse array, do not copy by default """ - - if fill_value is None: - fill_value = self.fill_value - if kind is None: - kind = self.kind - return SparseArray( - self.values, - sparse_index=self.sp_index, - fill_value=fill_value, - kind=kind, - copy=copy, - ) - - def __repr__(self): - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Sparse") - series_rep = Series.__repr__(self) - rep = "{series}\n{index!r}".format(series=series_rep, index=self.sp_index) - return rep - - def _reduce( - self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds - ): - """ perform a reduction operation """ - return op(self.array.to_dense(), skipna=skipna, **kwds) - - def __getstate__(self): - # pickling - return dict( - _typ=self._typ, - _subtyp=self._subtyp, - _data=self._data, - fill_value=self.fill_value, - name=self.name, - ) - - def _unpickle_series_compat(self, state): - - nd_state, own_state = state - - # recreate the ndarray - data = np.empty(nd_state[1], dtype=nd_state[2]) - np.ndarray.__setstate__(data, nd_state) - - index, fill_value, sp_index = own_state[:3] - name = None - if len(own_state) > 3: - name = own_state[3] - - # create a sparse array - if not isinstance(data, SparseArray): - data = SparseArray( - data, sparse_index=sp_index, fill_value=fill_value, copy=False - ) - - # recreate - data = SingleBlockManager(data, 
index, fastpath=True) - generic.NDFrame.__init__(self, data) - - self._set_axis(0, index) - self.name = name - - def _set_subtyp(self, is_all_dates): - if is_all_dates: - object.__setattr__(self, "_subtyp", "sparse_time_series") - else: - object.__setattr__(self, "_subtyp", "sparse_series") - - # ---------------------------------------------------------------------- - # Indexing Methods - - def _ixs(self, i: int, axis: int = 0): - """ - Return the i-th value or values in the SparseSeries by location - - Parameters - ---------- - i : int - axis : int - default 0, ignored - - Returns - ------- - value : scalar (int) or Series (slice, sequence) - """ - assert is_integer(i), i - # equiv: self._get_val_at(i) since we have an integer - return self.values[i] - - def _get_val_at(self, loc): - """ forward to the array """ - return self.values._get_val_at(loc) - - def __getitem__(self, key): - # TODO: Document difference from Series.__getitem__, deprecate, - # and remove! - if is_integer(key) and key not in self.index: - return self._get_val_at(key) - else: - return super().__getitem__(key) - - def _get_value(self, label, takeable=False): - """ - Retrieve single value at passed index label - - Please use .at[] or .iat[] accessors. - - Parameters - ---------- - index : label - takeable : interpret the index as indexers, default False - - Returns - ------- - value : scalar value - """ - loc = label if takeable is True else self.index.get_loc(label) - return self._get_val_at(loc) - - def _get_values(self, indexer): - try: - return self._constructor( - self._data.get_slice(indexer), fastpath=True - ).__finalize__(self) - except Exception: - return self[indexer] - - def _set_with_engine(self, key, value): - return self._set_value(key, value) - - def _set_value(self, label, value, takeable=False): - """ - Quickly set single value at passed label. If label is not contained, a - new object is created with the label placed at the end of the result - index - - .. 
deprecated:: 0.21.0 - - Please use .at[] or .iat[] accessors. - - Parameters - ---------- - label : object - Partial indexing with MultiIndex not allowed - value : object - Scalar value - takeable : interpret the index as indexers, default False - - Notes - ----- - This method *always* returns a new object. It is not particularly - efficient but is provided for API compatibility with Series - - Returns - ------- - series : SparseSeries - """ - values = self.to_dense() - - # if the label doesn't exist, we will create a new object here - # and possibly change the index - new_values = values._set_value(label, value, takeable=takeable) - if new_values is not None: - values = new_values - new_index = values.index - values = SparseArray(values, fill_value=self.fill_value, kind=self.kind) - self._data = SingleBlockManager(values, new_index) - self._index = new_index - - def _set_values(self, key, value): - - # this might be inefficient as we have to recreate the sparse array - # rather than setting individual elements, but have to convert - # the passed slice/boolean that's in dense space into a sparse indexer - # not sure how to do that! - if isinstance(key, Series): - key = key.values - - values = self.values.to_dense() - values[key] = libindex.convert_scalar(values, value) - values = SparseArray(values, fill_value=self.fill_value, kind=self.kind) - self._data = SingleBlockManager(values, self.index) - - # ---------------------------------------------------------------------- - # Unsorted - - def abs(self): - """ - Return an object with absolute value taken. Only applicable to objects - that are all numeric - - Returns - ------- - abs: same type as caller - """ - return self._constructor(np.abs(self.values), index=self.index).__finalize__( - self - ) - - def get(self, label, default=None): - """ - Returns value occupying requested label, default to specified - missing value if not present. 
Analogous to dict.get - - Parameters - ---------- - label : object - Label value looking for - default : object, optional - Value to return if label not in index - - Returns - ------- - y : scalar - """ - if label in self.index: - loc = self.index.get_loc(label) - return self._get_val_at(loc) - else: - return default - - def to_dense(self): - """ - Convert SparseSeries to a Series. - - Returns - ------- - s : Series - """ - return Series(self.values.to_dense(), index=self.index, name=self.name) - - @property - def density(self): - return self.values.density - - def copy(self, deep=True): - """ - Make a copy of the SparseSeries. Only the actual sparse values need to - be copied - """ - # TODO: https://github.com/pandas-dev/pandas/issues/22314 - # We skip the block manager till that is resolved. - new_data = self.values - if deep: - new_data = new_data.copy() - return self._constructor( - new_data, - sparse_index=self.sp_index, - fill_value=self.fill_value, - index=self.index.copy(), - name=self.name, - ).__finalize__(self) - - @Substitution(**_shared_doc_kwargs) - @Appender(generic.NDFrame.reindex.__doc__) - def reindex(self, index=None, method=None, copy=True, limit=None, **kwargs): - # TODO: remove? 
- return super().reindex( - index=index, method=method, copy=copy, limit=limit, **kwargs - ) - - def sparse_reindex(self, new_index): - """ - Conform sparse values to new SparseIndex - - Parameters - ---------- - new_index : {BlockIndex, IntIndex} - - Returns - ------- - reindexed : SparseSeries - """ - if not isinstance(new_index, splib.SparseIndex): - raise TypeError("new index must be a SparseIndex") - values = self.values - values = values.sp_index.to_int_index().reindex( - values.sp_values.astype("float64"), values.fill_value, new_index - ) - values = SparseArray( - values, sparse_index=new_index, fill_value=self.values.fill_value - ) - return self._constructor(values, index=self.index).__finalize__(self) - - def cumsum(self, axis=0, *args, **kwargs): - """ - Cumulative sum of non-NA/null values. - - When performing the cumulative summation, any non-NA/null values will - be skipped. The resulting SparseSeries will preserve the locations of - NaN values, but the fill value will be `np.nan` regardless. 
- - Parameters - ---------- - axis : {0} - - Returns - ------- - cumsum : SparseSeries - """ - nv.validate_cumsum(args, kwargs) - # Validate axis - if axis is not None: - self._get_axis_number(axis) - - new_array = self.values.cumsum() - - return self._constructor( - new_array, index=self.index, sparse_index=new_array.sp_index - ).__finalize__(self) - - # TODO: SparseSeries.isna is Sparse, while Series.isna is dense - @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) - def isna(self): - arr = SparseArray( - isna(self.values.sp_values), - sparse_index=self.values.sp_index, - fill_value=isna(self.fill_value), - ) - return self._constructor(arr, index=self.index).__finalize__(self) - - isnull = isna - - @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs) - def notna(self): - arr = SparseArray( - notna(self.values.sp_values), - sparse_index=self.values.sp_index, - fill_value=notna(self.fill_value), - ) - return self._constructor(arr, index=self.index).__finalize__(self) - - notnull = notna - - def dropna(self, axis=0, inplace=False, **kwargs): - """ - Analogous to Series.dropna. If fill_value=NaN, returns a dense Series - """ - # TODO: make more efficient - # Validate axis - self._get_axis_number(axis or 0) - dense_valid = self.to_dense().dropna() - if inplace: - raise NotImplementedError( - "Cannot perform inplace dropna operations on a SparseSeries" - ) - if isna(self.fill_value): - return dense_valid - else: - dense_valid = dense_valid[dense_valid != self.fill_value] - return dense_valid.to_sparse(fill_value=self.fill_value) - - def combine_first(self, other): - """ - Combine Series values, choosing the calling Series's values - first. 
Result index will be the union of the two indexes - - Parameters - ---------- - other : Series - - Returns - ------- - y : Series - """ - if isinstance(other, SparseSeries): - other = other.to_dense() - - dense_combined = self.to_dense().combine_first(other) - return dense_combined.to_sparse(fill_value=self.fill_value) - - @Appender(SparseAccessor.to_coo.__doc__) - def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False): - A, rows, columns = _sparse_series_to_coo( - self, row_levels, column_levels, sort_labels=sort_labels - ) - return A, rows, columns - - @classmethod - @Appender(SparseAccessor.from_coo.__doc__) - def from_coo(cls, A, dense_index=False): - return _coo_to_sparse_series(A, dense_index=dense_index) - - -# overwrite series methods with unaccelerated Sparse-specific versions -ops.add_flex_arithmetic_methods(SparseSeries) -ops.add_special_arithmetic_methods(SparseSeries) diff --git a/pandas/io/packers.py b/pandas/io/packers.py index ad47ba23b9221..ecf7c77c172d1 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -85,7 +85,6 @@ from pandas.core.arrays.sparse import BlockIndex, IntIndex from pandas.core.generic import NDFrame from pandas.core.internals import BlockManager, _safe_reshape, make_block -from pandas.core.sparse.api import SparseDataFrame, SparseSeries from pandas.io.common import _stringify_path, get_filepath_or_buffer from pandas.io.msgpack import ExtType, Packer as _Packer, Unpacker as _Unpacker @@ -469,62 +468,37 @@ def encode(obj): } elif isinstance(obj, Series): - if isinstance(obj, SparseSeries): - raise NotImplementedError("msgpack sparse series is not implemented") - # d = {'typ': 'sparse_series', - # 'klass': obj.__class__.__name__, - # 'dtype': obj.dtype.name, - # 'index': obj.index, - # 'sp_index': obj.sp_index, - # 'sp_values': convert(obj.sp_values), - # 'compress': compressor} - # for f in ['name', 'fill_value', 'kind']: - # d[f] = getattr(obj, f, None) - # return d - else: - return { - "typ": 
"series", - "klass": obj.__class__.__name__, - "name": getattr(obj, "name", None), - "index": obj.index, - "dtype": obj.dtype.name, - "data": convert(obj.values), - "compress": compressor, - } + return { + "typ": "series", + "klass": obj.__class__.__name__, + "name": getattr(obj, "name", None), + "index": obj.index, + "dtype": obj.dtype.name, + "data": convert(obj.values), + "compress": compressor, + } elif issubclass(tobj, NDFrame): - if isinstance(obj, SparseDataFrame): - raise NotImplementedError("msgpack sparse frame is not implemented") - # d = {'typ': 'sparse_dataframe', - # 'klass': obj.__class__.__name__, - # 'columns': obj.columns} - # for f in ['default_fill_value', 'default_kind']: - # d[f] = getattr(obj, f, None) - # d['data'] = dict([(name, ss) - # for name, ss in obj.items()]) - # return d - else: - - data = obj._data - if not data.is_consolidated(): - data = data.consolidate() + data = obj._data + if not data.is_consolidated(): + data = data.consolidate() - # the block manager - return { - "typ": "block_manager", - "klass": obj.__class__.__name__, - "axes": data.axes, - "blocks": [ - { - "locs": b.mgr_locs.as_array, - "values": convert(b.values), - "shape": b.values.shape, - "dtype": b.dtype.name, - "klass": b.__class__.__name__, - "compress": compressor, - } - for b in data.blocks - ], - } + # the block manager + return { + "typ": "block_manager", + "klass": obj.__class__.__name__, + "axes": data.axes, + "blocks": [ + { + "locs": b.mgr_locs.as_array, + "values": convert(b.values), + "shape": b.values.shape, + "dtype": b.dtype.name, + "klass": b.__class__.__name__, + "compress": compressor, + } + for b in data.blocks + ], + } elif ( isinstance(obj, (datetime, date, np.datetime64, timedelta, np.timedelta64)) @@ -708,18 +682,6 @@ def create_block(b): return timedelta(*obj["data"]) elif typ == "timedelta64": return np.timedelta64(int(obj["data"])) - # elif typ == 'sparse_series': - # dtype = dtype_for(obj['dtype']) - # return SparseSeries( - # 
unconvert(obj['sp_values'], dtype, obj['compress']), - # sparse_index=obj['sp_index'], index=obj['index'], - # fill_value=obj['fill_value'], kind=obj['kind'], name=obj['name']) - # elif typ == 'sparse_dataframe': - # return SparseDataFrame( - # obj['data'], columns=obj['columns'], - # default_fill_value=obj['default_fill_value'], - # default_kind=obj['default_kind'] - # ) elif typ == "block_index": return globals()[obj["klass"]](obj["length"], obj["blocs"], obj["blengths"]) elif typ == "int_index": diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 4f3f639de5cb1..55ccd838f8a16 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -40,8 +40,6 @@ MultiIndex, PeriodIndex, Series, - SparseDataFrame, - SparseSeries, TimedeltaIndex, concat, isna, @@ -173,12 +171,7 @@ class DuplicateWarning(Warning): """ # map object types -_TYPE_MAP = { - Series: "series", - SparseSeries: "sparse_series", - DataFrame: "frame", - SparseDataFrame: "sparse_frame", -} +_TYPE_MAP = {Series: "series", DataFrame: "frame"} # storer class map _STORER_MAP = { @@ -186,9 +179,7 @@ class DuplicateWarning(Warning): "DataFrame": "LegacyFrameFixed", "DataMatrix": "LegacyFrameFixed", "series": "SeriesFixed", - "sparse_series": "SparseSeriesFixed", "frame": "FrameFixed", - "sparse_frame": "SparseFrameFixed", } # table class map @@ -3078,83 +3069,6 @@ def write(self, obj, **kwargs): self.attrs.name = obj.name -class SparseFixed(GenericFixed): - def validate_read(self, kwargs): - """ - we don't support start, stop kwds in Sparse - """ - kwargs = super().validate_read(kwargs) - if "start" in kwargs or "stop" in kwargs: - raise NotImplementedError( - "start and/or stop are not supported in fixed Sparse reading" - ) - return kwargs - - -class SparseSeriesFixed(SparseFixed): - pandas_kind = "sparse_series" - attributes = ["name", "fill_value", "kind"] - - def read(self, **kwargs): - kwargs = self.validate_read(kwargs) - index = self.read_index("index") - sp_values = 
self.read_array("sp_values") - sp_index = self.read_index("sp_index") - return SparseSeries( - sp_values, - index=index, - sparse_index=sp_index, - kind=self.kind or "block", - fill_value=self.fill_value, - name=self.name, - ) - - def write(self, obj, **kwargs): - super().write(obj, **kwargs) - self.write_index("index", obj.index) - self.write_index("sp_index", obj.sp_index) - self.write_array("sp_values", obj.sp_values) - self.attrs.name = obj.name - self.attrs.fill_value = obj.fill_value - self.attrs.kind = obj.kind - - -class SparseFrameFixed(SparseFixed): - pandas_kind = "sparse_frame" - attributes = ["default_kind", "default_fill_value"] - - def read(self, **kwargs): - kwargs = self.validate_read(kwargs) - columns = self.read_index("columns") - sdict = {} - for c in columns: - key = "sparse_series_{columns}".format(columns=c) - s = SparseSeriesFixed(self.parent, getattr(self.group, key)) - s.infer_axes() - sdict[c] = s.read() - return SparseDataFrame( - sdict, - columns=columns, - default_kind=self.default_kind, - default_fill_value=self.default_fill_value, - ) - - def write(self, obj, **kwargs): - """ write it as a collection of individual sparse series """ - super().write(obj, **kwargs) - for name, ss in obj.items(): - key = "sparse_series_{name}".format(name=name) - if key not in self.group._v_children: - node = self._handle.create_group(self.group, key) - else: - node = getattr(self.group, key) - s = SparseSeriesFixed(self.parent, node) - s.write(ss) - self.attrs.default_fill_value = obj.default_fill_value - self.attrs.default_kind = obj.default_kind - self.write_index("columns", obj.columns) - - class BlockManagerFixed(GenericFixed): attributes = ["ndim", "nblocks"] is_shape_reversed = False diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 326bef7f4b480..2f24bbd6f0c85 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -67,9 +67,7 @@ class TestPDApi(Base): "UInt64Index", "Series", "SparseArray", - 
"SparseDataFrame", "SparseDtype", - "SparseSeries", "Timedelta", "TimedeltaIndex", "Timestamp", @@ -90,7 +88,7 @@ class TestPDApi(Base): "NamedAgg", ] if not compat.PY37: - classes.append("Panel") + classes.extend(["Panel", "SparseSeries", "SparseDataFrame"]) # these are already deprecated; awaiting removal deprecated_classes = [] diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py index cb5b437c962f9..071a8db707b69 100644 --- a/pandas/tests/arrays/sparse/test_arithmetics.py +++ b/pandas/tests/arrays/sparse/test_arithmetics.py @@ -21,8 +21,6 @@ def mix(request): return request.param -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") class TestSparseArrayArithmetics: _base = np.array @@ -391,48 +389,6 @@ def test_mixed_array_comparison(self, kind): self._check_comparison_ops(a, b, values, rvalues) -class TestSparseSeriesArithmetic(TestSparseArrayArithmetics): - - _base = pd.Series - _klass = pd.SparseSeries - - def _assert(self, a, b): - tm.assert_series_equal(a, b) - - def test_alignment(self, mix, all_arithmetic_functions): - op = all_arithmetic_functions - - da = pd.Series(np.arange(4)) - db = pd.Series(np.arange(4), index=[1, 2, 3, 4]) - - sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=0) - sb = pd.SparseSeries( - np.arange(4), index=[1, 2, 3, 4], dtype=np.int64, fill_value=0 - ) - self._check_numeric_ops(sa, sb, da, db, mix, op) - - sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=np.nan) - sb = pd.SparseSeries( - np.arange(4), index=[1, 2, 3, 4], dtype=np.int64, fill_value=np.nan - ) - self._check_numeric_ops(sa, sb, da, db, mix, op) - - da = pd.Series(np.arange(4)) - db = pd.Series(np.arange(4), index=[10, 11, 12, 13]) - - sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=0) - sb = pd.SparseSeries( - np.arange(4), index=[10, 11, 12, 13], dtype=np.int64, fill_value=0 - ) - 
self._check_numeric_ops(sa, sb, da, db, mix, op) - - sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=np.nan) - sb = pd.SparseSeries( - np.arange(4), index=[10, 11, 12, 13], dtype=np.int64, fill_value=np.nan - ) - self._check_numeric_ops(sa, sb, da, db, mix, op) - - @pytest.mark.parametrize("op", [operator.eq, operator.add]) def test_with_list(op): arr = pd.SparseArray([0, 1], fill_value=0) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index b94e2a16d217a..5d5ee565c7891 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import isna -from pandas.core.sparse.api import SparseArray, SparseDtype, SparseSeries +from pandas.core.sparse.api import SparseArray, SparseDtype import pandas.util.testing as tm from pandas.util.testing import assert_almost_equal @@ -221,36 +221,6 @@ def test_scalar_with_index_infer_dtype(self, scalar, dtype): assert arr.dtype == dtype assert exp.dtype == dtype - @pytest.mark.parametrize("fill", [1, np.nan, 0]) - @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") - def test_sparse_series_round_trip(self, kind, fill): - # see gh-13999 - arr = SparseArray([np.nan, 1, np.nan, 2, 3], kind=kind, fill_value=fill) - res = SparseArray(SparseSeries(arr)) - tm.assert_sp_array_equal(arr, res) - - arr = SparseArray( - [0, 0, 0, 1, 1, 2], dtype=np.int64, kind=kind, fill_value=fill - ) - res = SparseArray(SparseSeries(arr), dtype=np.int64) - tm.assert_sp_array_equal(arr, res) - - res = SparseArray(SparseSeries(arr)) - tm.assert_sp_array_equal(arr, res) - - @pytest.mark.parametrize("fill", [True, False, np.nan]) - @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") - def test_sparse_series_round_trip2(self, kind, fill): - # see gh-13999 - arr = SparseArray( - [True, False, True, True], dtype=np.bool, kind=kind, fill_value=fill - ) - res = SparseArray(SparseSeries(arr)) - 
tm.assert_sp_array_equal(arr, res) - - res = SparseArray(SparseSeries(arr)) - tm.assert_sp_array_equal(arr, res) - def test_get_item(self): assert np.isnan(self.arr[1]) @@ -1142,7 +1112,6 @@ def test_npoints(self): assert arr.npoints == 1 -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestAccessor: @pytest.mark.parametrize("attr", ["npoints", "density", "fill_value", "sp_values"]) def test_get_attributes(self, attr): diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index db9f647e0f0c7..266f7ac50c663 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -21,11 +21,8 @@ UNSIGNED_EA_INT_DTYPES, UNSIGNED_INT_DTYPES, ) -from pandas.core.sparse.api import SparseDtype import pandas.util.testing as tm -ignore_sparse_warning = pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") - # EA & Actual Dtypes def to_ea_dtypes(dtypes): @@ -179,10 +176,8 @@ def test_is_object(): @pytest.mark.parametrize( "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] ) -@ignore_sparse_warning def test_is_sparse(check_scipy): assert com.is_sparse(pd.SparseArray([1, 2, 3])) - assert com.is_sparse(pd.SparseSeries([1, 2, 3])) assert not com.is_sparse(np.array([1, 2, 3])) @@ -193,14 +188,12 @@ def test_is_sparse(check_scipy): @td.skip_if_no_scipy -@ignore_sparse_warning def test_is_scipy_sparse(): from scipy.sparse import bsr_matrix assert com.is_scipy_sparse(bsr_matrix([1, 2, 3])) assert not com.is_scipy_sparse(pd.SparseArray([1, 2, 3])) - assert not com.is_scipy_sparse(pd.SparseSeries([1, 2, 3])) def test_is_categorical(): @@ -586,7 +579,6 @@ def test_is_bool_dtype(): @pytest.mark.parametrize( "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] ) -@ignore_sparse_warning def test_is_extension_type(check_scipy): assert not com.is_extension_type([1, 2, 3]) assert not com.is_extension_type(np.array([1, 2, 3])) @@ -596,7 +588,6 @@ def test_is_extension_type(check_scipy): 
assert com.is_extension_type(cat) assert com.is_extension_type(pd.Series(cat)) assert com.is_extension_type(pd.SparseArray([1, 2, 3])) - assert com.is_extension_type(pd.SparseSeries([1, 2, 3])) assert com.is_extension_type(pd.DatetimeIndex(["2000"], tz="US/Eastern")) dtype = DatetimeTZDtype("ns", tz="US/Eastern") @@ -664,14 +655,6 @@ def test__get_dtype(input_param, result): assert com._get_dtype(input_param) == result -@ignore_sparse_warning -def test__get_dtype_sparse(): - ser = pd.SparseSeries([1, 2], dtype="int32") - expected = SparseDtype("int32") - assert com._get_dtype(ser) == expected - assert com._get_dtype(ser.dtype) == expected - - @pytest.mark.parametrize( "input_param,expected_error_message", [ @@ -723,11 +706,3 @@ def test__get_dtype_fails(input_param, expected_error_message): ) def test__is_dtype_type(input_param, result): assert com._is_dtype_type(input_param, lambda tipo: tipo == result) - - -@ignore_sparse_warning -def test__is_dtype_type_sparse(): - ser = pd.SparseSeries([1, 2], dtype="int32") - result = np.dtype("int32") - assert com._is_dtype_type(ser, lambda tipo: tipo == result) - assert com._is_dtype_type(ser.dtype, lambda tipo: tipo == result) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index d3f0d7c43ee6b..3288c9c584565 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -960,9 +960,8 @@ def test_is_bool_dtype(dtype, expected): assert result is expected -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_is_bool_dtype_sparse(): - result = is_bool_dtype(pd.SparseSeries([True, False])) + result = is_bool_dtype(pd.Series(pd.SparseArray([True, False]))) assert result is True diff --git a/pandas/tests/dtypes/test_generic.py b/pandas/tests/dtypes/test_generic.py index b42822a03ebcd..471fd06a29ae9 100644 --- a/pandas/tests/dtypes/test_generic.py +++ b/pandas/tests/dtypes/test_generic.py @@ -1,4 +1,4 @@ -from warnings import catch_warnings, 
simplefilter +from warnings import catch_warnings import numpy as np @@ -17,11 +17,6 @@ class TestABCClasses: categorical = pd.Categorical([1, 2, 3], categories=[2, 3, 1]) categorical_df = pd.DataFrame({"values": [1, 2, 3]}, index=categorical) df = pd.DataFrame({"names": ["a", "b", "c"]}, index=multi_index) - with catch_warnings(): - simplefilter("ignore", FutureWarning) - sparse_series = pd.Series([1, 2, 3]).to_sparse() - sparse_frame = pd.SparseDataFrame({"a": [1, -1, None]}) - sparse_array = pd.SparseArray(np.random.randn(10)) datetime_array = pd.core.arrays.DatetimeArray(datetime_index) timedelta_array = pd.core.arrays.TimedeltaArray(timedelta_index) @@ -40,9 +35,7 @@ def test_abc_types(self): assert isinstance(pd.Int64Index([1, 2, 3]), gt.ABCIndexClass) assert isinstance(pd.Series([1, 2, 3]), gt.ABCSeries) assert isinstance(self.df, gt.ABCDataFrame) - assert isinstance(self.sparse_series, gt.ABCSparseSeries) assert isinstance(self.sparse_array, gt.ABCSparseArray) - assert isinstance(self.sparse_frame, gt.ABCSparseDataFrame) assert isinstance(self.categorical, gt.ABCCategorical) assert isinstance(pd.Period("2012", freq="A-DEC"), gt.ABCPeriod) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 00b59fd4dc087..017cbea7ec723 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -27,7 +27,6 @@ import pandas.util.testing as tm -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestDataFrameAlterAxes: def test_set_index_directly(self, float_string_frame): df = float_string_frame @@ -1452,7 +1451,6 @@ def test_droplevel(self): tm.assert_frame_equal(result, expected) -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestIntervalIndex: def test_setitem(self): diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index c6852576f660b..d53a3d81ab5f8 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ 
-10,7 +10,6 @@ Categorical, DataFrame, Series, - SparseDataFrame, SparseDtype, compat, date_range, @@ -220,9 +219,6 @@ def test_iterrows(self, float_frame, float_string_frame): def test_iterrows_iso8601(self): # GH 19671 - if self.klass == SparseDataFrame: - pytest.xfail(reason="SparseBlock datetime type not implemented.") - s = self.klass( { "non_iso8601": ["M1701", "M1802", "M1903", "M2004"], diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index a78b2ab7d1c4c..96f56bdef6286 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -2145,13 +2145,6 @@ def test_loc_duplicates(self): df.loc[trange[bool_idx], "A"] += 6 tm.assert_frame_equal(df, expected) - @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") - def test_iloc_sparse_propegate_fill_value(self): - from pandas.core.sparse.api import SparseDataFrame - - df = SparseDataFrame({"A": [999, 1]}, default_fill_value=999) - assert len(df["A"].sp_values) == len(df.iloc[:, 0].sp_values) - def test_iat(self, float_frame): for i, row in enumerate(float_frame.index): diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index c66a97c2b294b..649a78b785d21 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -190,38 +190,6 @@ def test_subclass_iterrows(self): assert isinstance(row, tm.SubclassedSeries) tm.assert_series_equal(row, df.loc[i]) - @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") - def test_subclass_sparse_slice(self): - rows = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]] - ssdf = tm.SubclassedSparseDataFrame(rows) - ssdf.testattr = "testattr" - - tm.assert_sp_frame_equal(ssdf.loc[:2], tm.SubclassedSparseDataFrame(rows[:3])) - tm.assert_sp_frame_equal(ssdf.iloc[:2], tm.SubclassedSparseDataFrame(rows[:2])) - tm.assert_sp_frame_equal(ssdf[:2], tm.SubclassedSparseDataFrame(rows[:2])) - assert ssdf.loc[:2].testattr == "testattr" - assert 
ssdf.iloc[:2].testattr == "testattr" - assert ssdf[:2].testattr == "testattr" - - tm.assert_sp_series_equal( - ssdf.loc[1], - tm.SubclassedSparseSeries(rows[1]), - check_names=False, - check_kind=False, - ) - tm.assert_sp_series_equal( - ssdf.iloc[1], - tm.SubclassedSparseSeries(rows[1]), - check_names=False, - check_kind=False, - ) - - @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") - def test_subclass_sparse_transpose(self): - ossdf = tm.SubclassedSparseDataFrame([[1, 2, 3], [4, 5, 6]]) - essdf = tm.SubclassedSparseDataFrame([[1, 4], [2, 5], [3, 6]]) - tm.assert_sp_frame_equal(ossdf.T, essdf) - def test_subclass_stack(self): # GH 15564 df = tm.SubclassedDataFrame( diff --git a/pandas/tests/io/data/legacy_pickle/0.20.3/0.20.3_x86_64_darwin_3.5.6.pickle b/pandas/tests/io/data/legacy_pickle/0.20.3/0.20.3_x86_64_darwin_3.5.6.pickle new file mode 100644 index 0000000000000..88bb6989f5b08 Binary files /dev/null and b/pandas/tests/io/data/legacy_pickle/0.20.3/0.20.3_x86_64_darwin_3.5.6.pickle differ diff --git a/pandas/tests/io/data/sparseframe-0.20.3.pickle.gz b/pandas/tests/io/data/sparseframe-0.20.3.pickle.gz new file mode 100644 index 0000000000000..f4ff0dbaa1ff9 Binary files /dev/null and b/pandas/tests/io/data/sparseframe-0.20.3.pickle.gz differ diff --git a/pandas/tests/io/data/sparseseries-0.20.3.pickle.gz b/pandas/tests/io/data/sparseseries-0.20.3.pickle.gz new file mode 100644 index 0000000000000..b299e7d85808e Binary files /dev/null and b/pandas/tests/io/data/sparseseries-0.20.3.pickle.gz differ diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py index 2d2938697bd80..e63644a44a81f 100755 --- a/pandas/tests/io/generate_legacy_storage_files.py +++ b/pandas/tests/io/generate_legacy_storage_files.py @@ -11,12 +11,12 @@ cd ~/ $ python pandas/pandas/tests/io/generate_legacy_storage_files.py \ - pandas/pandas/tests/io/data/legacy_pickle/0.18.1/ pickle + 
pandas/pandas/tests/io/data/legacy_pickle/0.20.3/ pickle This script generates a storage file for the current arch, system, and python version pandas version: 0.20.3 - output dir : pandas/pandas/tests/io/data/legacy_pickle/0.18.1/ + output dir : pandas/pandas/tests/io/data/legacy_pickle/0.20.3/ storage format: pickle created pickle file: 0.20.3_x86_64_darwin_3.5.2.pickle @@ -53,8 +53,6 @@ Period, RangeIndex, Series, - SparseDataFrame, - SparseSeries, Timestamp, bdate_range, date_range, @@ -86,6 +84,13 @@ YearEnd, ) +try: + # TODO: remove try/except when 0.24.0 is the legacy version. + from pandas.arrays import SparseArray +except ImportError: + from pandas.core.sparse.api import SparseArray + + _loose_version = LooseVersion(pandas.__version__) @@ -97,7 +102,7 @@ def _create_sp_series(): arr[7:12] = nan arr[-1:] = nan - bseries = SparseSeries(arr, kind="block") + bseries = Series(SparseArray(arr, kind="block")) bseries.name = "bseries" return bseries @@ -111,7 +116,7 @@ def _create_sp_tsseries(): arr[-1:] = nan date_index = bdate_range("1/1/2011", periods=len(arr)) - bseries = SparseSeries(arr, index=date_index, kind="block") + bseries = Series(SparseArray(arr, kind="block"), index=date_index) bseries.name = "btsseries" return bseries @@ -127,7 +132,7 @@ def _create_sp_frame(): } dates = bdate_range("1/1/2011", periods=10) - return SparseDataFrame(data, index=dates) + return DataFrame(data, index=dates).apply(SparseArray) def create_data(): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 9842a706f43d7..7f861da6eb1f8 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1296,21 +1296,18 @@ def test_datetime_tz(self): s_naive = Series(tz_naive) assert stz.to_json() == s_naive.to_json() - @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") - @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") - 
@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") def test_sparse(self): # GH4377 df.to_json segfaults with non-ndarray blocks df = pd.DataFrame(np.random.randn(10, 4)) df.loc[:8] = np.nan - sdf = df.to_sparse() + sdf = df.astype("Sparse") expected = df.to_json() assert expected == sdf.to_json() s = pd.Series(np.random.randn(10)) s.loc[:8] = np.nan - ss = s.to_sparse() + ss = s.astype("Sparse") expected = s.to_json() assert expected == ss.to_json() diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index 77cac00882771..856d97e29f2c0 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -71,13 +71,6 @@ ignore_natural_naming_warning = pytest.mark.filterwarnings( "ignore:object name:tables.exceptions.NaturalNameWarning" ) -ignore_sparse = pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -ignore_dataframe_tosparse = pytest.mark.filterwarnings( - "ignore:DataFrame.to_sparse:FutureWarning" -) -ignore_series_tosparse = pytest.mark.filterwarnings( - "ignore:Series.to_sparse:FutureWarning" -) # contextmanager to ensure the file cleanup @@ -2353,38 +2346,6 @@ def test_series(self): ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object)) self._check_roundtrip(ts3, tm.assert_series_equal, check_index_type=False) - @ignore_sparse - @ignore_series_tosparse - def test_sparse_series(self): - - s = tm.makeStringSeries() - s.iloc[3:5] = np.nan - ss = s.to_sparse() - self._check_roundtrip(ss, tm.assert_series_equal, check_series_type=True) - - ss2 = s.to_sparse(kind="integer") - self._check_roundtrip(ss2, tm.assert_series_equal, check_series_type=True) - - ss3 = s.to_sparse(fill_value=0) - self._check_roundtrip(ss3, tm.assert_series_equal, check_series_type=True) - - @ignore_sparse - @ignore_dataframe_tosparse - def test_sparse_frame(self): - - s = tm.makeDataFrame() - s.iloc[3:5, 1:3] = np.nan - s.iloc[8:10, -2] = np.nan - ss = 
s.to_sparse() - - self._check_double_roundtrip(ss, tm.assert_frame_equal, check_frame_type=True) - - ss2 = s.to_sparse(kind="integer") - self._check_double_roundtrip(ss2, tm.assert_frame_equal, check_frame_type=True) - - ss3 = s.to_sparse(fill_value=0) - self._check_double_roundtrip(ss3, tm.assert_frame_equal, check_frame_type=True) - def test_float_index(self): # GH #454 @@ -2709,40 +2670,6 @@ def test_overwrite_node(self): tm.assert_series_equal(store["a"], ts) - @ignore_sparse - @ignore_dataframe_tosparse - def test_sparse_with_compression(self): - - # GH 2931 - - # make sparse dataframe - arr = np.random.binomial(n=1, p=0.01, size=(1000, 10)) - df = DataFrame(arr).to_sparse(fill_value=0) - - # case 1: store uncompressed - self._check_double_roundtrip( - df, tm.assert_frame_equal, compression=False, check_frame_type=True - ) - - # case 2: store compressed (works) - self._check_double_roundtrip( - df, tm.assert_frame_equal, compression="zlib", check_frame_type=True - ) - - # set one series to be completely sparse - df[0] = np.zeros(1000) - - # case 3: store df with completely sparse series uncompressed - self._check_double_roundtrip( - df, tm.assert_frame_equal, compression=False, check_frame_type=True - ) - - # case 4: try storing df with completely sparse series compressed - # (fails) - self._check_double_roundtrip( - df, tm.assert_frame_equal, compression="zlib", check_frame_type=True - ) - def test_select(self): with ensure_clean_store(self.path) as store: @@ -3890,8 +3817,6 @@ def test_start_stop_multiple(self): expected = df.loc[[0], ["foo", "bar"]] tm.assert_frame_equal(result, expected) - @ignore_sparse - @ignore_dataframe_tosparse def test_start_stop_fixed(self): with ensure_clean_store(self.path) as store: @@ -3931,10 +3856,6 @@ def test_start_stop_fixed(self): df = tm.makeDataFrame() df.iloc[3:5, 1:3] = np.nan df.iloc[8:10, -2] = np.nan - dfs = df.to_sparse() - store.put("dfs", dfs) - with pytest.raises(NotImplementedError): - store.select("dfs", 
start=0, stop=5) def test_select_filter_corner(self): diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index ee668d6890756..ea69245924b0c 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -14,8 +14,10 @@ pyarrow_version = LooseVersion(pyarrow.__version__) +filter_sparse = pytest.mark.filterwarnings("ignore:The Sparse") +@filter_sparse @pytest.mark.single class TestFeather: def check_error_on_write(self, df, exc): diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index 33a11087f622d..0bafbab069dd4 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -585,49 +585,6 @@ def test_dataframe_duplicate_column_names(self): assert_frame_equal(result_3, expected_3) -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") -class TestSparse(TestPackers): - def _check_roundtrip(self, obj, comparator, **kwargs): - - # currently these are not implemetned - # i_rec = self.encode_decode(obj) - # comparator(obj, i_rec, **kwargs) - msg = r"msgpack sparse (series|frame) is not implemented" - with pytest.raises(NotImplementedError, match=msg): - self.encode_decode(obj) - - def test_sparse_series(self): - - s = tm.makeStringSeries() - s[3:5] = np.nan - ss = s.to_sparse() - self._check_roundtrip(ss, tm.assert_series_equal, check_series_type=True) - - ss2 = s.to_sparse(kind="integer") - self._check_roundtrip(ss2, tm.assert_series_equal, check_series_type=True) - - ss3 = s.to_sparse(fill_value=0) - self._check_roundtrip(ss3, tm.assert_series_equal, check_series_type=True) - - def test_sparse_frame(self): - - s = tm.makeDataFrame() - s.loc[3:5, 1:3] = np.nan - s.loc[8:10, -2] = np.nan - ss = s.to_sparse() - - self._check_roundtrip(ss, tm.assert_frame_equal, 
check_frame_type=True) - - ss2 = s.to_sparse(kind="integer") - self._check_roundtrip(ss2, tm.assert_frame_equal, check_frame_type=True) - - ss3 = s.to_sparse(fill_value=0) - self._check_roundtrip(ss3, tm.assert_frame_equal, check_frame_type=True) - - @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestCompression(TestPackers): """See https://github.com/pandas-dev/pandas/pull/9783 @@ -878,7 +835,6 @@ def legacy_packer(request, datapath): return datapath(request.param) -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestMsgpack: """ diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 30555508f0998..4124c9aff9d34 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -49,8 +49,8 @@ def compare_element(result, expected, typ, version=None): return if typ.startswith("sp_"): - comparator = getattr(tm, "assert_{typ}_equal".format(typ=typ)) - comparator(result, expected, exact_indices=False) + comparator = tm.assert_equal + comparator(result, expected) elif typ == "timestamp": if expected is pd.NaT: assert result is pd.NaT @@ -82,10 +82,6 @@ def compare(data, vf, version): return data -def compare_sp_series_ts(res, exp, typ, version): - tm.assert_sp_series_equal(res, exp) - - def compare_series_ts(result, expected, typ, version): # GH 7748 tm.assert_series_equal(result, expected) @@ -134,10 +130,6 @@ def compare_index_period(result, expected, typ, version): tm.assert_index_equal(result.shift(2), expected.shift(2)) -def compare_sp_frame_float(result, expected, typ, version): - tm.assert_sp_frame_equal(result, expected) - - files = glob.glob( os.path.join(os.path.dirname(__file__), "data", "legacy_pickle", "*", "*.pickle") ) @@ -151,7 +143,6 @@ def legacy_pickle(request, datapath): # --------------------- # tests # --------------------- -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def 
test_pickles(current_pickle_data, legacy_pickle): if not is_platform_little_endian(): pytest.skip("known failure on non-little endian") @@ -162,7 +153,6 @@ def test_pickles(current_pickle_data, legacy_pickle): compare(current_pickle_data, legacy_pickle, version) -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_round_trip_current(current_pickle_data): def python_pickler(obj, path): with open(path, "wb") as fh: @@ -238,6 +228,32 @@ def test_pickle_path_localpath(): tm.assert_frame_equal(df, result) +def test_legacy_sparse_warning(datapath): + """ + + Generated with + + >>> df = pd.DataFrame({"A": [1, 2, 3, 4], "B": [0, 0, 1, 1]}).to_sparse() + >>> df.to_pickle("pandas/tests/io/data/sparseframe-0.20.3.pickle.gz", + ... compression="gzip") + + >>> s = df['B'] + >>> s.to_pickle("pandas/tests/io/data/sparseseries-0.20.3.pickle.gz", + ... compression="gzip") + """ + with tm.assert_produces_warning(FutureWarning): + simplefilter("ignore", DeprecationWarning) # from boto + pd.read_pickle( + datapath("io", "data", "sparseseries-0.20.3.pickle.gz"), compression="gzip" + ) + + with tm.assert_produces_warning(FutureWarning): + simplefilter("ignore", DeprecationWarning) # from boto + pd.read_pickle( + datapath("io", "data", "sparseframe-0.20.3.pickle.gz"), compression="gzip" + ) + + # --------------------- # test pickle compression # --------------------- diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 149930059d868..9d08981d39894 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -13,7 +13,6 @@ from pandas.util.testing import assert_frame_equal -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestGetDummies: @pytest.fixture def df(self): @@ -273,7 +272,7 @@ def test_dataframe_dummies_subset(self, df, sparse): expected[["C"]] = df[["C"]] if sparse: cols = ["from_A_a", "from_A_b"] - expected[cols] = expected[cols].apply(lambda x: pd.SparseSeries(x)) + 
expected[cols] = expected[cols].astype(pd.SparseDtype("uint8", 0)) assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_sep(self, df, sparse): @@ -292,7 +291,7 @@ def test_dataframe_dummies_prefix_sep(self, df, sparse): expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]] if sparse: cols = ["A..a", "A..b", "B..b", "B..c"] - expected[cols] = expected[cols].apply(lambda x: pd.SparseSeries(x)) + expected[cols] = expected[cols].astype(pd.SparseDtype("uint8", 0)) assert_frame_equal(result, expected) @@ -329,7 +328,7 @@ def test_dataframe_dummies_prefix_dict(self, sparse): columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"] expected[columns] = expected[columns].astype(np.uint8) if sparse: - expected[columns] = expected[columns].apply(lambda x: pd.SparseSeries(x)) + expected[columns] = expected[columns].astype(pd.SparseDtype("uint8", 0)) assert_frame_equal(result, expected) @@ -495,7 +494,7 @@ def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype): expected = expected[["C", "A_b", "B_c", "cat_y"]] if sparse: for col in cols: - expected[col] = pd.SparseSeries(expected[col]) + expected[col] = pd.SparseArray(expected[col]) assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_na(self, df, sparse): @@ -517,7 +516,7 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse): expected = expected.sort_index(axis=1) if sparse: for col in cols: - expected[col] = pd.SparseSeries(expected[col]) + expected[col] = pd.SparseArray(expected[col]) assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index d204d7d2a1d7c..9aba17c076bc7 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -131,12 +131,6 @@ def test_sort_index_name(self): result = self.ts.sort_index(ascending=False) assert result.name == self.ts.name - @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") - 
@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") - def test_to_sparse_pass_name(self): - result = self.ts.to_sparse() - assert result.name == self.ts.name - def test_constructor_dict(self): d = {"a": 0.0, "b": 1.0, "c": 2.0} result = self.series_klass(d) @@ -206,11 +200,9 @@ def test_constructor_dict_timedelta_index(self): ) self._assert_series_equal(result, expected) - @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_from_array_deprecated(self): - # multiple FutureWarnings, so can't assert stacklevel - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=True): self.series_klass.from_array([1, 2, 3]) def test_sparse_accessor_updates_on_inplace(self): diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index 5b77ef58b2ef8..819b9228219aa 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -229,8 +229,6 @@ def test_combine_first_dt_tz_values(self, tz_naive_fixture): exp = pd.Series(exp_vals, name="ser1") assert_series_equal(exp, result) - @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") - @pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") def test_concat_empty_series_dtypes(self): # booleans @@ -287,7 +285,10 @@ def test_concat_empty_series_dtypes(self): # sparse # TODO: move? 
result = pd.concat( - [Series(dtype="float64").to_sparse(), Series(dtype="float64").to_sparse()] + [ + Series(dtype="float64").astype("Sparse"), + Series(dtype="float64").astype("Sparse"), + ] ) assert result.dtype == "Sparse[float64]" @@ -296,10 +297,10 @@ def test_concat_empty_series_dtypes(self): assert result.ftype == "float64:sparse" result = pd.concat( - [Series(dtype="float64").to_sparse(), Series(dtype="float64")] + [Series(dtype="float64").astype("Sparse"), Series(dtype="float64")] ) # TODO: release-note: concat sparse dtype - expected = pd.core.sparse.api.SparseDtype(np.float64) + expected = pd.SparseDtype(np.float64) assert result.dtype == expected # GH 26705 - Assert .ftype is deprecated @@ -307,10 +308,10 @@ def test_concat_empty_series_dtypes(self): assert result.ftype == "float64:sparse" result = pd.concat( - [Series(dtype="float64").to_sparse(), Series(dtype="object")] + [Series(dtype="float64").astype("Sparse"), Series(dtype="object")] ) # TODO: release-note: concat sparse dtype - expected = pd.core.sparse.api.SparseDtype("object") + expected = pd.SparseDtype("object") assert result.dtype == expected # GH 26705 - Assert .ftype is deprecated diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index ddd2c566f4cda..f459ae9e7845d 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -6,7 +6,6 @@ import pytz from pandas._libs.tslib import iNaT -from pandas.errors import PerformanceWarning import pandas.util._test_decorators as td import pandas as pd @@ -992,65 +991,6 @@ def test_series_fillna_limit(self): expected[:3] = np.nan assert_series_equal(result, expected) - @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") - @pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") - def test_sparse_series_fillna_limit(self): - index = np.arange(10) - s = Series(np.random.randn(10), index=index) - - ss = s[:2].reindex(index).to_sparse() - # TODO: what is this test 
doing? why are result an expected - # the same call to fillna? - with tm.assert_produces_warning( - PerformanceWarning, raise_on_extra_warnings=False - ): - # TODO: release-note fillna performance warning - result = ss.fillna(method="pad", limit=5) - expected = ss.fillna(method="pad", limit=5) - expected = expected.to_dense() - expected[-3:] = np.nan - expected = expected.to_sparse() - assert_series_equal(result, expected) - - ss = s[-2:].reindex(index).to_sparse() - with tm.assert_produces_warning( - PerformanceWarning, raise_on_extra_warnings=False - ): - result = ss.fillna(method="backfill", limit=5) - expected = ss.fillna(method="backfill") - expected = expected.to_dense() - expected[:3] = np.nan - expected = expected.to_sparse() - assert_series_equal(result, expected) - - @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") - @pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") - def test_sparse_series_pad_backfill_limit(self): - index = np.arange(10) - s = Series(np.random.randn(10), index=index) - s = s.to_sparse() - - result = s[:2].reindex(index, method="pad", limit=5) - with tm.assert_produces_warning( - PerformanceWarning, raise_on_extra_warnings=False - ): - expected = s[:2].reindex(index).fillna(method="pad") - expected = expected.to_dense() - expected[-3:] = np.nan - expected = expected.to_sparse() - assert_series_equal(result, expected) - - result = s[-2:].reindex(index, method="backfill", limit=5) - with tm.assert_produces_warning( - PerformanceWarning, raise_on_extra_warnings=False - ): - expected = s[-2:].reindex(index).fillna(method="backfill") - expected = expected.to_dense() - expected[:3] = np.nan - expected = expected.to_sparse() - assert_series_equal(result, expected) - - @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_series_pad_backfill_limit(self): index = np.arange(10) s = Series(np.random.randn(10), index=index) diff --git a/pandas/tests/series/test_subclass.py 
b/pandas/tests/series/test_subclass.py index 450fdc3f4dd6f..6b82f890e974b 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -1,8 +1,3 @@ -import numpy as np -import pytest - -import pandas as pd -from pandas import SparseDtype import pandas.util.testing as tm @@ -38,67 +33,3 @@ def test_subclass_unstack(self): def test_subclass_empty_repr(self): assert "SubclassedSeries" in repr(tm.SubclassedSeries()) - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -class TestSparseSeriesSubclassing: - def test_subclass_sparse_slice(self): - # int64 - s = tm.SubclassedSparseSeries([1, 2, 3, 4, 5]) - exp = tm.SubclassedSparseSeries([2, 3, 4], index=[1, 2, 3]) - tm.assert_sp_series_equal(s.loc[1:3], exp) - assert s.loc[1:3].dtype == SparseDtype(np.int64) - - exp = tm.SubclassedSparseSeries([2, 3], index=[1, 2]) - tm.assert_sp_series_equal(s.iloc[1:3], exp) - assert s.iloc[1:3].dtype == SparseDtype(np.int64) - - exp = tm.SubclassedSparseSeries([2, 3], index=[1, 2]) - tm.assert_sp_series_equal(s[1:3], exp) - assert s[1:3].dtype == SparseDtype(np.int64) - - # float64 - s = tm.SubclassedSparseSeries([1.0, 2.0, 3.0, 4.0, 5.0]) - exp = tm.SubclassedSparseSeries([2.0, 3.0, 4.0], index=[1, 2, 3]) - tm.assert_sp_series_equal(s.loc[1:3], exp) - assert s.loc[1:3].dtype == SparseDtype(np.float64) - - exp = tm.SubclassedSparseSeries([2.0, 3.0], index=[1, 2]) - tm.assert_sp_series_equal(s.iloc[1:3], exp) - assert s.iloc[1:3].dtype == SparseDtype(np.float64) - - exp = tm.SubclassedSparseSeries([2.0, 3.0], index=[1, 2]) - tm.assert_sp_series_equal(s[1:3], exp) - assert s[1:3].dtype == SparseDtype(np.float64) - - def test_subclass_sparse_addition(self): - s1 = tm.SubclassedSparseSeries([1, 3, 5]) - s2 = tm.SubclassedSparseSeries([-2, 5, 12]) - exp = tm.SubclassedSparseSeries([-1, 8, 17]) - tm.assert_sp_series_equal(s1 + s2, exp) - - s1 = tm.SubclassedSparseSeries([4.0, 5.0, 6.0]) - s2 = tm.SubclassedSparseSeries([1.0, 2.0, 3.0]) - exp = 
tm.SubclassedSparseSeries([5.0, 7.0, 9.0]) - tm.assert_sp_series_equal(s1 + s2, exp) - - def test_subclass_sparse_to_frame(self): - s = tm.SubclassedSparseSeries([1, 2], index=list("ab"), name="xxx") - res = s.to_frame() - - exp_arr = pd.SparseArray([1, 2], dtype=np.int64, kind="block", fill_value=0) - exp = tm.SubclassedSparseDataFrame( - {"xxx": exp_arr}, index=list("ab"), default_fill_value=0 - ) - tm.assert_sp_frame_equal(res, exp) - - # create from int dict - res = tm.SubclassedSparseDataFrame( - {"xxx": [1, 2]}, index=list("ab"), default_fill_value=0 - ) - tm.assert_sp_frame_equal(res, exp) - - s = tm.SubclassedSparseSeries([1.1, 2.1], index=list("ab"), name="xxx") - res = s.to_frame() - exp = tm.SubclassedSparseDataFrame({"xxx": [1.1, 2.1]}, index=list("ab")) - tm.assert_sp_frame_equal(res, exp) diff --git a/pandas/tests/sparse/common.py b/pandas/tests/sparse/common.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/pandas/tests/sparse/frame/__init__.py b/pandas/tests/sparse/frame/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/pandas/tests/sparse/frame/conftest.py b/pandas/tests/sparse/frame/conftest.py deleted file mode 100644 index 989b58419c2cd..0000000000000 --- a/pandas/tests/sparse/frame/conftest.py +++ /dev/null @@ -1,120 +0,0 @@ -import numpy as np -import pytest - -from pandas import DataFrame, SparseArray, SparseDataFrame, bdate_range - -data = { - "A": [np.nan, np.nan, np.nan, 0, 1, 2, 3, 4, 5, 6], - "B": [0, 1, 2, np.nan, np.nan, np.nan, 3, 4, 5, 6], - "C": np.arange(10, dtype=np.float64), - "D": [0, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan], -} -dates = bdate_range("1/1/2011", periods=10) - - -# fixture names must be compatible with the tests in -# tests/frame/test_api.SharedWithSparse - - -@pytest.fixture -def float_frame_dense(): - """ - Fixture for dense DataFrame of floats with DatetimeIndex - - Columns are ['A', 'B', 'C', 'D']; some entries are missing - """ - return 
DataFrame(data, index=dates) - - -@pytest.fixture -def float_frame(): - """ - Fixture for sparse DataFrame of floats with DatetimeIndex - - Columns are ['A', 'B', 'C', 'D']; some entries are missing - """ - # default_kind='block' is the default - return SparseDataFrame(data, index=dates, default_kind="block") - - -@pytest.fixture -def float_frame_int_kind(): - """ - Fixture for sparse DataFrame of floats with DatetimeIndex - - Columns are ['A', 'B', 'C', 'D'] and default_kind='integer'. - Some entries are missing. - """ - return SparseDataFrame(data, index=dates, default_kind="integer") - - -@pytest.fixture -def float_string_frame(): - """ - Fixture for sparse DataFrame of floats and strings with DatetimeIndex - - Columns are ['A', 'B', 'C', 'D', 'foo']; some entries are missing - """ - sdf = SparseDataFrame(data, index=dates) - sdf["foo"] = SparseArray(["bar"] * len(dates)) - return sdf - - -@pytest.fixture -def float_frame_fill0_dense(): - """ - Fixture for dense DataFrame of floats with DatetimeIndex - - Columns are ['A', 'B', 'C', 'D']; missing entries have been filled with 0 - """ - values = SparseDataFrame(data).values - values[np.isnan(values)] = 0 - return DataFrame(values, columns=["A", "B", "C", "D"], index=dates) - - -@pytest.fixture -def float_frame_fill0(): - """ - Fixture for sparse DataFrame of floats with DatetimeIndex - - Columns are ['A', 'B', 'C', 'D']; missing entries have been filled with 0 - """ - values = SparseDataFrame(data).values - values[np.isnan(values)] = 0 - return SparseDataFrame( - values, columns=["A", "B", "C", "D"], default_fill_value=0, index=dates - ) - - -@pytest.fixture -def float_frame_fill2_dense(): - """ - Fixture for dense DataFrame of floats with DatetimeIndex - - Columns are ['A', 'B', 'C', 'D']; missing entries have been filled with 2 - """ - values = SparseDataFrame(data).values - values[np.isnan(values)] = 2 - return DataFrame(values, columns=["A", "B", "C", "D"], index=dates) - - -@pytest.fixture -def 
float_frame_fill2(): - """ - Fixture for sparse DataFrame of floats with DatetimeIndex - - Columns are ['A', 'B', 'C', 'D']; missing entries have been filled with 2 - """ - values = SparseDataFrame(data).values - values[np.isnan(values)] = 2 - return SparseDataFrame( - values, columns=["A", "B", "C", "D"], default_fill_value=2, index=dates - ) - - -@pytest.fixture -def empty_frame(): - """ - Fixture for empty SparseDataFrame - """ - return SparseDataFrame() diff --git a/pandas/tests/sparse/frame/test_analytics.py b/pandas/tests/sparse/frame/test_analytics.py deleted file mode 100644 index fae879b3d33b5..0000000000000 --- a/pandas/tests/sparse/frame/test_analytics.py +++ /dev/null @@ -1,41 +0,0 @@ -import numpy as np -import pytest - -from pandas import DataFrame, SparseDataFrame, SparseSeries -from pandas.util import testing as tm - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)") -def test_quantile(): - # GH 17386 - data = [[1, 1], [2, 10], [3, 100], [np.nan, np.nan]] - q = 0.1 - - sparse_df = SparseDataFrame(data) - result = sparse_df.quantile(q) - - dense_df = DataFrame(data) - dense_expected = dense_df.quantile(q) - sparse_expected = SparseSeries(dense_expected) - - tm.assert_series_equal(result, dense_expected) - tm.assert_sp_series_equal(result, sparse_expected) - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)") -def test_quantile_multi(): - # GH 17386 - data = [[1, 1], [2, 10], [3, 100], [np.nan, np.nan]] - q = [0.1, 0.5] - - sparse_df = SparseDataFrame(data) - result = sparse_df.quantile(q) - - dense_df = DataFrame(data) - dense_expected = dense_df.quantile(q) - sparse_expected = SparseDataFrame(dense_expected) - - tm.assert_frame_equal(result, dense_expected) - tm.assert_sp_frame_equal(result, sparse_expected) diff --git a/pandas/tests/sparse/frame/test_apply.py 
b/pandas/tests/sparse/frame/test_apply.py deleted file mode 100644 index d8158db32d8f0..0000000000000 --- a/pandas/tests/sparse/frame/test_apply.py +++ /dev/null @@ -1,117 +0,0 @@ -import numpy as np -import pytest - -from pandas import DataFrame, Series, SparseDataFrame, bdate_range -from pandas.core import nanops -from pandas.core.sparse.api import SparseDtype -from pandas.util import testing as tm - - -@pytest.fixture -def dates(): - return bdate_range("1/1/2011", periods=10) - - -@pytest.fixture -def empty(): - return SparseDataFrame() - - -@pytest.fixture -def frame(dates): - data = { - "A": [np.nan, np.nan, np.nan, 0, 1, 2, 3, 4, 5, 6], - "B": [0, 1, 2, np.nan, np.nan, np.nan, 3, 4, 5, 6], - "C": np.arange(10, dtype=np.float64), - "D": [0, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan], - } - - return SparseDataFrame(data, index=dates) - - -@pytest.fixture -def fill_frame(frame): - values = frame.values.copy() - values[np.isnan(values)] = 2 - - return SparseDataFrame( - values, columns=["A", "B", "C", "D"], default_fill_value=2, index=frame.index - ) - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") -def test_apply(frame): - applied = frame.apply(np.sqrt) - assert isinstance(applied, SparseDataFrame) - tm.assert_almost_equal(applied.values, np.sqrt(frame.values)) - - # agg / broadcast - # two FutureWarnings, so we can't check stacklevel properly. 
- with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - broadcasted = frame.apply(np.sum, broadcast=True) - assert isinstance(broadcasted, SparseDataFrame) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - exp = frame.to_dense().apply(np.sum, broadcast=True) - tm.assert_frame_equal(broadcasted.to_dense(), exp) - - applied = frame.apply(np.sum) - tm.assert_series_equal(applied, frame.to_dense().apply(nanops.nansum).to_sparse()) - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -def test_apply_fill(fill_frame): - applied = fill_frame.apply(np.sqrt) - assert applied["A"].fill_value == np.sqrt(2) - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -def test_apply_empty(empty): - assert empty.apply(np.sqrt) is empty - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") -def test_apply_nonuq(): - orig = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) - sparse = orig.to_sparse() - res = sparse.apply(lambda s: s[0], axis=1) - exp = orig.apply(lambda s: s[0], axis=1) - - # dtype must be kept - assert res.dtype == SparseDtype(np.int64) - - # ToDo: apply must return subclassed dtype - assert isinstance(res, Series) - tm.assert_series_equal(res.to_dense(), exp) - - # df.T breaks - sparse = orig.T.to_sparse() - res = sparse.apply(lambda s: s[0], axis=0) # noqa - exp = orig.T.apply(lambda s: s[0], axis=0) - - # TODO: no non-unique columns supported in sparse yet - # tm.assert_series_equal(res.to_dense(), exp) - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -def test_applymap(frame): - # just test that it works - result = frame.applymap(lambda x: x * 2) - assert isinstance(result, SparseDataFrame) - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -def test_apply_keep_sparse_dtype(): - # GH 23744 - sdf = SparseDataFrame( - np.array([[0, 1, 0], [0, 0, 0], [0, 0, 1]]), - 
columns=["b", "a", "c"], - default_fill_value=1, - ) - df = DataFrame(sdf) - - expected = sdf.apply(np.exp) - result = df.apply(np.exp) - tm.assert_frame_equal(expected, result) diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py deleted file mode 100644 index e372e2563e682..0000000000000 --- a/pandas/tests/sparse/frame/test_frame.py +++ /dev/null @@ -1,1596 +0,0 @@ -import operator -from types import LambdaType - -import numpy as np -from numpy import nan -import pytest - -from pandas._libs.sparse import BlockIndex, IntIndex -from pandas.errors import PerformanceWarning - -import pandas as pd -from pandas import DataFrame, Series, bdate_range, compat -from pandas.core import ops -from pandas.core.indexes.datetimes import DatetimeIndex -from pandas.core.sparse import frame as spf -from pandas.core.sparse.api import ( - SparseArray, - SparseDataFrame, - SparseDtype, - SparseSeries, -) -from pandas.tests.frame.test_api import SharedWithSparse -from pandas.util import testing as tm - -from pandas.tseries.offsets import BDay - - -def test_deprecated(): - with tm.assert_produces_warning(FutureWarning): - pd.SparseDataFrame({"A": [1, 2]}) - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") -class TestSparseDataFrame(SharedWithSparse): - klass = SparseDataFrame - - # SharedWithSparse tests use generic, klass-agnostic assertion - _assert_frame_equal = staticmethod(tm.assert_sp_frame_equal) - _assert_series_equal = staticmethod(tm.assert_sp_series_equal) - - def test_iterrows(self, float_frame, float_string_frame): - # Same as parent, but we don't ensure the sparse kind is the same. 
- for k, v in float_frame.iterrows(): - exp = float_frame.loc[k] - tm.assert_sp_series_equal(v, exp, check_kind=False) - - for k, v in float_string_frame.iterrows(): - exp = float_string_frame.loc[k] - tm.assert_sp_series_equal(v, exp, check_kind=False) - - def test_itertuples(self, float_frame): - for i, tup in enumerate(float_frame.itertuples()): - s = self.klass._constructor_sliced(tup[1:]) - s.name = tup[0] - expected = float_frame.iloc[i, :].reset_index(drop=True) - tm.assert_sp_series_equal(s, expected, check_kind=False) - - def test_fill_value_when_combine_const(self): - # GH12723 - dat = np.array([0, 1, np.nan, 3, 4, 5], dtype="float") - df = SparseDataFrame({"foo": dat}, index=range(6)) - - exp = df.fillna(0).add(2) - res = df.add(2, fill_value=0) - tm.assert_sp_frame_equal(res, exp) - - def test_values(self, empty_frame, float_frame): - empty = empty_frame.values - assert empty.shape == (0, 0) - - no_cols = SparseDataFrame(index=np.arange(10)) - mat = no_cols.values - assert mat.shape == (10, 0) - - no_index = SparseDataFrame(columns=np.arange(10)) - mat = no_index.values - assert mat.shape == (0, 10) - - def test_copy(self, float_frame): - cp = float_frame.copy() - assert isinstance(cp, SparseDataFrame) - tm.assert_sp_frame_equal(cp, float_frame) - - # as of v0.15.0 - # this is now identical (but not is_a ) - assert cp.index.identical(float_frame.index) - - def test_constructor(self, float_frame, float_frame_int_kind, float_frame_fill0): - for col, series in float_frame.items(): - assert isinstance(series, SparseSeries) - - assert isinstance(float_frame_int_kind["A"].sp_index, IntIndex) - - # constructed zframe from matrix above - assert float_frame_fill0["A"].fill_value == 0 - # XXX: changed asarray - expected = pd.SparseArray( - [0, 0, 0, 0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0], fill_value=0, kind="block" - ) - tm.assert_sp_array_equal(expected, float_frame_fill0["A"].values) - tm.assert_numpy_array_equal( - np.array([0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 
5.0, 6.0]), - float_frame_fill0["A"].to_dense().values, - ) - - # construct no data - sdf = SparseDataFrame(columns=np.arange(10), index=np.arange(10)) - for col, series in sdf.items(): - assert isinstance(series, SparseSeries) - - # construct from nested dict - data = {c: s.to_dict() for c, s in float_frame.items()} - - sdf = SparseDataFrame(data) - tm.assert_sp_frame_equal(sdf, float_frame) - - # TODO: test data is copied from inputs - - # init dict with different index - idx = float_frame.index[:5] - cons = SparseDataFrame( - float_frame, - index=idx, - columns=float_frame.columns, - default_fill_value=float_frame.default_fill_value, - default_kind=float_frame.default_kind, - copy=True, - ) - reindexed = float_frame.reindex(idx) - - tm.assert_sp_frame_equal(cons, reindexed, exact_indices=False) - - # assert level parameter breaks reindex - with pytest.raises(TypeError): - float_frame.reindex(idx, level=0) - - repr(float_frame) - - def test_constructor_fill_value_not_scalar_raises(self): - d = {"b": [2, 3], "a": [0, 1]} - fill_value = np.array(np.nan) - with pytest.raises(ValueError, match="must be a scalar"): - SparseDataFrame(data=d, default_fill_value=fill_value) - - def test_constructor_dict_order(self): - # GH19018 - # initialization ordering: by insertion order if python>= 3.6, else - # order by value - d = {"b": [2, 3], "a": [0, 1]} - frame = SparseDataFrame(data=d) - if compat.PY36: - expected = SparseDataFrame(data=d, columns=list("ba")) - else: - expected = SparseDataFrame(data=d, columns=list("ab")) - tm.assert_sp_frame_equal(frame, expected) - - def test_constructor_ndarray(self, float_frame): - # no index or columns - sp = SparseDataFrame(float_frame.values) - - # 1d - sp = SparseDataFrame( - float_frame["A"].values, index=float_frame.index, columns=["A"] - ) - tm.assert_sp_frame_equal(sp, float_frame.reindex(columns=["A"])) - - # raise on level argument - msg = "Reindex by level not supported for sparse" - with pytest.raises(TypeError, match=msg): - 
float_frame.reindex(columns=["A"], level=1) - - # wrong length index / columns - with pytest.raises(ValueError, match="^Index length"): - SparseDataFrame(float_frame.values, index=float_frame.index[:-1]) - - with pytest.raises(ValueError, match="^Column length"): - SparseDataFrame(float_frame.values, columns=float_frame.columns[:-1]) - - # GH 9272 - def test_constructor_empty(self): - sp = SparseDataFrame() - assert len(sp.index) == 0 - assert len(sp.columns) == 0 - - def test_constructor_dataframe(self, float_frame): - dense = float_frame.to_dense() - sp = SparseDataFrame(dense) - tm.assert_sp_frame_equal(sp, float_frame) - - def test_constructor_convert_index_once(self): - arr = np.array([1.5, 2.5, 3.5]) - sdf = SparseDataFrame(columns=range(4), index=arr) - assert sdf[0].index is sdf[1].index - - def test_constructor_from_series(self): - - # GH 2873 - x = Series(np.random.randn(10000), name="a") - x = x.to_sparse(fill_value=0) - assert isinstance(x, SparseSeries) - df = SparseDataFrame(x) - assert isinstance(df, SparseDataFrame) - - x = Series(np.random.randn(10000), name="a") - y = Series(np.random.randn(10000), name="b") - x2 = x.astype(float) - x2.loc[:9998] = np.NaN - # TODO: x_sparse is unused...fix - x_sparse = x2.to_sparse(fill_value=np.NaN) # noqa - - # Currently fails too with weird ufunc error - # df1 = SparseDataFrame([x_sparse, y]) - - y.loc[:9998] = 0 - # TODO: y_sparse is unsused...fix - y_sparse = y.to_sparse(fill_value=0) # noqa - # without sparse value raises error - # df2 = SparseDataFrame([x2_sparse, y]) - - def test_constructor_from_dense_series(self): - # GH 19393 - # series with name - x = Series(np.random.randn(10000), name="a") - result = SparseDataFrame(x) - expected = x.to_frame().to_sparse() - tm.assert_sp_frame_equal(result, expected) - - # series with no name - x = Series(np.random.randn(10000)) - result = SparseDataFrame(x) - expected = x.to_frame().to_sparse() - tm.assert_sp_frame_equal(result, expected) - - def 
test_constructor_from_unknown_type(self): - # GH 19393 - class Unknown: - pass - - with pytest.raises( - TypeError, - match=( - "SparseDataFrame called with unknown type " - '"Unknown" for data argument' - ), - ): - SparseDataFrame(Unknown()) - - def test_constructor_preserve_attr(self): - # GH 13866 - arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0) - assert arr.dtype == SparseDtype(np.int64) - assert arr.fill_value == 0 - - df = pd.SparseDataFrame({"x": arr}) - assert df["x"].dtype == SparseDtype(np.int64) - assert df["x"].fill_value == 0 - - s = pd.SparseSeries(arr, name="x") - assert s.dtype == SparseDtype(np.int64) - assert s.fill_value == 0 - - df = pd.SparseDataFrame(s) - assert df["x"].dtype == SparseDtype(np.int64) - assert df["x"].fill_value == 0 - - df = pd.SparseDataFrame({"x": s}) - assert df["x"].dtype == SparseDtype(np.int64) - assert df["x"].fill_value == 0 - - def test_constructor_nan_dataframe(self): - # GH 10079 - trains = np.arange(100) - thresholds = [10, 20, 30, 40, 50, 60] - tuples = [(i, j) for i in trains for j in thresholds] - index = pd.MultiIndex.from_tuples(tuples, names=["trains", "thresholds"]) - matrix = np.empty((len(index), len(trains))) - matrix.fill(np.nan) - df = pd.DataFrame(matrix, index=index, columns=trains, dtype=float) - result = df.to_sparse() - expected = pd.SparseDataFrame(matrix, index=index, columns=trains, dtype=float) - tm.assert_sp_frame_equal(result, expected) - - def test_type_coercion_at_construction(self): - # GH 15682 - result = pd.SparseDataFrame( - {"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]}, - dtype="uint8", - default_fill_value=0, - ) - expected = pd.SparseDataFrame( - { - "a": pd.SparseSeries([1, 0, 0], dtype="uint8"), - "b": pd.SparseSeries([0, 1, 0], dtype="uint8"), - "c": pd.SparseSeries([0, 0, 1], dtype="uint8"), - }, - default_fill_value=0, - ) - tm.assert_sp_frame_equal(result, expected) - - def test_default_dtype(self): - result = pd.SparseDataFrame(columns=list("ab"), 
index=range(2)) - expected = pd.SparseDataFrame( - [[np.nan, np.nan], [np.nan, np.nan]], columns=list("ab"), index=range(2) - ) - tm.assert_sp_frame_equal(result, expected) - - def test_nan_data_with_int_dtype_raises_error(self): - sdf = pd.SparseDataFrame( - [[np.nan, np.nan], [np.nan, np.nan]], columns=list("ab"), index=range(2) - ) - msg = "Cannot convert non-finite values" - with pytest.raises(ValueError, match=msg): - pd.SparseDataFrame(sdf, dtype=np.int64) - - def test_dtypes(self): - df = DataFrame(np.random.randn(10000, 4)) - df.loc[:9998] = np.nan - sdf = df.to_sparse() - result = sdf.dtypes - expected = Series(["Sparse[float64, nan]"] * 4) - tm.assert_series_equal(result, expected) - - def test_shape( - self, float_frame, float_frame_int_kind, float_frame_fill0, float_frame_fill2 - ): - # see gh-10452 - assert float_frame.shape == (10, 4) - assert float_frame_int_kind.shape == (10, 4) - assert float_frame_fill0.shape == (10, 4) - assert float_frame_fill2.shape == (10, 4) - - def test_str(self): - df = DataFrame(np.random.randn(10000, 4)) - df.loc[:9998] = np.nan - - sdf = df.to_sparse() - str(sdf) - - def test_array_interface(self, float_frame): - res = np.sqrt(float_frame) - dres = np.sqrt(float_frame.to_dense()) - tm.assert_frame_equal(res.to_dense(), dres) - - def test_pickle( - self, - float_frame, - float_frame_int_kind, - float_frame_dense, - float_frame_fill0, - float_frame_fill0_dense, - float_frame_fill2, - float_frame_fill2_dense, - ): - def _test_roundtrip(frame, orig): - result = tm.round_trip_pickle(frame) - tm.assert_sp_frame_equal(frame, result) - tm.assert_frame_equal(result.to_dense(), orig, check_dtype=False) - - _test_roundtrip(SparseDataFrame(), DataFrame()) - _test_roundtrip(float_frame, float_frame_dense) - _test_roundtrip(float_frame_int_kind, float_frame_dense) - _test_roundtrip(float_frame_fill0, float_frame_fill0_dense) - _test_roundtrip(float_frame_fill2, float_frame_fill2_dense) - - def test_dense_to_sparse(self): - df = 
DataFrame({"A": [nan, nan, nan, 1, 2], "B": [1, 2, nan, nan, nan]}) - sdf = df.to_sparse() - assert isinstance(sdf, SparseDataFrame) - assert np.isnan(sdf.default_fill_value) - assert isinstance(sdf["A"].sp_index, BlockIndex) - tm.assert_frame_equal(sdf.to_dense(), df) - - sdf = df.to_sparse(kind="integer") - assert isinstance(sdf["A"].sp_index, IntIndex) - - df = DataFrame({"A": [0, 0, 0, 1, 2], "B": [1, 2, 0, 0, 0]}, dtype=float) - sdf = df.to_sparse(fill_value=0) - assert sdf.default_fill_value == 0 - tm.assert_frame_equal(sdf.to_dense(), df) - - def test_deprecated_dense_to_sparse(self): - # GH 26557 - # Deprecated 0.25.0 - - df = pd.DataFrame({"A": [1, np.nan, 3]}) - sparse_df = pd.SparseDataFrame({"A": [1, np.nan, 3]}) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df.to_sparse() - tm.assert_frame_equal(result, sparse_df) - - def test_density(self): - df = SparseSeries([nan, nan, nan, 0, 1, 2, 3, 4, 5, 6]) - assert df.density == 0.7 - - df = SparseDataFrame( - { - "A": [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], - "B": [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], - "C": np.arange(10), - "D": [0, 1, 2, 3, 4, 5, nan, nan, nan, nan], - } - ) - - assert df.density == 0.75 - - def test_sparse_to_dense(self): - pass - - def test_sparse_series_ops(self, float_frame): - self._check_frame_ops(float_frame) - - def test_sparse_series_ops_i(self, float_frame_int_kind): - self._check_frame_ops(float_frame_int_kind) - - def test_sparse_series_ops_z(self, float_frame_fill0): - self._check_frame_ops(float_frame_fill0) - - def test_sparse_series_ops_fill(self, float_frame_fill2): - self._check_frame_ops(float_frame_fill2) - - def _check_frame_ops(self, frame): - def _compare_to_dense(a, b, da, db, op): - sparse_result = op(a, b) - dense_result = op(da, db) - - # catch lambdas but not non-lambdas e.g. 
operator.add - if op in [operator.floordiv, ops.rfloordiv] or isinstance(op, LambdaType): - # GH#27231 Series sets 1//0 to np.inf, which SparseArray - # does not do (yet) - mask = np.isinf(dense_result) & ~np.isinf(sparse_result.to_dense()) - dense_result[mask] = np.nan - - fill = sparse_result.default_fill_value - dense_result = dense_result.to_sparse(fill_value=fill) - tm.assert_sp_frame_equal(sparse_result, dense_result, exact_indices=False) - - if isinstance(a, DataFrame) and isinstance(db, DataFrame): - mixed_result = op(a, db) - assert isinstance(mixed_result, SparseDataFrame) - tm.assert_sp_frame_equal( - mixed_result, sparse_result, exact_indices=False - ) - - opnames = ["add", "sub", "mul", "truediv", "floordiv"] - - fidx = frame.index - - # time series operations - - series = [ - frame["A"], - frame["B"], - frame["C"], - frame["D"], - frame["A"].reindex(fidx[:7]), - frame["A"].reindex(fidx[::2]), - SparseSeries([], index=[]), - ] - - for op in opnames: - _compare_to_dense( - frame, - frame[::2], - frame.to_dense(), - frame[::2].to_dense(), - getattr(operator, op), - ) - - # 2304, no auto-broadcasting - for i, s in enumerate(series): - f = lambda a, b: getattr(a, op)(b, axis="index") - _compare_to_dense(frame, s, frame.to_dense(), s.to_dense(), f) - - # FIXME: dont leave commented-out - # rops are not implemented - # _compare_to_dense(s, frame, s.to_dense(), - # frame.to_dense(), f) - - # cross-sectional operations - series = [ - frame.xs(fidx[0]), - frame.xs(fidx[3]), - frame.xs(fidx[5]), - frame.xs(fidx[7]), - frame.xs(fidx[5])[:2], - ] - - for name in opnames: - op = getattr(operator, name) - for s in series: - _compare_to_dense(frame, s, frame.to_dense(), s, op) - _compare_to_dense(s, frame, s, frame.to_dense(), op) - - # it works! 
- frame + frame.loc[:, ["A", "B"]] - - def test_op_corners(self, float_frame, empty_frame): - empty = empty_frame + empty_frame - assert empty.empty - - foo = float_frame + empty_frame - assert isinstance(foo.index, DatetimeIndex) - tm.assert_frame_equal(foo, float_frame * np.nan) - - foo = empty_frame + float_frame - tm.assert_frame_equal(foo, float_frame * np.nan) - - def test_scalar_ops(self): - pass - - def test_getitem(self): - # 1585 select multiple columns - sdf = SparseDataFrame(index=[0, 1, 2], columns=["a", "b", "c"]) - - result = sdf[["a", "b"]] - exp = sdf.reindex(columns=["a", "b"]) - tm.assert_sp_frame_equal(result, exp) - - with pytest.raises(KeyError, match=r"\['d'\] not in index"): - sdf[["a", "d"]] - - def test_iloc(self, float_frame): - - # GH 2227 - result = float_frame.iloc[:, 0] - assert isinstance(result, SparseSeries) - tm.assert_sp_series_equal(result, float_frame["A"]) - - # preserve sparse index type. #2251 - data = {"A": [0, 1]} - iframe = SparseDataFrame(data, default_kind="integer") - tm.assert_class_equal(iframe["A"].sp_index, iframe.iloc[:, 0].sp_index) - - def test_set_value(self, float_frame): - - # ok, as the index gets converted to object - frame = float_frame.copy() - res = frame._set_value("foobar", "B", 1.5) - assert res.index.dtype == "object" - - res = float_frame - res.index = res.index.astype(object) - - res = float_frame._set_value("foobar", "B", 1.5) - assert res is not float_frame - assert res.index[-1] == "foobar" - assert res._get_value("foobar", "B") == 1.5 - - res2 = res._set_value("foobar", "qux", 1.5) - assert res2 is not res - tm.assert_index_equal( - res2.columns, pd.Index(list(float_frame.columns) + ["qux"]) - ) - assert res2._get_value("foobar", "qux") == 1.5 - - def test_fancy_index_misc(self, float_frame): - # axis = 0 - sliced = float_frame.iloc[-2:, :] - expected = float_frame.reindex(index=float_frame.index[-2:]) - tm.assert_sp_frame_equal(sliced, expected) - - # axis = 1 - sliced = float_frame.iloc[:, 
-2:] - expected = float_frame.reindex(columns=float_frame.columns[-2:]) - tm.assert_sp_frame_equal(sliced, expected) - - def test_getitem_overload(self, float_frame): - # slicing - sl = float_frame[:20] - tm.assert_sp_frame_equal(sl, float_frame.reindex(float_frame.index[:20])) - - # boolean indexing - d = float_frame.index[5] - indexer = float_frame.index > d - - subindex = float_frame.index[indexer] - subframe = float_frame[indexer] - - tm.assert_index_equal(subindex, subframe.index) - msg = "Item wrong length 9 instead of 10" - with pytest.raises(ValueError, match=msg): - float_frame[indexer[:-1]] - - def test_setitem( - self, - float_frame, - float_frame_int_kind, - float_frame_dense, - float_frame_fill0, - float_frame_fill0_dense, - float_frame_fill2, - float_frame_fill2_dense, - ): - def _check_frame(frame, orig): - N = len(frame) - - # insert SparseSeries - frame["E"] = frame["A"] - assert isinstance(frame["E"], SparseSeries) - tm.assert_sp_series_equal(frame["E"], frame["A"], check_names=False) - - # insert SparseSeries differently-indexed - to_insert = frame["A"][::2] - frame["E"] = to_insert - expected = to_insert.to_dense().reindex(frame.index) - result = frame["E"].to_dense() - tm.assert_series_equal(result, expected, check_names=False) - assert result.name == "E" - - # insert Series - frame["F"] = frame["A"].to_dense() - assert isinstance(frame["F"], SparseSeries) - tm.assert_sp_series_equal(frame["F"], frame["A"], check_names=False) - - # insert Series differently-indexed - to_insert = frame["A"].to_dense()[::2] - frame["G"] = to_insert - expected = to_insert.reindex(frame.index) - expected.name = "G" - tm.assert_series_equal(frame["G"].to_dense(), expected) - - # insert ndarray - frame["H"] = np.random.randn(N) - assert isinstance(frame["H"], SparseSeries) - - to_sparsify = np.random.randn(N) - to_sparsify[N // 2 :] = frame.default_fill_value - frame["I"] = to_sparsify - assert len(frame["I"].sp_values) == N // 2 - - # insert ndarray wrong size - # 
GH 25484 - msg = "Length of values does not match length of index" - with pytest.raises(ValueError, match=msg): - frame["foo"] = np.random.randn(N - 1) - - # scalar value - frame["J"] = 5 - assert len(frame["J"].sp_values) == N - assert (frame["J"].sp_values == 5).all() - - frame["K"] = frame.default_fill_value - assert len(frame["K"].sp_values) == 0 - - _check_frame(float_frame, float_frame_dense) - _check_frame(float_frame_int_kind, float_frame_dense) - _check_frame(float_frame_fill0, float_frame_fill0_dense) - _check_frame(float_frame_fill2, float_frame_fill2_dense) - - @pytest.mark.parametrize( - "values", - [ - [True, False], - [0, 1], - [1, None], - ["a", "b"], - [pd.Timestamp("2017"), pd.NaT], - [pd.Timedelta("10s"), pd.NaT], - ], - ) - def test_setitem_more(self, values): - df = pd.DataFrame({"A": values}) - df["A"] = pd.SparseArray(values) - expected = pd.DataFrame({"A": pd.SparseArray(values)}) - tm.assert_frame_equal(df, expected) - - def test_setitem_corner(self, float_frame): - float_frame["a"] = float_frame["B"] - tm.assert_sp_series_equal(float_frame["a"], float_frame["B"], check_names=False) - - def test_setitem_array(self, float_frame): - arr = float_frame["B"] - - float_frame["E"] = arr - tm.assert_sp_series_equal(float_frame["E"], float_frame["B"], check_names=False) - - float_frame["F"] = arr[:-1] - index = float_frame.index[:-1] - tm.assert_sp_series_equal( - float_frame["E"].reindex(index), - float_frame["F"].reindex(index), - check_names=False, - ) - - def test_setitem_chained_no_consolidate(self): - # https://github.com/pandas-dev/pandas/pull/19268 - # issuecomment-361696418 - # chained setitem used to cause consolidation - sdf = pd.SparseDataFrame([[np.nan, 1], [2, np.nan]]) - with pd.option_context("mode.chained_assignment", None): - sdf[0][1] = 2 - assert len(sdf._data.blocks) == 2 - - def test_delitem(self, float_frame): - A = float_frame["A"] - C = float_frame["C"] - - del float_frame["B"] - assert "B" not in float_frame - 
tm.assert_sp_series_equal(float_frame["A"], A) - tm.assert_sp_series_equal(float_frame["C"], C) - - del float_frame["D"] - assert "D" not in float_frame - - del float_frame["A"] - assert "A" not in float_frame - - def test_set_columns(self, float_frame): - float_frame.columns = float_frame.columns - msg = ( - "Length mismatch: Expected axis has 4 elements, new values have" - " 3 elements" - ) - with pytest.raises(ValueError, match=msg): - float_frame.columns = float_frame.columns[:-1] - - def test_set_index(self, float_frame): - float_frame.index = float_frame.index - msg = ( - "Length mismatch: Expected axis has 10 elements, new values" - " have 9 elements" - ) - with pytest.raises(ValueError, match=msg): - float_frame.index = float_frame.index[:-1] - - def test_ctor_reindex(self): - idx = pd.Index([0, 1, 2, 3]) - msg = "Length of passed values is 2, index implies 4" - with pytest.raises(ValueError, match=msg): - pd.SparseDataFrame({"A": [1, 2]}, index=idx) - - def test_append(self, float_frame): - a = float_frame[:5] - b = float_frame[5:] - - appended = a.append(b) - tm.assert_sp_frame_equal(appended, float_frame, exact_indices=False) - - a = float_frame.iloc[:5, :3] - b = float_frame.iloc[5:] - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False - ): - # Stacklevel is set for pd.concat, not append - appended = a.append(b) - tm.assert_sp_frame_equal( - appended.iloc[:, :3], float_frame.iloc[:, :3], exact_indices=False - ) - - a = a[["B", "C", "A"]].head(2) - b = b.head(2) - - expected = pd.SparseDataFrame( - { - "B": [0.0, 1, None, 3], - "C": [0.0, 1, 5, 6], - "A": [None, None, 2, 3], - "D": [None, None, 5, None], - }, - index=a.index | b.index, - columns=["B", "C", "A", "D"], - ) - with tm.assert_produces_warning(None, raise_on_extra_warnings=False): - appended = a.append(b, sort=False) - - tm.assert_frame_equal(appended, expected) - - with tm.assert_produces_warning(None, raise_on_extra_warnings=False): - 
appended = a.append(b, sort=True) - - tm.assert_sp_frame_equal( - appended, - expected[["A", "B", "C", "D"]], - consolidate_block_indices=True, - check_kind=False, - ) - - def test_astype(self): - sparse = pd.SparseDataFrame( - { - "A": SparseArray([1, 2, 3, 4], dtype=np.int64), - "B": SparseArray([4, 5, 6, 7], dtype=np.int64), - } - ) - assert sparse["A"].dtype == SparseDtype(np.int64) - assert sparse["B"].dtype == SparseDtype(np.int64) - - # retain fill_value - res = sparse.astype(np.float64) - exp = pd.SparseDataFrame( - { - "A": SparseArray([1.0, 2.0, 3.0, 4.0], fill_value=0, kind="integer"), - "B": SparseArray([4.0, 5.0, 6.0, 7.0], fill_value=0, kind="integer"), - }, - default_fill_value=np.nan, - ) - tm.assert_sp_frame_equal(res, exp) - assert res["A"].dtype == SparseDtype(np.float64, 0) - assert res["B"].dtype == SparseDtype(np.float64, 0) - - # update fill_value - res = sparse.astype(SparseDtype(np.float64, np.nan)) - exp = pd.SparseDataFrame( - { - "A": SparseArray( - [1.0, 2.0, 3.0, 4.0], fill_value=np.nan, kind="integer" - ), - "B": SparseArray( - [4.0, 5.0, 6.0, 7.0], fill_value=np.nan, kind="integer" - ), - }, - default_fill_value=np.nan, - ) - tm.assert_sp_frame_equal(res, exp) - assert res["A"].dtype == SparseDtype(np.float64, np.nan) - assert res["B"].dtype == SparseDtype(np.float64, np.nan) - - def test_astype_bool(self): - sparse = pd.SparseDataFrame( - { - "A": SparseArray([0, 2, 0, 4], fill_value=0, dtype=np.int64), - "B": SparseArray([0, 5, 0, 7], fill_value=0, dtype=np.int64), - }, - default_fill_value=0, - ) - assert sparse["A"].dtype == SparseDtype(np.int64) - assert sparse["B"].dtype == SparseDtype(np.int64) - - res = sparse.astype(SparseDtype(bool, False)) - exp = pd.SparseDataFrame( - { - "A": SparseArray( - [False, True, False, True], - dtype=np.bool, - fill_value=False, - kind="integer", - ), - "B": SparseArray( - [False, True, False, True], - dtype=np.bool, - fill_value=False, - kind="integer", - ), - }, - default_fill_value=False, - ) 
- tm.assert_sp_frame_equal(res, exp) - assert res["A"].dtype == SparseDtype(np.bool) - assert res["B"].dtype == SparseDtype(np.bool) - - def test_astype_object(self): - # This may change in GH-23125 - df = pd.DataFrame({"A": SparseArray([0, 1]), "B": SparseArray([0, 1])}) - result = df.astype(object) - dtype = SparseDtype(object, 0) - expected = pd.DataFrame( - { - "A": SparseArray([0, 1], dtype=dtype), - "B": SparseArray([0, 1], dtype=dtype), - } - ) - tm.assert_frame_equal(result, expected) - - def test_fillna(self, float_frame_fill0, float_frame_fill0_dense): - df = float_frame_fill0.reindex(list(range(5))) - dense = float_frame_fill0_dense.reindex(list(range(5))) - - result = df.fillna(0) - expected = dense.fillna(0) - tm.assert_sp_frame_equal( - result, expected.to_sparse(fill_value=0), exact_indices=False - ) - tm.assert_frame_equal(result.to_dense(), expected) - - result = df.copy() - result.fillna(0, inplace=True) - expected = dense.fillna(0) - - tm.assert_sp_frame_equal( - result, expected.to_sparse(fill_value=0), exact_indices=False - ) - tm.assert_frame_equal(result.to_dense(), expected) - - result = df.copy() - result = df["A"] - result.fillna(0, inplace=True) - - expected = dense["A"].fillna(0) - # this changes internal SparseArray repr - # tm.assert_sp_series_equal(result, expected.to_sparse(fill_value=0)) - tm.assert_series_equal(result.to_dense(), expected) - - def test_fillna_fill_value(self): - df = pd.DataFrame({"A": [1, 0, 0], "B": [np.nan, np.nan, 4]}) - - sparse = pd.SparseDataFrame(df) - tm.assert_frame_equal( - sparse.fillna(-1).to_dense(), df.fillna(-1), check_dtype=False - ) - - sparse = pd.SparseDataFrame(df, default_fill_value=0) - tm.assert_frame_equal( - sparse.fillna(-1).to_dense(), df.fillna(-1), check_dtype=False - ) - - def test_sparse_frame_pad_backfill_limit(self): - index = np.arange(10) - df = DataFrame(np.random.randn(10, 4), index=index) - sdf = df.to_sparse() - - result = sdf[:2].reindex(index, method="pad", limit=5) - - 
with tm.assert_produces_warning( - PerformanceWarning, raise_on_extra_warnings=False - ): - expected = sdf[:2].reindex(index).fillna(method="pad") - expected = expected.to_dense() - expected.values[-3:] = np.nan - expected = expected.to_sparse() - tm.assert_frame_equal(result, expected) - - result = sdf[-2:].reindex(index, method="backfill", limit=5) - - with tm.assert_produces_warning( - PerformanceWarning, raise_on_extra_warnings=False - ): - expected = sdf[-2:].reindex(index).fillna(method="backfill") - expected = expected.to_dense() - expected.values[:3] = np.nan - expected = expected.to_sparse() - tm.assert_frame_equal(result, expected) - - def test_sparse_frame_fillna_limit(self): - index = np.arange(10) - df = DataFrame(np.random.randn(10, 4), index=index) - sdf = df.to_sparse() - - result = sdf[:2].reindex(index) - with tm.assert_produces_warning( - PerformanceWarning, raise_on_extra_warnings=False - ): - result = result.fillna(method="pad", limit=5) - - with tm.assert_produces_warning( - PerformanceWarning, raise_on_extra_warnings=False - ): - expected = sdf[:2].reindex(index).fillna(method="pad") - expected = expected.to_dense() - expected.values[-3:] = np.nan - expected = expected.to_sparse() - tm.assert_frame_equal(result, expected) - - result = sdf[-2:].reindex(index) - with tm.assert_produces_warning( - PerformanceWarning, raise_on_extra_warnings=False - ): - result = result.fillna(method="backfill", limit=5) - - with tm.assert_produces_warning( - PerformanceWarning, raise_on_extra_warnings=False - ): - expected = sdf[-2:].reindex(index).fillna(method="backfill") - expected = expected.to_dense() - expected.values[:3] = np.nan - expected = expected.to_sparse() - tm.assert_frame_equal(result, expected) - - def test_rename(self, float_frame): - result = float_frame.rename(index=str) - expected = SparseDataFrame( - float_frame.values, - index=float_frame.index.strftime("%Y-%m-%d %H:%M:%S"), - columns=list("ABCD"), - ) - tm.assert_sp_frame_equal(result, 
expected) - - result = float_frame.rename(columns="{}1".format) - data = { - "A1": [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], - "B1": [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], - "C1": np.arange(10, dtype=np.float64), - "D1": [0, 1, 2, 3, 4, 5, nan, nan, nan, nan], - } - expected = SparseDataFrame(data, index=float_frame.index) - tm.assert_sp_frame_equal(result, expected) - - def test_corr(self, float_frame): - res = float_frame.corr() - # XXX: this stays sparse - tm.assert_frame_equal(res, float_frame.to_dense().corr().to_sparse()) - - def test_describe(self, float_frame): - float_frame["foo"] = np.nan - float_frame.dtypes.value_counts() - str(float_frame) - desc = float_frame.describe() # noqa - - def test_join(self, float_frame): - left = float_frame.loc[:, ["A", "B"]] - right = float_frame.loc[:, ["C", "D"]] - joined = left.join(right) - tm.assert_sp_frame_equal(joined, float_frame, exact_indices=False) - - right = float_frame.loc[:, ["B", "D"]] - msg = ( - r"columns overlap but no suffix specified: Index\(\['B'\]," - r" dtype='object'\)" - ) - with pytest.raises(ValueError, match=msg): - left.join(right) - - with pytest.raises(ValueError, match="Other Series must have a name"): - float_frame.join( - Series(np.random.randn(len(float_frame)), index=float_frame.index) - ) - - def test_reindex( - self, float_frame, float_frame_int_kind, float_frame_fill0, float_frame_fill2 - ): - def _check_frame(frame): - index = frame.index - sidx = index[::2] - sidx2 = index[:5] # noqa - - sparse_result = frame.reindex(sidx) - dense_result = frame.to_dense().reindex(sidx) - tm.assert_frame_equal(sparse_result.to_dense(), dense_result) - - tm.assert_frame_equal(frame.reindex(list(sidx)).to_dense(), dense_result) - - sparse_result2 = sparse_result.reindex(index) - dense_result2 = dense_result.reindex(index) - tm.assert_frame_equal(sparse_result2.to_dense(), dense_result2) - - # propagate CORRECT fill value - tm.assert_almost_equal( - sparse_result.default_fill_value, 
frame.default_fill_value - ) - tm.assert_almost_equal(sparse_result["A"].fill_value, frame["A"].fill_value) - - # length zero - length_zero = frame.reindex([]) - assert len(length_zero) == 0 - assert len(length_zero.columns) == len(frame.columns) - assert len(length_zero["A"]) == 0 - - # frame being reindexed has length zero - length_n = length_zero.reindex(index) - assert len(length_n) == len(frame) - assert len(length_n.columns) == len(frame.columns) - assert len(length_n["A"]) == len(frame) - - # reindex columns - reindexed = frame.reindex(columns=["A", "B", "Z"]) - assert len(reindexed.columns) == 3 - tm.assert_almost_equal(reindexed["Z"].fill_value, frame.default_fill_value) - assert np.isnan(reindexed["Z"].sp_values).all() - - _check_frame(float_frame) - _check_frame(float_frame_int_kind) - _check_frame(float_frame_fill0) - _check_frame(float_frame_fill2) - - # with copy=False - reindexed = float_frame.reindex(float_frame.index, copy=False) - reindexed["F"] = reindexed["A"] - assert "F" in float_frame - - reindexed = float_frame.reindex(float_frame.index) - reindexed["G"] = reindexed["A"] - assert "G" not in float_frame - - def test_reindex_fill_value(self, float_frame_fill0, float_frame_fill0_dense): - rng = bdate_range("20110110", periods=20) - - result = float_frame_fill0.reindex(rng, fill_value=0) - exp = float_frame_fill0_dense.reindex(rng, fill_value=0) - exp = exp.to_sparse(float_frame_fill0.default_fill_value) - tm.assert_sp_frame_equal(result, exp) - - def test_reindex_method(self): - - sparse = SparseDataFrame( - data=[[11.0, 12.0, 14.0], [21.0, 22.0, 24.0], [41.0, 42.0, 44.0]], - index=[1, 2, 4], - columns=[1, 2, 4], - dtype=float, - ) - - # Over indices - - # default method - result = sparse.reindex(index=range(6)) - expected = SparseDataFrame( - data=[ - [nan, nan, nan], - [11.0, 12.0, 14.0], - [21.0, 22.0, 24.0], - [nan, nan, nan], - [41.0, 42.0, 44.0], - [nan, nan, nan], - ], - index=range(6), - columns=[1, 2, 4], - dtype=float, - ) - 
tm.assert_sp_frame_equal(result, expected) - - # method='bfill' - result = sparse.reindex(index=range(6), method="bfill") - expected = SparseDataFrame( - data=[ - [11.0, 12.0, 14.0], - [11.0, 12.0, 14.0], - [21.0, 22.0, 24.0], - [41.0, 42.0, 44.0], - [41.0, 42.0, 44.0], - [nan, nan, nan], - ], - index=range(6), - columns=[1, 2, 4], - dtype=float, - ) - tm.assert_sp_frame_equal(result, expected) - - # method='ffill' - result = sparse.reindex(index=range(6), method="ffill") - expected = SparseDataFrame( - data=[ - [nan, nan, nan], - [11.0, 12.0, 14.0], - [21.0, 22.0, 24.0], - [21.0, 22.0, 24.0], - [41.0, 42.0, 44.0], - [41.0, 42.0, 44.0], - ], - index=range(6), - columns=[1, 2, 4], - dtype=float, - ) - tm.assert_sp_frame_equal(result, expected) - - # Over columns - - # default method - result = sparse.reindex(columns=range(6)) - expected = SparseDataFrame( - data=[ - [nan, 11.0, 12.0, nan, 14.0, nan], - [nan, 21.0, 22.0, nan, 24.0, nan], - [nan, 41.0, 42.0, nan, 44.0, nan], - ], - index=[1, 2, 4], - columns=range(6), - dtype=float, - ) - tm.assert_sp_frame_equal(result, expected) - - # method='bfill' - with pytest.raises(NotImplementedError): - sparse.reindex(columns=range(6), method="bfill") - - # method='ffill' - with pytest.raises(NotImplementedError): - sparse.reindex(columns=range(6), method="ffill") - - def test_take(self, float_frame): - result = float_frame.take([1, 0, 2], axis=1) - expected = float_frame.reindex(columns=["B", "A", "C"]) - tm.assert_sp_frame_equal(result, expected) - - def test_to_dense( - self, - float_frame, - float_frame_int_kind, - float_frame_dense, - float_frame_fill0, - float_frame_fill0_dense, - float_frame_fill2, - float_frame_fill2_dense, - ): - def _check(frame, orig): - dense_dm = frame.to_dense() - # Sparse[float] != float - tm.assert_frame_equal(frame, dense_dm, check_dtype=False) - tm.assert_frame_equal(dense_dm, orig, check_dtype=False) - - _check(float_frame, float_frame_dense) - _check(float_frame_int_kind, 
float_frame_dense) - _check(float_frame_fill0, float_frame_fill0_dense) - _check(float_frame_fill2, float_frame_fill2_dense) - - def test_stack_sparse_frame( - self, float_frame, float_frame_int_kind, float_frame_fill0, float_frame_fill2 - ): - def _check(frame): - dense_frame = frame.to_dense() # noqa - - from_dense_lp = frame.stack().to_frame() - - from_sparse_lp = spf.stack_sparse_frame(frame) - - tm.assert_numpy_array_equal(from_dense_lp.values, from_sparse_lp.values) - - _check(float_frame) - _check(float_frame_int_kind) - - # for now - msg = "This routine assumes NaN fill value" - with pytest.raises(TypeError, match=msg): - _check(float_frame_fill0) - with pytest.raises(TypeError, match=msg): - _check(float_frame_fill2) - - def test_transpose( - self, - float_frame, - float_frame_int_kind, - float_frame_dense, - float_frame_fill0, - float_frame_fill0_dense, - float_frame_fill2, - float_frame_fill2_dense, - ): - def _check(frame, orig): - transposed = frame.T - untransposed = transposed.T - tm.assert_sp_frame_equal(frame, untransposed) - - tm.assert_frame_equal(frame.T.to_dense(), orig.T) - tm.assert_frame_equal(frame.T.T.to_dense(), orig.T.T) - tm.assert_sp_frame_equal(frame, frame.T.T, exact_indices=False) - - _check(float_frame, float_frame_dense) - _check(float_frame_int_kind, float_frame_dense) - _check(float_frame_fill0, float_frame_fill0_dense) - _check(float_frame_fill2, float_frame_fill2_dense) - - def test_shift( - self, - float_frame, - float_frame_int_kind, - float_frame_dense, - float_frame_fill0, - float_frame_fill0_dense, - float_frame_fill2, - float_frame_fill2_dense, - ): - def _check(frame, orig): - shifted = frame.shift(0) - exp = orig.shift(0) - tm.assert_frame_equal(shifted.to_dense(), exp) - - shifted = frame.shift(1) - exp = orig.shift(1) - tm.assert_frame_equal(shifted.to_dense(), exp) - - shifted = frame.shift(-2) - exp = orig.shift(-2) - tm.assert_frame_equal(shifted.to_dense(), exp) - - shifted = frame.shift(2, freq="B") - exp = 
orig.shift(2, freq="B") - exp = exp.to_sparse(frame.default_fill_value, kind=frame.default_kind) - tm.assert_frame_equal(shifted, exp) - - shifted = frame.shift(2, freq=BDay()) - exp = orig.shift(2, freq=BDay()) - exp = exp.to_sparse(frame.default_fill_value, kind=frame.default_kind) - tm.assert_frame_equal(shifted, exp) - - _check(float_frame, float_frame_dense) - _check(float_frame_int_kind, float_frame_dense) - _check(float_frame_fill0, float_frame_fill0_dense) - _check(float_frame_fill2, float_frame_fill2_dense) - - def test_count(self, float_frame): - dense_result = float_frame.to_dense().count() - - result = float_frame.count() - tm.assert_series_equal(result.to_dense(), dense_result) - - result = float_frame.count(axis=None) - tm.assert_series_equal(result.to_dense(), dense_result) - - result = float_frame.count(axis=0) - tm.assert_series_equal(result.to_dense(), dense_result) - - result = float_frame.count(axis=1) - dense_result = float_frame.to_dense().count(axis=1) - - # win32 don't check dtype - tm.assert_series_equal(result, dense_result, check_dtype=False) - - def test_numpy_transpose(self): - sdf = SparseDataFrame([1, 2, 3], index=[1, 2, 3], columns=["a"]) - result = np.transpose(np.transpose(sdf)) - tm.assert_sp_frame_equal(result, sdf) - - msg = "the 'axes' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.transpose(sdf, axes=1) - - def test_combine_first(self, float_frame): - df = float_frame - - result = df[::2].combine_first(df) - - expected = df[::2].to_dense().combine_first(df.to_dense()) - expected = expected.to_sparse(fill_value=df.default_fill_value) - - tm.assert_sp_frame_equal(result, expected) - - @pytest.mark.xfail(reason="No longer supported.") - def test_combine_first_with_dense(self): - # We could support this if we allow - # pd.core.dtypes.cast.find_common_type to special case SparseDtype - # but I don't think that's worth it. 
- df = self.frame - - result = df[::2].combine_first(df.to_dense()) - expected = df[::2].to_dense().combine_first(df.to_dense()) - expected = expected.to_sparse(fill_value=df.default_fill_value) - - tm.assert_sp_frame_equal(result, expected) - - def test_combine_add(self, float_frame): - df = float_frame.to_dense() - df2 = df.copy() - df2["C"][:3] = np.nan - df["A"][:3] = 5.7 - - result = df.to_sparse().add(df2.to_sparse(), fill_value=0) - expected = df.add(df2, fill_value=0).to_sparse() - tm.assert_sp_frame_equal(result, expected) - - def test_isin(self): - sparse_df = DataFrame({"flag": [1.0, 0.0, 1.0]}).to_sparse(fill_value=0.0) - xp = sparse_df[sparse_df.flag == 1.0] - rs = sparse_df[sparse_df.flag.isin([1.0])] - tm.assert_frame_equal(xp, rs) - - def test_sparse_pow_issue(self): - # 2220 - df = SparseDataFrame({"A": [1.1, 3.3], "B": [2.5, -3.9]}) - - # note : no error without nan - df = SparseDataFrame({"A": [nan, 0, 1]}) - - # note that 2 ** df works fine, also df ** 1 - result = 1 ** df - - r1 = result.take([0], 1)["A"] - r2 = result["A"] - - assert len(r2.sp_values) == len(r1.sp_values) - - def test_as_blocks(self): - df = SparseDataFrame({"A": [1.1, 3.3], "B": [nan, -3.9]}, dtype="float64") - - # deprecated 0.21.0 - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df_blocks = df.blocks - assert list(df_blocks.keys()) == ["Sparse[float64, nan]"] - tm.assert_frame_equal(df_blocks["Sparse[float64, nan]"], df) - - @pytest.mark.xfail(reason="nan column names in _init_dict problematic (GH#16894)") - def test_nan_columnname(self): - # GH 8822 - nan_colname = DataFrame(Series(1.0, index=[0]), columns=[nan]) - nan_colname_sparse = nan_colname.to_sparse() - assert np.isnan(nan_colname_sparse.columns[0]) - - def test_isna(self): - # GH 8276 - df = pd.SparseDataFrame( - {"A": [np.nan, np.nan, 1, 2, np.nan], "B": [0, np.nan, np.nan, 2, np.nan]} - ) - - res = df.isna() - exp = pd.SparseDataFrame( - { - "A": [True, True, False, False, True], - 
"B": [False, True, True, False, True], - }, - default_fill_value=True, - ) - exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(res, exp) - - # if fill_value is not nan, True can be included in sp_values - df = pd.SparseDataFrame( - {"A": [0, 0, 1, 2, np.nan], "B": [0, np.nan, 0, 2, np.nan]}, - default_fill_value=0.0, - ) - res = df.isna() - assert isinstance(res, pd.SparseDataFrame) - exp = pd.DataFrame( - { - "A": [False, False, False, False, True], - "B": [False, True, False, False, True], - } - ) - tm.assert_frame_equal(res.to_dense(), exp) - - def test_notna(self): - # GH 8276 - df = pd.SparseDataFrame( - {"A": [np.nan, np.nan, 1, 2, np.nan], "B": [0, np.nan, np.nan, 2, np.nan]} - ) - - res = df.notna() - exp = pd.SparseDataFrame( - { - "A": [False, False, True, True, False], - "B": [True, False, False, True, False], - }, - default_fill_value=False, - ) - exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(res, exp) - - # if fill_value is not nan, True can be included in sp_values - df = pd.SparseDataFrame( - {"A": [0, 0, 1, 2, np.nan], "B": [0, np.nan, 0, 2, np.nan]}, - default_fill_value=0.0, - ) - res = df.notna() - assert isinstance(res, pd.SparseDataFrame) - exp = pd.DataFrame( - { - "A": [True, True, True, True, False], - "B": [True, False, True, True, False], - } - ) - tm.assert_frame_equal(res.to_dense(), exp) - - def test_default_fill_value_with_no_data(self): - # GH 16807 - expected = pd.SparseDataFrame( - [[1.0, 1.0], [1.0, 1.0]], columns=list("ab"), index=range(2) - ) - result = pd.SparseDataFrame( - columns=list("ab"), index=range(2), default_fill_value=1.0 - ) - tm.assert_frame_equal(expected, result) - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") -class TestSparseDataFrameArithmetic: - def test_numeric_op_scalar(self): - df = pd.DataFrame( - { - "A": [nan, nan, 0, 1], - "B": [0, 1, 2, nan], - "C": [1.0, 2.0, 3.0, 4.0], - "D": [nan, nan, 
nan, nan], - } - ) - sparse = df.to_sparse() - - tm.assert_sp_frame_equal(sparse + 1, (df + 1).to_sparse()) - - def test_comparison_op_scalar(self): - # GH 13001 - df = pd.DataFrame( - { - "A": [nan, nan, 0, 1], - "B": [0, 1, 2, nan], - "C": [1.0, 2.0, 3.0, 4.0], - "D": [nan, nan, nan, nan], - } - ) - sparse = df.to_sparse() - - # comparison changes internal repr, compare with dense - res = sparse > 1 - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res.to_dense(), df > 1) - - res = sparse != 0 - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res.to_dense(), df != 0) - - def test_add_series_retains_dtype(self): - # SparseDataFrame._combine_match_columns used to incorrectly cast - # to float - d = {0: [2j, 3j], 1: [0, 1]} - sdf = SparseDataFrame(data=d, default_fill_value=1) - result = sdf + sdf[0] - - df = sdf.to_dense() - expected = df + df[0] - tm.assert_frame_equal(result.to_dense(), expected) - - # Make it explicit to be on the safe side - edata = {0: [4j, 5j], 1: [3j, 1 + 3j]} - expected = DataFrame(edata) - tm.assert_frame_equal(result.to_dense(), expected) - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") -class TestSparseDataFrameAnalytics: - def test_cumsum(self, float_frame): - expected = SparseDataFrame(float_frame.to_dense().cumsum()) - - result = float_frame.cumsum() - tm.assert_sp_frame_equal(result, expected) - - result = float_frame.cumsum(axis=None) - tm.assert_sp_frame_equal(result, expected) - - result = float_frame.cumsum(axis=0) - tm.assert_sp_frame_equal(result, expected) - - def test_numpy_cumsum(self, float_frame): - result = np.cumsum(float_frame) - expected = SparseDataFrame(float_frame.to_dense().cumsum()) - tm.assert_sp_frame_equal(result, expected) - - msg = "the 'dtype' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.cumsum(float_frame, dtype=np.int64) - - msg = "the 'out' 
parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.cumsum(float_frame, out=result) - - def test_numpy_func_call(self, float_frame): - # no exception should be raised even though - # numpy passes in 'axis=None' or `axis=-1' - funcs = ["sum", "cumsum", "var", "mean", "prod", "cumprod", "std", "min", "max"] - for func in funcs: - getattr(np, func)(float_frame) - - @pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH 17386)") - def test_quantile(self): - # GH 17386 - data = [[1, 1], [2, 10], [3, 100], [nan, nan]] - q = 0.1 - - sparse_df = SparseDataFrame(data) - result = sparse_df.quantile(q) - - dense_df = DataFrame(data) - dense_expected = dense_df.quantile(q) - sparse_expected = SparseSeries(dense_expected) - - tm.assert_series_equal(result, dense_expected) - tm.assert_sp_series_equal(result, sparse_expected) - - @pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH 17386)") - def test_quantile_multi(self): - # GH 17386 - data = [[1, 1], [2, 10], [3, 100], [nan, nan]] - q = [0.1, 0.5] - - sparse_df = SparseDataFrame(data) - result = sparse_df.quantile(q) - - dense_df = DataFrame(data) - dense_expected = dense_df.quantile(q) - sparse_expected = SparseDataFrame(dense_expected) - - tm.assert_frame_equal(result, dense_expected) - tm.assert_sp_frame_equal(result, sparse_expected) - - def test_assign_with_sparse_frame(self): - # GH 19163 - df = pd.DataFrame({"a": [1, 2, 3]}) - res = df.to_sparse(fill_value=False).assign(newcol=False) - exp = df.assign(newcol=False).to_sparse(fill_value=False) - - tm.assert_sp_frame_equal(res, exp) - - for column in res.columns: - assert type(res[column]) is SparseSeries - - @pytest.mark.parametrize("inplace", [True, False]) - @pytest.mark.parametrize("how", ["all", "any"]) - def test_dropna(self, inplace, how): - # Tests regression #21172. 
- expected = pd.SparseDataFrame({"F2": [0, 1]}) - input_df = pd.SparseDataFrame( - {"F1": [float("nan"), float("nan")], "F2": [0, 1]} - ) - result_df = input_df.dropna(axis=1, inplace=inplace, how=how) - if inplace: - result_df = input_df - tm.assert_sp_frame_equal(expected, result_df) diff --git a/pandas/tests/sparse/frame/test_indexing.py b/pandas/tests/sparse/frame/test_indexing.py deleted file mode 100644 index c93e9d1e0e8d1..0000000000000 --- a/pandas/tests/sparse/frame/test_indexing.py +++ /dev/null @@ -1,103 +0,0 @@ -import numpy as np -import pytest - -from pandas import DataFrame, SparseDataFrame -from pandas.util import testing as tm - -pytestmark = pytest.mark.skip("Wrong SparseBlock initialization (GH 17386)") - - -@pytest.mark.parametrize( - "data", - [ - [[1, 1], [2, 2], [3, 3], [4, 4], [0, 0]], - [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0], [4.0, 4.0], [np.nan, np.nan]], - [ - [1.0, 1.0 + 1.0j], - [2.0 + 2.0j, 2.0], - [3.0, 3.0 + 3.0j], - [4.0 + 4.0j, 4.0], - [np.nan, np.nan], - ], - ], -) -@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)") -def test_where_with_numeric_data(data): - # GH 17386 - lower_bound = 1.5 - - sparse = SparseDataFrame(data) - result = sparse.where(sparse > lower_bound) - - dense = DataFrame(data) - dense_expected = dense.where(dense > lower_bound) - sparse_expected = SparseDataFrame(dense_expected) - - tm.assert_frame_equal(result, dense_expected) - tm.assert_sp_frame_equal(result, sparse_expected) - - -@pytest.mark.parametrize( - "data", - [ - [[1, 1], [2, 2], [3, 3], [4, 4], [0, 0]], - [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0], [4.0, 4.0], [np.nan, np.nan]], - [ - [1.0, 1.0 + 1.0j], - [2.0 + 2.0j, 2.0], - [3.0, 3.0 + 3.0j], - [4.0 + 4.0j, 4.0], - [np.nan, np.nan], - ], - ], -) -@pytest.mark.parametrize("other", [True, -100, 0.1, 100.0 + 100.0j]) -@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)") -def test_where_with_numeric_data_and_other(data, other): - # GH 17386 - lower_bound = 1.5 - - 
sparse = SparseDataFrame(data) - result = sparse.where(sparse > lower_bound, other) - - dense = DataFrame(data) - dense_expected = dense.where(dense > lower_bound, other) - sparse_expected = SparseDataFrame(dense_expected, default_fill_value=other) - - tm.assert_frame_equal(result, dense_expected) - tm.assert_sp_frame_equal(result, sparse_expected) - - -@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)") -def test_where_with_bool_data(): - # GH 17386 - data = [[False, False], [True, True], [False, False]] - cond = True - - sparse = SparseDataFrame(data) - result = sparse.where(sparse == cond) - - dense = DataFrame(data) - dense_expected = dense.where(dense == cond) - sparse_expected = SparseDataFrame(dense_expected) - - tm.assert_frame_equal(result, dense_expected) - tm.assert_sp_frame_equal(result, sparse_expected) - - -@pytest.mark.parametrize("other", [True, 0, 0.1, 100.0 + 100.0j]) -@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)") -def test_where_with_bool_data_and_other(other): - # GH 17386 - data = [[False, False], [True, True], [False, False]] - cond = True - - sparse = SparseDataFrame(data) - result = sparse.where(sparse == cond, other) - - dense = DataFrame(data) - dense_expected = dense.where(dense == cond, other) - sparse_expected = SparseDataFrame(dense_expected, default_fill_value=other) - - tm.assert_frame_equal(result, dense_expected) - tm.assert_sp_frame_equal(result, sparse_expected) diff --git a/pandas/tests/sparse/frame/test_to_csv.py b/pandas/tests/sparse/frame/test_to_csv.py deleted file mode 100644 index 4ba4fba7391d4..0000000000000 --- a/pandas/tests/sparse/frame/test_to_csv.py +++ /dev/null @@ -1,24 +0,0 @@ -import numpy as np -import pytest - -from pandas import SparseDataFrame, read_csv -from pandas.util import testing as tm - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") -class TestSparseDataFrameToCsv: - 
fill_values = [np.nan, 0, None, 1] - - @pytest.mark.parametrize("fill_value", fill_values) - def test_to_csv_sparse_dataframe(self, fill_value): - # GH19384 - sdf = SparseDataFrame( - {"a": type(self).fill_values}, default_fill_value=fill_value - ) - - with tm.ensure_clean("sparse_df.csv") as path: - sdf.to_csv(path, index=False) - df = read_csv(path, skip_blank_lines=False) - - tm.assert_sp_frame_equal(df.to_sparse(fill_value=fill_value), sdf) diff --git a/pandas/tests/sparse/frame/test_to_from_scipy.py b/pandas/tests/sparse/frame/test_to_from_scipy.py deleted file mode 100644 index 9d1ccc62146ab..0000000000000 --- a/pandas/tests/sparse/frame/test_to_from_scipy.py +++ /dev/null @@ -1,196 +0,0 @@ -import numpy as np -import pytest - -from pandas.core.dtypes.common import is_bool_dtype - -import pandas as pd -from pandas import SparseDataFrame, SparseSeries -from pandas.core.sparse.api import SparseDtype -from pandas.util import testing as tm - -scipy = pytest.importorskip("scipy") -ignore_matrix_warning = pytest.mark.filterwarnings( - "ignore:the matrix subclass:PendingDeprecationWarning" -) - - -@pytest.mark.parametrize("index", [None, list("abc")]) # noqa: F811 -@pytest.mark.parametrize("columns", [None, list("def")]) -@pytest.mark.parametrize("fill_value", [None, 0, np.nan]) -@pytest.mark.parametrize("dtype", [bool, int, float, np.uint16]) -@ignore_matrix_warning -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype): - # GH 4343 - # Make one ndarray and from it one sparse matrix, both to be used for - # constructing frames and comparing results - arr = np.eye(3, dtype=dtype) - # GH 16179 - arr[0, 1] = dtype(2) - try: - spm = spmatrix(arr) - assert spm.dtype == arr.dtype - except (TypeError, AssertionError): - # If conversion to sparse fails for this spmatrix type and arr.dtype, - # then the combination is not currently supported in NumPy, so we - # can just skip testing it thoroughly - 
return - - sdf = SparseDataFrame( - spm, index=index, columns=columns, default_fill_value=fill_value - ) - - # Expected result construction is kind of tricky for all - # dtype-fill_value combinations; easiest to cast to something generic - # and except later on - rarr = arr.astype(object) - rarr[arr == 0] = np.nan - expected = SparseDataFrame(rarr, index=index, columns=columns).fillna( - fill_value if fill_value is not None else np.nan - ) - - # Assert frame is as expected - sdf_obj = sdf.astype(object) - tm.assert_sp_frame_equal(sdf_obj, expected) - tm.assert_frame_equal(sdf_obj.to_dense(), expected.to_dense()) - - # Assert spmatrices equal - assert dict(sdf.to_coo().todok()) == dict(spm.todok()) - - # Ensure dtype is preserved if possible - # XXX: verify this - res_dtype = bool if is_bool_dtype(dtype) else dtype - tm.assert_contains_all( - sdf.dtypes.apply(lambda dtype: dtype.subtype), {np.dtype(res_dtype)} - ) - assert sdf.to_coo().dtype == res_dtype - - # However, adding a str column results in an upcast to object - sdf["strings"] = np.arange(len(sdf)).astype(str) - assert sdf.to_coo().dtype == np.object_ - - -@pytest.mark.parametrize("fill_value", [None, 0, np.nan]) # noqa: F811 -@ignore_matrix_warning -@pytest.mark.filterwarnings("ignore:object dtype is not supp:UserWarning") -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -def test_from_to_scipy_object(spmatrix, fill_value): - # GH 4343 - dtype = object - columns = list("cd") - index = list("ab") - - if spmatrix is scipy.sparse.dok_matrix: - pytest.skip("dok_matrix from object does not work in SciPy") - - # Make one ndarray and from it one sparse matrix, both to be used for - # constructing frames and comparing results - arr = np.eye(2, dtype=dtype) - try: - spm = spmatrix(arr) - assert spm.dtype == arr.dtype - except (TypeError, AssertionError): - # If conversion to sparse fails for this spmatrix type and arr.dtype, - # then the combination is not currently supported in NumPy, so we - # can just 
skip testing it thoroughly - return - - sdf = SparseDataFrame( - spm, index=index, columns=columns, default_fill_value=fill_value - ) - - # Expected result construction is kind of tricky for all - # dtype-fill_value combinations; easiest to cast to something generic - # and except later on - rarr = arr.astype(object) - rarr[arr == 0] = np.nan - expected = SparseDataFrame(rarr, index=index, columns=columns).fillna( - fill_value if fill_value is not None else np.nan - ) - - # Assert frame is as expected - sdf_obj = sdf.astype(SparseDtype(object, fill_value)) - tm.assert_sp_frame_equal(sdf_obj, expected) - tm.assert_frame_equal(sdf_obj.to_dense(), expected.to_dense()) - - # Assert spmatrices equal - assert dict(sdf.to_coo().todok()) == dict(spm.todok()) - - # Ensure dtype is preserved if possible - res_dtype = object - tm.assert_contains_all( - sdf.dtypes.apply(lambda dtype: dtype.subtype), {np.dtype(res_dtype)} - ) - assert sdf.to_coo().dtype == res_dtype - - -@ignore_matrix_warning -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -def test_from_scipy_correct_ordering(spmatrix): - # GH 16179 - arr = np.arange(1, 5).reshape(2, 2) - try: - spm = spmatrix(arr) - assert spm.dtype == arr.dtype - except (TypeError, AssertionError): - # If conversion to sparse fails for this spmatrix type and arr.dtype, - # then the combination is not currently supported in NumPy, so we - # can just skip testing it thoroughly - return - - sdf = SparseDataFrame(spm) - expected = SparseDataFrame(arr) - tm.assert_sp_frame_equal(sdf, expected) - tm.assert_frame_equal(sdf.to_dense(), expected.to_dense()) - - -@ignore_matrix_warning -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -def test_from_scipy_fillna(spmatrix): - # GH 16112 - arr = np.eye(3) - arr[1:, 0] = np.nan - - try: - spm = spmatrix(arr) - assert spm.dtype == arr.dtype - except (TypeError, AssertionError): - # If conversion to sparse fails for this spmatrix type and arr.dtype, - # then the combination is not 
currently supported in NumPy, so we - # can just skip testing it thoroughly - return - - sdf = SparseDataFrame(spm).fillna(-1.0) - - # Returning frame should fill all nan values with -1.0 - expected = SparseDataFrame( - { - 0: SparseSeries([1.0, -1, -1]), - 1: SparseSeries([np.nan, 1, np.nan]), - 2: SparseSeries([np.nan, np.nan, 1]), - }, - default_fill_value=-1, - ) - - # fill_value is expected to be what .fillna() above was called with - # We don't use -1 as initial fill_value in expected SparseSeries - # construction because this way we obtain "compressed" SparseArrays, - # avoiding having to construct them ourselves - for col in expected: - expected[col].fill_value = -1 - - tm.assert_sp_frame_equal(sdf, expected) - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") -def test_index_names_multiple_nones(): - # https://github.com/pandas-dev/pandas/pull/24092 - sparse = pytest.importorskip("scipy.sparse") - - s = pd.Series(1, index=pd.MultiIndex.from_product([["A", "B"], [0, 1]])).to_sparse() - result, _, _ = s.to_coo() - assert isinstance(result, sparse.coo_matrix) - result = result.toarray() - expected = np.ones((2, 2), dtype="int64") - tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/sparse/series/__init__.py b/pandas/tests/sparse/series/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/pandas/tests/sparse/series/test_indexing.py b/pandas/tests/sparse/series/test_indexing.py deleted file mode 100644 index c75f3b2134f91..0000000000000 --- a/pandas/tests/sparse/series/test_indexing.py +++ /dev/null @@ -1,113 +0,0 @@ -import numpy as np -import pytest - -from pandas import Series, SparseSeries -from pandas.util import testing as tm - -pytestmark = pytest.mark.skip("Wrong SparseBlock initialization (GH 17386)") - - -@pytest.mark.parametrize( - "data", - [ - [1, 1, 2, 2, 3, 3, 4, 4, 0, 0], - [1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 
4.0, np.nan, np.nan], - [ - 1.0, - 1.0 + 1.0j, - 2.0 + 2.0j, - 2.0, - 3.0, - 3.0 + 3.0j, - 4.0 + 4.0j, - 4.0, - np.nan, - np.nan, - ], - ], -) -@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)") -def test_where_with_numeric_data(data): - # GH 17386 - lower_bound = 1.5 - - sparse = SparseSeries(data) - result = sparse.where(sparse > lower_bound) - - dense = Series(data) - dense_expected = dense.where(dense > lower_bound) - sparse_expected = SparseSeries(dense_expected) - - tm.assert_series_equal(result, dense_expected) - tm.assert_sp_series_equal(result, sparse_expected) - - -@pytest.mark.parametrize( - "data", - [ - [1, 1, 2, 2, 3, 3, 4, 4, 0, 0], - [1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, np.nan, np.nan], - [ - 1.0, - 1.0 + 1.0j, - 2.0 + 2.0j, - 2.0, - 3.0, - 3.0 + 3.0j, - 4.0 + 4.0j, - 4.0, - np.nan, - np.nan, - ], - ], -) -@pytest.mark.parametrize("other", [True, -100, 0.1, 100.0 + 100.0j]) -@pytest.mark.skip(reason="Wrong SparseBlock initialization (Segfault) (GH 17386)") -def test_where_with_numeric_data_and_other(data, other): - # GH 17386 - lower_bound = 1.5 - - sparse = SparseSeries(data) - result = sparse.where(sparse > lower_bound, other) - - dense = Series(data) - dense_expected = dense.where(dense > lower_bound, other) - sparse_expected = SparseSeries(dense_expected, fill_value=other) - - tm.assert_series_equal(result, dense_expected) - tm.assert_sp_series_equal(result, sparse_expected) - - -@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)") -def test_where_with_bool_data(): - # GH 17386 - data = [False, False, True, True, False, False] - cond = True - - sparse = SparseSeries(data) - result = sparse.where(sparse == cond) - - dense = Series(data) - dense_expected = dense.where(dense == cond) - sparse_expected = SparseSeries(dense_expected) - - tm.assert_series_equal(result, dense_expected) - tm.assert_sp_series_equal(result, sparse_expected) - - -@pytest.mark.parametrize("other", [True, 0, 0.1, 100.0 + 100.0j]) 
-@pytest.mark.skip(reason="Wrong SparseBlock initialization (Segfault) (GH 17386)") -def test_where_with_bool_data_and_other(other): - # GH 17386 - data = [False, False, True, True, False, False] - cond = True - - sparse = SparseSeries(data) - result = sparse.where(sparse == cond, other) - - dense = Series(data) - dense_expected = dense.where(dense == cond, other) - sparse_expected = SparseSeries(dense_expected, fill_value=other) - - tm.assert_series_equal(result, dense_expected) - tm.assert_sp_series_equal(result, sparse_expected) diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py deleted file mode 100644 index 046e7745fd4ec..0000000000000 --- a/pandas/tests/sparse/series/test_series.py +++ /dev/null @@ -1,1596 +0,0 @@ -from datetime import datetime -import operator - -import numpy as np -from numpy import nan -import pytest - -from pandas._libs.sparse import BlockIndex, IntIndex -from pandas.compat import PY36 -from pandas.errors import PerformanceWarning -import pandas.util._test_decorators as td - -import pandas as pd -from pandas import DataFrame, Series, SparseDtype, SparseSeries, bdate_range, isna -from pandas.core import ops -from pandas.core.reshape.util import cartesian_product -import pandas.core.sparse.frame as spf -from pandas.tests.series.test_api import SharedWithSparse -import pandas.util.testing as tm - -from pandas.tseries.offsets import BDay - - -def test_deprecated(): - with tm.assert_produces_warning(FutureWarning): - pd.SparseSeries([0, 1]) - - -def _test_data1(): - # nan-based - arr = np.arange(20, dtype=float) - index = np.arange(20) - arr[:2] = nan - arr[5:10] = nan - arr[-3:] = nan - - return arr, index - - -def _test_data2(): - # nan-based - arr = np.arange(15, dtype=float) - index = np.arange(15) - arr[7:12] = nan - arr[-1:] = nan - return arr, index - - -def _test_data1_zero(): - # zero-based - arr, index = _test_data1() - arr[np.isnan(arr)] = 0 - return arr, index - - -def 
_test_data2_zero(): - # zero-based - arr, index = _test_data2() - arr[np.isnan(arr)] = 0 - return arr, index - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") -class TestSparseSeries(SharedWithSparse): - - series_klass = SparseSeries - # SharedWithSparse tests use generic, series_klass-agnostic assertion - _assert_series_equal = staticmethod(tm.assert_sp_series_equal) - - def setup_method(self, method): - arr, index = _test_data1() - - date_index = bdate_range("1/1/2011", periods=len(index)) - - self.bseries = SparseSeries(arr, index=index, kind="block", name="bseries") - self.ts = self.bseries - - self.btseries = SparseSeries(arr, index=date_index, kind="block") - - self.iseries = SparseSeries(arr, index=index, kind="integer", name="iseries") - - arr, index = _test_data2() - self.bseries2 = SparseSeries(arr, index=index, kind="block") - self.iseries2 = SparseSeries(arr, index=index, kind="integer") - - arr, index = _test_data1_zero() - self.zbseries = SparseSeries( - arr, index=index, kind="block", fill_value=0, name="zbseries" - ) - self.ziseries = SparseSeries(arr, index=index, kind="integer", fill_value=0) - - arr, index = _test_data2_zero() - self.zbseries2 = SparseSeries(arr, index=index, kind="block", fill_value=0) - self.ziseries2 = SparseSeries(arr, index=index, kind="integer", fill_value=0) - - def test_constructor_dict_input(self): - # gh-16905 - constructor_dict = {1: 1.0} - index = [0, 1, 2] - - # Series with index passed in - series = pd.Series(constructor_dict) - expected = SparseSeries(series, index=index) - - result = SparseSeries(constructor_dict, index=index) - tm.assert_sp_series_equal(result, expected) - - # Series with index and dictionary with no index - expected = SparseSeries(series) - - result = SparseSeries(constructor_dict) - tm.assert_sp_series_equal(result, expected) - - def test_constructor_dict_order(self): - # GH19018 - # initialization ordering: by 
insertion order if python>= 3.6, else - # order by value - d = {"b": 1, "a": 0, "c": 2} - result = SparseSeries(d) - if PY36: - expected = SparseSeries([1, 0, 2], index=list("bac")) - else: - expected = SparseSeries([0, 1, 2], index=list("abc")) - tm.assert_sp_series_equal(result, expected) - - def test_constructor_dtype(self): - arr = SparseSeries([np.nan, 1, 2, np.nan]) - assert arr.dtype == SparseDtype(np.float64) - assert np.isnan(arr.fill_value) - - arr = SparseSeries([np.nan, 1, 2, np.nan], fill_value=0) - assert arr.dtype == SparseDtype(np.float64, 0) - assert arr.fill_value == 0 - - arr = SparseSeries([0, 1, 2, 4], dtype=np.int64, fill_value=np.nan) - assert arr.dtype == SparseDtype(np.int64, np.nan) - assert np.isnan(arr.fill_value) - - arr = SparseSeries([0, 1, 2, 4], dtype=np.int64) - assert arr.dtype == SparseDtype(np.int64, 0) - assert arr.fill_value == 0 - - arr = SparseSeries([0, 1, 2, 4], fill_value=0, dtype=np.int64) - assert arr.dtype == SparseDtype(np.int64, 0) - assert arr.fill_value == 0 - - def test_iteration_and_str(self): - [x for x in self.bseries] - str(self.bseries) - - def test_construct_DataFrame_with_sp_series(self): - # it works! 
- df = DataFrame({"col": self.bseries}) - - # printing & access - df.iloc[:1] - df["col"] - df.dtypes - str(df) - - # blocking - expected = Series({"col": "float64:sparse"}) - - # GH 26705 - Assert .ftypes is deprecated - with tm.assert_produces_warning(FutureWarning): - result = df.ftypes - tm.assert_series_equal(expected, result) - - def test_constructor_preserve_attr(self): - arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0) - assert arr.dtype == SparseDtype(np.int64) - assert arr.fill_value == 0 - - s = pd.SparseSeries(arr, name="x") - assert s.dtype == SparseDtype(np.int64) - assert s.fill_value == 0 - - def test_series_density(self): - # GH2803 - ts = Series(np.random.randn(10)) - ts[2:-2] = nan - sts = ts.to_sparse() - density = sts.density # don't die - assert density == 4 / 10.0 - - def test_sparse_to_dense(self): - arr, index = _test_data1() - series = self.bseries.to_dense() - tm.assert_series_equal(series, Series(arr, name="bseries")) - - series = self.iseries.to_dense() - tm.assert_series_equal(series, Series(arr, name="iseries")) - - arr, index = _test_data1_zero() - series = self.zbseries.to_dense() - tm.assert_series_equal(series, Series(arr, name="zbseries")) - - series = self.ziseries.to_dense() - tm.assert_series_equal(series, Series(arr)) - - def test_to_dense_fill_value(self): - s = pd.Series([1, np.nan, np.nan, 3, np.nan]) - res = SparseSeries(s).to_dense() - tm.assert_series_equal(res, s) - - res = SparseSeries(s, fill_value=0).to_dense() - tm.assert_series_equal(res, s) - - s = pd.Series([1, np.nan, 0, 3, 0]) - res = SparseSeries(s, fill_value=0).to_dense() - tm.assert_series_equal(res, s) - - res = SparseSeries(s, fill_value=0).to_dense() - tm.assert_series_equal(res, s) - - s = pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan]) - res = SparseSeries(s).to_dense() - tm.assert_series_equal(res, s) - - s = pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan]) - res = SparseSeries(s, fill_value=0).to_dense() - 
tm.assert_series_equal(res, s) - - def test_dense_to_sparse(self): - series = self.bseries.to_dense() - bseries = series.to_sparse(kind="block") - iseries = series.to_sparse(kind="integer") - tm.assert_sp_series_equal(bseries, self.bseries) - tm.assert_sp_series_equal(iseries, self.iseries, check_names=False) - assert iseries.name == self.bseries.name - - assert len(series) == len(bseries) - assert len(series) == len(iseries) - assert series.shape == bseries.shape - assert series.shape == iseries.shape - - # non-NaN fill value - series = self.zbseries.to_dense() - zbseries = series.to_sparse(kind="block", fill_value=0) - ziseries = series.to_sparse(kind="integer", fill_value=0) - tm.assert_sp_series_equal(zbseries, self.zbseries) - tm.assert_sp_series_equal(ziseries, self.ziseries, check_names=False) - assert ziseries.name == self.zbseries.name - - assert len(series) == len(zbseries) - assert len(series) == len(ziseries) - assert series.shape == zbseries.shape - assert series.shape == ziseries.shape - - def test_to_dense_preserve_name(self): - assert self.bseries.name is not None - result = self.bseries.to_dense() - assert result.name == self.bseries.name - - def test_constructor(self): - # test setup guys - assert np.isnan(self.bseries.fill_value) - assert isinstance(self.bseries.sp_index, BlockIndex) - assert np.isnan(self.iseries.fill_value) - assert isinstance(self.iseries.sp_index, IntIndex) - - assert self.zbseries.fill_value == 0 - tm.assert_numpy_array_equal( - self.zbseries.values.to_dense(), self.bseries.to_dense().fillna(0).values - ) - - # pass SparseSeries - def _check_const(sparse, name): - # use passed series name - result = SparseSeries(sparse) - tm.assert_sp_series_equal(result, sparse) - assert sparse.name == name - assert result.name == name - - # use passed name - result = SparseSeries(sparse, name="x") - tm.assert_sp_series_equal(result, sparse, check_names=False) - assert result.name == "x" - - _check_const(self.bseries, "bseries") - 
_check_const(self.iseries, "iseries") - _check_const(self.zbseries, "zbseries") - - # Sparse time series works - date_index = bdate_range("1/1/2000", periods=len(self.bseries)) - s5 = SparseSeries(self.bseries, index=date_index) - assert isinstance(s5, SparseSeries) - - # pass Series - bseries2 = SparseSeries(self.bseries.to_dense()) - tm.assert_numpy_array_equal(self.bseries.sp_values, bseries2.sp_values) - - # pass dict? - - # don't copy the data by default - values = np.ones(self.bseries.npoints) - sp = SparseSeries(values, sparse_index=self.bseries.sp_index) - sp.sp_values[:5] = 97 - assert values[0] == 97 - - assert len(sp) == 20 - assert sp.shape == (20,) - - # but can make it copy! - sp = SparseSeries(values, sparse_index=self.bseries.sp_index, copy=True) - sp.sp_values[:5] = 100 - assert values[0] == 97 - - assert len(sp) == 20 - assert sp.shape == (20,) - - def test_constructor_scalar(self): - data = 5 - sp = SparseSeries(data, np.arange(100)) - sp = sp.reindex(np.arange(200)) - assert (sp.loc[:99] == data).all() - assert isna(sp.loc[100:]).all() - - data = np.nan - sp = SparseSeries(data, np.arange(100)) - assert len(sp) == 100 - assert sp.shape == (100,) - - def test_constructor_ndarray(self): - pass - - def test_constructor_nonnan(self): - arr = [0, 0, 0, nan, nan] - sp_series = SparseSeries(arr, fill_value=0) - tm.assert_numpy_array_equal(sp_series.values.to_dense(), np.array(arr)) - assert len(sp_series) == 5 - assert sp_series.shape == (5,) - - def test_constructor_empty(self): - # see gh-9272 - sp = SparseSeries() - assert len(sp.index) == 0 - assert sp.shape == (0,) - - def test_copy_astype(self): - cop = self.bseries.astype(np.float64) - assert cop is not self.bseries - assert cop.sp_index is self.bseries.sp_index - assert cop.dtype == SparseDtype(np.float64) - - cop2 = self.iseries.copy() - - tm.assert_sp_series_equal(cop, self.bseries) - tm.assert_sp_series_equal(cop2, self.iseries) - - # test that data is copied - cop[:5] = 97 - assert 
cop.sp_values[0] == 97 - assert self.bseries.sp_values[0] != 97 - - # correct fill value - zbcop = self.zbseries.copy() - zicop = self.ziseries.copy() - - tm.assert_sp_series_equal(zbcop, self.zbseries) - tm.assert_sp_series_equal(zicop, self.ziseries) - - # no deep copy - view = self.bseries.copy(deep=False) - view.sp_values[:5] = 5 - assert (self.bseries.sp_values[:5] == 5).all() - - def test_shape(self): - # see gh-10452 - assert self.bseries.shape == (20,) - assert self.btseries.shape == (20,) - assert self.iseries.shape == (20,) - - assert self.bseries2.shape == (15,) - assert self.iseries2.shape == (15,) - - assert self.zbseries2.shape == (15,) - assert self.ziseries2.shape == (15,) - - def test_astype(self): - result = self.bseries.astype(SparseDtype(np.int64, 0)) - expected = ( - self.bseries.to_dense().fillna(0).astype(np.int64).to_sparse(fill_value=0) - ) - tm.assert_sp_series_equal(result, expected) - - def test_astype_all(self): - orig = pd.Series(np.array([1, 2, 3])) - s = SparseSeries(orig) - - types = [np.float64, np.float32, np.int64, np.int32, np.int16, np.int8] - for typ in types: - dtype = SparseDtype(typ) - res = s.astype(dtype) - assert res.dtype == dtype - tm.assert_series_equal(res.to_dense(), orig.astype(typ)) - - def test_kind(self): - assert self.bseries.kind == "block" - assert self.iseries.kind == "integer" - - def test_to_frame(self): - # GH 9850 - s = pd.SparseSeries([1, 2, 0, nan, 4, nan, 0], name="x") - exp = pd.SparseDataFrame({"x": [1, 2, 0, nan, 4, nan, 0]}) - tm.assert_sp_frame_equal(s.to_frame(), exp) - - exp = pd.SparseDataFrame({"y": [1, 2, 0, nan, 4, nan, 0]}) - tm.assert_sp_frame_equal(s.to_frame(name="y"), exp) - - s = pd.SparseSeries([1, 2, 0, nan, 4, nan, 0], name="x", fill_value=0) - exp = pd.SparseDataFrame({"x": [1, 2, 0, nan, 4, nan, 0]}, default_fill_value=0) - - tm.assert_sp_frame_equal(s.to_frame(), exp) - exp = pd.DataFrame({"y": [1, 2, 0, nan, 4, nan, 0]}) - tm.assert_frame_equal(s.to_frame(name="y").to_dense(), 
exp) - - def test_pickle(self): - def _test_roundtrip(series): - unpickled = tm.round_trip_pickle(series) - tm.assert_sp_series_equal(series, unpickled) - tm.assert_series_equal(series.to_dense(), unpickled.to_dense()) - - self._check_all(_test_roundtrip) - - def _check_all(self, check_func): - check_func(self.bseries) - check_func(self.iseries) - check_func(self.zbseries) - check_func(self.ziseries) - - def test_getitem(self): - def _check_getitem(sp, dense): - for idx, val in dense.items(): - tm.assert_almost_equal(val, sp[idx]) - - for i in range(len(dense)): - tm.assert_almost_equal(sp[i], dense[i]) - # j = np.float64(i) - # assert_almost_equal(sp[j], dense[j]) - - # API change 1/6/2012 - # negative getitem works - # for i in xrange(len(dense)): - # assert_almost_equal(sp[-i], dense[-i]) - - _check_getitem(self.bseries, self.bseries.to_dense()) - _check_getitem(self.btseries, self.btseries.to_dense()) - - _check_getitem(self.zbseries, self.zbseries.to_dense()) - _check_getitem(self.iseries, self.iseries.to_dense()) - _check_getitem(self.ziseries, self.ziseries.to_dense()) - - # exception handling - with pytest.raises(IndexError, match="Out of bounds access"): - self.bseries[len(self.bseries) + 1] - - # index not contained - msg = r"Timestamp\('2011-01-31 00:00:00', freq='B'\)" - with pytest.raises(KeyError, match=msg): - self.btseries[self.btseries.index[-1] + BDay()] - - def test_get_get_value(self): - tm.assert_almost_equal(self.bseries.get(10), self.bseries[10]) - assert self.bseries.get(len(self.bseries) + 1) is None - - dt = self.btseries.index[10] - result = self.btseries.get(dt) - expected = self.btseries.to_dense()[dt] - tm.assert_almost_equal(result, expected) - - tm.assert_almost_equal(self.bseries._get_value(10), self.bseries[10]) - - def test_set_value(self): - - idx = self.btseries.index[7] - self.btseries._set_value(idx, 0) - assert self.btseries[idx] == 0 - - self.iseries._set_value("foobar", 0) - assert self.iseries.index[-1] == "foobar" - 
assert self.iseries["foobar"] == 0 - - def test_getitem_slice(self): - idx = self.bseries.index - res = self.bseries[::2] - assert isinstance(res, SparseSeries) - - expected = self.bseries.reindex(idx[::2]) - tm.assert_sp_series_equal(res, expected) - - res = self.bseries[:5] - assert isinstance(res, SparseSeries) - tm.assert_sp_series_equal(res, self.bseries.reindex(idx[:5])) - - res = self.bseries[5:] - tm.assert_sp_series_equal(res, self.bseries.reindex(idx[5:])) - - # negative indices - res = self.bseries[:-3] - tm.assert_sp_series_equal(res, self.bseries.reindex(idx[:-3])) - - def test_take(self): - def _compare_with_dense(sp): - dense = sp.to_dense() - - def _compare(idx): - dense_result = dense.take(idx).values - sparse_result = sp.take(idx) - assert isinstance(sparse_result, SparseSeries) - tm.assert_almost_equal(dense_result, sparse_result.values.to_dense()) - - _compare([1.0, 2.0, 3.0, 4.0, 5.0, 0.0]) - _compare([7, 2, 9, 0, 4]) - _compare([3, 6, 3, 4, 7]) - - self._check_all(_compare_with_dense) - - msg = "index 21 is out of bounds for size 20" - with pytest.raises(IndexError, match=msg): - self.bseries.take([0, len(self.bseries) + 1]) - - # Corner case - # XXX: changed test. Why wsa this considered a corner case? 
- sp = SparseSeries(np.ones(10) * nan) - exp = pd.Series(np.repeat(nan, 5)) - tm.assert_series_equal(sp.take([0, 1, 2, 3, 4]), exp.to_sparse()) - - def test_numpy_take(self): - sp = SparseSeries([1.0, 2.0, 3.0]) - indices = [1, 2] - - tm.assert_series_equal( - np.take(sp, indices, axis=0).to_dense(), - np.take(sp.to_dense(), indices, axis=0), - ) - - msg = "the 'out' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.take(sp, indices, out=np.empty(sp.shape)) - - msg = "the 'mode' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.take(sp, indices, out=None, mode="clip") - - def test_setitem(self): - self.bseries[5] = 7.0 - assert self.bseries[5] == 7.0 - - def test_setslice(self): - self.bseries[5:10] = 7.0 - tm.assert_series_equal( - self.bseries[5:10].to_dense(), - Series(7.0, index=range(5, 10), name=self.bseries.name), - ) - - def test_operators(self): - def _check_op(a, b, op): - sp_result = op(a, b) - adense = a.to_dense() if isinstance(a, SparseSeries) else a - bdense = b.to_dense() if isinstance(b, SparseSeries) else b - dense_result = op(adense, bdense) - if "floordiv" in op.__name__: - # Series sets 1//0 to np.inf, which SparseSeries does not do (yet) - mask = np.isinf(dense_result) - dense_result[mask] = np.nan - tm.assert_almost_equal(sp_result.to_dense(), dense_result) - - def check(a, b): - _check_op(a, b, operator.add) - _check_op(a, b, operator.sub) - _check_op(a, b, operator.truediv) - _check_op(a, b, operator.floordiv) - _check_op(a, b, operator.mul) - - _check_op(a, b, ops.radd) - _check_op(a, b, ops.rsub) - _check_op(a, b, ops.rtruediv) - _check_op(a, b, ops.rfloordiv) - _check_op(a, b, ops.rmul) - - # FIXME: don't leave commented-out - # NaN ** 0 = 1 in C? 
- # _check_op(a, b, operator.pow) - # _check_op(a, b, ops.rpow) - - check(self.bseries, self.bseries) - check(self.iseries, self.iseries) - check(self.bseries, self.iseries) - - check(self.bseries, self.bseries2) - check(self.bseries, self.iseries2) - check(self.iseries, self.iseries2) - - # scalar value - check(self.bseries, 5) - - # zero-based - check(self.zbseries, self.zbseries * 2) - check(self.zbseries, self.zbseries2) - check(self.ziseries, self.ziseries2) - - # with dense - result = self.bseries + self.bseries.to_dense() - tm.assert_sp_series_equal(result, self.bseries + self.bseries) - - def test_binary_operators(self): - - # skipping for now ##### - import pytest - - pytest.skip("skipping sparse binary operators test") - - def _check_inplace_op(iop, op): - tmp = self.bseries.copy() - - expected = op(tmp, self.bseries) - iop(tmp, self.bseries) - tm.assert_sp_series_equal(tmp, expected) - - inplace_ops = ["add", "sub", "mul", "truediv", "floordiv", "pow"] - for op in inplace_ops: - _check_inplace_op( - getattr(operator, "i{op}".format(op=op)), getattr(operator, op) - ) - - @pytest.mark.parametrize( - "values, op, fill_value", - [ - ([True, False, False, True], operator.invert, True), - ([True, False, False, True], operator.invert, False), - ([0, 1, 2, 3], operator.pos, 0), - ([0, 1, 2, 3], operator.neg, 0), - ([0, np.nan, 2, 3], operator.pos, np.nan), - ([0, np.nan, 2, 3], operator.neg, np.nan), - ], - ) - def test_unary_operators(self, values, op, fill_value): - # https://github.com/pandas-dev/pandas/issues/22835 - values = np.asarray(values) - if op is operator.invert: - new_fill_value = not fill_value - else: - new_fill_value = op(fill_value) - s = SparseSeries( - values, fill_value=fill_value, index=["a", "b", "c", "d"], name="name" - ) - result = op(s) - expected = SparseSeries( - op(values), - fill_value=new_fill_value, - index=["a", "b", "c", "d"], - name="name", - ) - tm.assert_sp_series_equal(result, expected) - - def test_abs(self): - s = 
SparseSeries([1, 2, -3], name="x") - expected = SparseSeries([1, 2, 3], name="x") - result = s.abs() - tm.assert_sp_series_equal(result, expected) - assert result.name == "x" - - result = abs(s) - tm.assert_sp_series_equal(result, expected) - assert result.name == "x" - - result = np.abs(s) - tm.assert_sp_series_equal(result, expected) - assert result.name == "x" - - s = SparseSeries([1, -2, 2, -3], fill_value=-2, name="x") - expected = SparseSeries( - [1, 2, 3], sparse_index=s.sp_index, fill_value=2, name="x" - ) - result = s.abs() - tm.assert_sp_series_equal(result, expected) - assert result.name == "x" - - result = abs(s) - tm.assert_sp_series_equal(result, expected) - assert result.name == "x" - - result = np.abs(s) - tm.assert_sp_series_equal(result, expected) - assert result.name == "x" - - def test_reindex(self): - def _compare_with_series(sps, new_index): - spsre = sps.reindex(new_index) - - series = sps.to_dense() - seriesre = series.reindex(new_index) - seriesre = seriesre.to_sparse(fill_value=sps.fill_value) - - tm.assert_sp_series_equal(spsre, seriesre) - tm.assert_series_equal(spsre.to_dense(), seriesre.to_dense()) - - _compare_with_series(self.bseries, self.bseries.index[::2]) - _compare_with_series(self.bseries, list(self.bseries.index[::2])) - _compare_with_series(self.bseries, self.bseries.index[:10]) - _compare_with_series(self.bseries, self.bseries.index[5:]) - - _compare_with_series(self.zbseries, self.zbseries.index[::2]) - _compare_with_series(self.zbseries, self.zbseries.index[:10]) - _compare_with_series(self.zbseries, self.zbseries.index[5:]) - - # special cases - same_index = self.bseries.reindex(self.bseries.index) - tm.assert_sp_series_equal(self.bseries, same_index) - assert same_index is not self.bseries - - # corner cases - sp = SparseSeries([], index=[]) - # TODO: sp_zero is not used anywhere...remove? 
- sp_zero = SparseSeries([], index=[], fill_value=0) # noqa - _compare_with_series(sp, np.arange(10)) - - # with copy=False - reindexed = self.bseries.reindex(self.bseries.index, copy=True) - reindexed.sp_values[:] = 1.0 - assert (self.bseries.sp_values != 1.0).all() - - reindexed = self.bseries.reindex(self.bseries.index, copy=False) - reindexed.sp_values[:] = 1.0 - tm.assert_numpy_array_equal(self.bseries.sp_values, np.repeat(1.0, 10)) - - def test_sparse_reindex(self): - length = 10 - - def _check(values, index1, index2, fill_value): - first_series = SparseSeries( - values, sparse_index=index1, fill_value=fill_value - ) - reindexed = first_series.sparse_reindex(index2) - assert reindexed.sp_index is index2 - - int_indices1 = index1.to_int_index().indices - int_indices2 = index2.to_int_index().indices - - expected = Series(values, index=int_indices1) - expected = expected.reindex(int_indices2).fillna(fill_value) - tm.assert_almost_equal(expected.values, reindexed.sp_values) - - # make sure level argument asserts - # TODO: expected is not used anywhere...remove? 
- expected = expected.reindex(int_indices2).fillna(fill_value) # noqa - - def _check_with_fill_value(values, first, second, fill_value=nan): - i_index1 = IntIndex(length, first) - i_index2 = IntIndex(length, second) - - b_index1 = i_index1.to_block_index() - b_index2 = i_index2.to_block_index() - - _check(values, i_index1, i_index2, fill_value) - _check(values, b_index1, b_index2, fill_value) - - def _check_all(values, first, second): - _check_with_fill_value(values, first, second, fill_value=nan) - _check_with_fill_value(values, first, second, fill_value=0) - - index1 = [2, 4, 5, 6, 8, 9] - values1 = np.arange(6.0) - - _check_all(values1, index1, [2, 4, 5]) - _check_all(values1, index1, [2, 3, 4, 5, 6, 7, 8, 9]) - _check_all(values1, index1, [0, 1]) - _check_all(values1, index1, [0, 1, 7, 8, 9]) - _check_all(values1, index1, []) - - first_series = SparseSeries( - values1, sparse_index=IntIndex(length, index1), fill_value=nan - ) - with pytest.raises(TypeError, match="new index must be a SparseIndex"): - first_series.sparse_reindex(0) - - def test_repr(self): - # TODO: These aren't used - bsrepr = repr(self.bseries) # noqa - isrepr = repr(self.iseries) # noqa - - def test_iter(self): - pass - - def test_truncate(self): - pass - - def test_fillna(self): - pass - - def test_groupby(self): - pass - - def test_reductions(self): - def _compare_with_dense(obj, op): - sparse_result = getattr(obj, op)() - series = obj.to_dense() - dense_result = getattr(series, op)() - assert sparse_result == dense_result - - to_compare = ["count", "sum", "mean", "std", "var", "skew"] - - def _compare_all(obj): - for op in to_compare: - _compare_with_dense(obj, op) - - _compare_all(self.bseries) - - self.bseries.sp_values[5:10] = np.NaN - _compare_all(self.bseries) - - _compare_all(self.zbseries) - self.zbseries.sp_values[5:10] = np.NaN - _compare_all(self.zbseries) - - series = self.zbseries.copy() - series.fill_value = 2 - _compare_all(series) - - nonna = 
Series(np.random.randn(20)).to_sparse() - _compare_all(nonna) - - nonna2 = Series(np.random.randn(20)).to_sparse(fill_value=0) - _compare_all(nonna2) - - def test_dropna(self): - sp = SparseSeries([0, 0, 0, nan, nan, 5, 6], fill_value=0) - - sp_valid = sp.dropna() - - expected = sp.to_dense().dropna() - expected = expected[expected != 0] - exp_arr = pd.SparseArray(expected.values, fill_value=0, kind="block") - tm.assert_sp_array_equal(sp_valid.values, exp_arr) - tm.assert_index_equal(sp_valid.index, expected.index) - assert len(sp_valid.sp_values) == 2 - - result = self.bseries.dropna() - expected = self.bseries.to_dense().dropna() - assert not isinstance(result, SparseSeries) - tm.assert_series_equal(result, expected) - - def test_homogenize(self): - def _check_matches(indices, expected): - data = { - i: SparseSeries( - idx.to_int_index().indices, sparse_index=idx, fill_value=np.nan - ) - for i, idx in enumerate(indices) - } - - # homogenized is only valid with NaN fill values - homogenized = spf.homogenize(data) - - for k, v in homogenized.items(): - assert v.sp_index.equals(expected) - - indices1 = [ - BlockIndex(10, [2], [7]), - BlockIndex(10, [1, 6], [3, 4]), - BlockIndex(10, [0], [10]), - ] - expected1 = BlockIndex(10, [2, 6], [2, 3]) - _check_matches(indices1, expected1) - - indices2 = [BlockIndex(10, [2], [7]), BlockIndex(10, [2], [7])] - expected2 = indices2[0] - _check_matches(indices2, expected2) - - # must have NaN fill value - data = {"a": SparseSeries(np.arange(7), sparse_index=expected2, fill_value=0)} - with pytest.raises(TypeError, match="NaN fill value"): - spf.homogenize(data) - - def test_fill_value_corner(self): - cop = self.zbseries.copy() - cop.fill_value = 0 - result = self.bseries / cop - - assert np.isnan(result.fill_value) - - cop2 = self.zbseries.copy() - cop2.fill_value = 1 - result = cop2 / cop - # 1 / 0 is inf - assert np.isinf(result.fill_value) - - def test_fill_value_when_combine_const(self): - # GH12723 - s = SparseSeries([0, 1, 
np.nan, 3, 4, 5], index=np.arange(6)) - - exp = s.fillna(0).add(2) - res = s.add(2, fill_value=0) - tm.assert_series_equal(res, exp) - - def test_shift(self): - series = SparseSeries([nan, 1.0, 2.0, 3.0, nan, nan], index=np.arange(6)) - - shifted = series.shift(0) - # assert shifted is not series - tm.assert_sp_series_equal(shifted, series) - - f = lambda s: s.shift(1) - _dense_series_compare(series, f) - - f = lambda s: s.shift(-2) - _dense_series_compare(series, f) - - series = SparseSeries( - [nan, 1.0, 2.0, 3.0, nan, nan], index=bdate_range("1/1/2000", periods=6) - ) - f = lambda s: s.shift(2, freq="B") - _dense_series_compare(series, f) - - f = lambda s: s.shift(2, freq=BDay()) - _dense_series_compare(series, f) - - def test_shift_nan(self): - # GH 12908 - orig = pd.Series([np.nan, 2, np.nan, 4, 0, np.nan, 0]) - sparse = orig.to_sparse() - - tm.assert_sp_series_equal( - sparse.shift(0), orig.shift(0).to_sparse(), check_kind=False - ) - tm.assert_sp_series_equal( - sparse.shift(1), orig.shift(1).to_sparse(), check_kind=False - ) - tm.assert_sp_series_equal( - sparse.shift(2), orig.shift(2).to_sparse(), check_kind=False - ) - tm.assert_sp_series_equal( - sparse.shift(3), orig.shift(3).to_sparse(), check_kind=False - ) - - tm.assert_sp_series_equal(sparse.shift(-1), orig.shift(-1).to_sparse()) - tm.assert_sp_series_equal(sparse.shift(-2), orig.shift(-2).to_sparse()) - tm.assert_sp_series_equal(sparse.shift(-3), orig.shift(-3).to_sparse()) - tm.assert_sp_series_equal(sparse.shift(-4), orig.shift(-4).to_sparse()) - - sparse = orig.to_sparse(fill_value=0) - tm.assert_sp_series_equal( - sparse.shift(0), orig.shift(0).to_sparse(fill_value=sparse.fill_value) - ) - tm.assert_sp_series_equal( - sparse.shift(1), orig.shift(1).to_sparse(fill_value=0), check_kind=False - ) - tm.assert_sp_series_equal( - sparse.shift(2), orig.shift(2).to_sparse(fill_value=0), check_kind=False - ) - tm.assert_sp_series_equal( - sparse.shift(3), orig.shift(3).to_sparse(fill_value=0), 
check_kind=False - ) - - tm.assert_sp_series_equal( - sparse.shift(-1), orig.shift(-1).to_sparse(fill_value=0), check_kind=False - ) - tm.assert_sp_series_equal( - sparse.shift(-2), orig.shift(-2).to_sparse(fill_value=0), check_kind=False - ) - tm.assert_sp_series_equal( - sparse.shift(-3), orig.shift(-3).to_sparse(fill_value=0), check_kind=False - ) - tm.assert_sp_series_equal( - sparse.shift(-4), orig.shift(-4).to_sparse(fill_value=0), check_kind=False - ) - - def test_shift_dtype(self): - # GH 12908 - orig = pd.Series([1, 2, 3, 4], dtype=np.int64) - - sparse = orig.to_sparse() - tm.assert_sp_series_equal(sparse.shift(0), orig.shift(0).to_sparse()) - - sparse = orig.to_sparse(fill_value=np.nan) - tm.assert_sp_series_equal( - sparse.shift(0), orig.shift(0).to_sparse(fill_value=np.nan) - ) - # shift(1) or more span changes dtype to float64 - # XXX: SparseSeries doesn't need to shift dtype here. - # Do we want to astype in shift, for backwards compat? - # If not, document it. - tm.assert_sp_series_equal( - sparse.shift(1).astype("f8"), orig.shift(1).to_sparse(kind="integer") - ) - tm.assert_sp_series_equal( - sparse.shift(2).astype("f8"), orig.shift(2).to_sparse(kind="integer") - ) - tm.assert_sp_series_equal( - sparse.shift(3).astype("f8"), orig.shift(3).to_sparse(kind="integer") - ) - - tm.assert_sp_series_equal( - sparse.shift(-1).astype("f8"), orig.shift(-1).to_sparse(), check_kind=False - ) - tm.assert_sp_series_equal( - sparse.shift(-2).astype("f8"), orig.shift(-2).to_sparse(), check_kind=False - ) - tm.assert_sp_series_equal( - sparse.shift(-3).astype("f8"), orig.shift(-3).to_sparse(), check_kind=False - ) - tm.assert_sp_series_equal( - sparse.shift(-4).astype("f8"), orig.shift(-4).to_sparse(), check_kind=False - ) - - @pytest.mark.parametrize("fill_value", [0, 1, np.nan]) - @pytest.mark.parametrize("periods", [0, 1, 2, 3, -1, -2, -3, -4]) - def test_shift_dtype_fill_value(self, fill_value, periods): - # GH 12908 - orig = pd.Series([1, 0, 0, 4], 
dtype=np.dtype("int64")) - - sparse = orig.to_sparse(fill_value=fill_value) - - result = sparse.shift(periods) - expected = orig.shift(periods).to_sparse(fill_value=fill_value) - - tm.assert_sp_series_equal( - result, expected, check_kind=False, consolidate_block_indices=True - ) - - def test_combine_first(self): - s = self.bseries - - result = s[::2].combine_first(s) - result2 = s[::2].combine_first(s.to_dense()) - - expected = s[::2].to_dense().combine_first(s.to_dense()) - expected = expected.to_sparse(fill_value=s.fill_value) - - tm.assert_sp_series_equal(result, result2) - tm.assert_sp_series_equal(result, expected) - - @pytest.mark.parametrize("deep", [True, False]) - @pytest.mark.parametrize("fill_value", [0, 1, np.nan, None]) - def test_memory_usage_deep(self, deep, fill_value): - values = [1.0] + [fill_value] * 20 - sparse_series = SparseSeries(values, fill_value=fill_value) - dense_series = Series(values) - sparse_usage = sparse_series.memory_usage(deep=deep) - dense_usage = dense_series.memory_usage(deep=deep) - - assert sparse_usage < dense_usage - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") -class TestSparseHandlingMultiIndexes: - def setup_method(self, method): - miindex = pd.MultiIndex.from_product( - [["x", "y"], ["10", "20"]], names=["row-foo", "row-bar"] - ) - micol = pd.MultiIndex.from_product( - [["a", "b", "c"], ["1", "2"]], names=["col-foo", "col-bar"] - ) - dense_multiindex_frame = ( - pd.DataFrame(index=miindex, columns=micol).sort_index().sort_index(axis=1) - ) - self.dense_multiindex_frame = dense_multiindex_frame.fillna(value=3.14) - - def test_to_sparse_preserve_multiindex_names_columns(self): - sparse_multiindex_frame = self.dense_multiindex_frame.to_sparse() - sparse_multiindex_frame = sparse_multiindex_frame.copy() - tm.assert_index_equal( - sparse_multiindex_frame.columns, self.dense_multiindex_frame.columns - ) - - def 
test_round_trip_preserve_multiindex_names(self): - sparse_multiindex_frame = self.dense_multiindex_frame.to_sparse() - round_trip_multiindex_frame = sparse_multiindex_frame.to_dense() - tm.assert_frame_equal( - self.dense_multiindex_frame, - round_trip_multiindex_frame, - check_column_type=True, - check_names=True, - ) - - -@td.skip_if_no_scipy -@pytest.mark.filterwarnings("ignore:the matrix subclass:PendingDeprecationWarning") -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") -class TestSparseSeriesScipyInteraction: - # Issue 8048: add SparseSeries coo methods - - def setup_method(self, method): - import scipy.sparse - - # SparseSeries inputs used in tests, the tests rely on the order - self.sparse_series = [] - s = pd.Series([3.0, nan, 1.0, 2.0, nan, nan]) - s.index = pd.MultiIndex.from_tuples( - [ - (1, 2, "a", 0), - (1, 2, "a", 1), - (1, 1, "b", 0), - (1, 1, "b", 1), - (2, 1, "b", 0), - (2, 1, "b", 1), - ], - names=["A", "B", "C", "D"], - ) - self.sparse_series.append(s.to_sparse()) - - ss = self.sparse_series[0].copy() - ss.index.names = [3, 0, 1, 2] - self.sparse_series.append(ss) - - ss = pd.Series( - [nan] * 12, index=cartesian_product((range(3), range(4))) - ).to_sparse() - for k, v in zip([(0, 0), (1, 2), (1, 3)], [3.0, 1.0, 2.0]): - ss[k] = v - self.sparse_series.append(ss) - - # results used in tests - self.coo_matrices = [] - self.coo_matrices.append( - scipy.sparse.coo_matrix( - ([3.0, 1.0, 2.0], ([0, 1, 1], [0, 2, 3])), shape=(3, 4) - ) - ) - self.coo_matrices.append( - scipy.sparse.coo_matrix( - ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4) - ) - ) - self.coo_matrices.append( - scipy.sparse.coo_matrix( - ([3.0, 1.0, 2.0], ([0, 1, 1], [0, 0, 1])), shape=(3, 2) - ) - ) - self.ils = [ - [(1, 2), (1, 1), (2, 1)], - [(1, 1), (1, 2), (2, 1)], - [(1, 2, "a"), (1, 1, "b"), (2, 1, "b")], - ] - self.jls = [[("a", 0), ("a", 1), ("b", 0), ("b", 1)], [0, 1]] - - def 
test_to_coo_text_names_integer_row_levels_nosort(self): - ss = self.sparse_series[0] - kwargs = {"row_levels": [0, 1], "column_levels": [2, 3]} - result = (self.coo_matrices[0], self.ils[0], self.jls[0]) - self._run_test(ss, kwargs, result) - - def test_to_coo_text_names_integer_row_levels_sort(self): - ss = self.sparse_series[0] - kwargs = {"row_levels": [0, 1], "column_levels": [2, 3], "sort_labels": True} - result = (self.coo_matrices[1], self.ils[1], self.jls[0]) - self._run_test(ss, kwargs, result) - - def test_to_coo_text_names_text_row_levels_nosort_col_level_single(self): - ss = self.sparse_series[0] - kwargs = { - "row_levels": ["A", "B", "C"], - "column_levels": ["D"], - "sort_labels": False, - } - result = (self.coo_matrices[2], self.ils[2], self.jls[1]) - self._run_test(ss, kwargs, result) - - def test_to_coo_integer_names_integer_row_levels_nosort(self): - ss = self.sparse_series[1] - kwargs = {"row_levels": [3, 0], "column_levels": [1, 2]} - result = (self.coo_matrices[0], self.ils[0], self.jls[0]) - self._run_test(ss, kwargs, result) - - def test_to_coo_text_names_text_row_levels_nosort(self): - ss = self.sparse_series[0] - kwargs = {"row_levels": ["A", "B"], "column_levels": ["C", "D"]} - result = (self.coo_matrices[0], self.ils[0], self.jls[0]) - self._run_test(ss, kwargs, result) - - def test_to_coo_bad_partition_nonnull_intersection(self): - ss = self.sparse_series[0] - msg = "Is not a partition because intersection is not null" - with pytest.raises(ValueError, match=msg): - ss.to_coo(["A", "B", "C"], ["C", "D"]) - - def test_to_coo_bad_partition_small_union(self): - ss = self.sparse_series[0] - msg = "Is not a partition because union is not the whole" - with pytest.raises(ValueError, match=msg): - ss.to_coo(["A"], ["C", "D"]) - - def test_to_coo_nlevels_less_than_two(self): - ss = self.sparse_series[0] - ss.index = np.arange(len(ss.index)) - msg = "to_coo requires MultiIndex with nlevels > 2" - with pytest.raises(ValueError, match=msg): - 
ss.to_coo() - - def test_to_coo_bad_ilevel(self): - ss = self.sparse_series[0] - with pytest.raises(KeyError, match="Level E not found"): - ss.to_coo(["A", "B"], ["C", "D", "E"]) - - def test_to_coo_duplicate_index_entries(self): - ss = pd.concat([self.sparse_series[0], self.sparse_series[0]]).to_sparse() - msg = "Duplicate index entries are not allowed in to_coo transformation" - with pytest.raises(ValueError, match=msg): - ss.to_coo(["A", "B"], ["C", "D"]) - - def test_from_coo_dense_index(self): - ss = SparseSeries.from_coo(self.coo_matrices[0], dense_index=True) - check = self.sparse_series[2] - tm.assert_sp_series_equal(ss, check) - - def test_from_coo_nodense_index(self): - ss = SparseSeries.from_coo(self.coo_matrices[0], dense_index=False) - check = self.sparse_series[2] - check = check.dropna().to_sparse() - tm.assert_sp_series_equal(ss, check) - - def test_from_coo_long_repr(self): - # GH 13114 - # test it doesn't raise error. Formatting is tested in test_format - import scipy.sparse - - sparse = SparseSeries.from_coo(scipy.sparse.rand(350, 18)) - repr(sparse) - - def _run_test(self, ss, kwargs, check): - results = ss.to_coo(**kwargs) - self._check_results_to_coo(results, check) - # for every test, also test symmetry property (transpose), switch - # row_levels and column_levels - d = kwargs.copy() - d["row_levels"] = kwargs["column_levels"] - d["column_levels"] = kwargs["row_levels"] - results = ss.to_coo(**d) - results = (results[0].T, results[2], results[1]) - self._check_results_to_coo(results, check) - - def _check_results_to_coo(self, results, check): - (A, il, jl) = results - (A_result, il_result, jl_result) = check - # convert to dense and compare - tm.assert_numpy_array_equal(A.todense(), A_result.todense()) - # or compare directly as difference of sparse - # assert(abs(A - A_result).max() < 1e-12) # max is failing in python - # 2.6 - assert il == il_result - assert jl == jl_result - - def test_concat(self): - val1 = np.array([1, 2, np.nan, np.nan, 
0, np.nan]) - val2 = np.array([3, np.nan, 4, 0, 0]) - - for kind in ["integer", "block"]: - sparse1 = pd.SparseSeries(val1, name="x", kind=kind) - sparse2 = pd.SparseSeries(val2, name="y", kind=kind) - - res = pd.concat([sparse1, sparse2]) - exp = pd.concat([pd.Series(val1), pd.Series(val2)]) - exp = pd.SparseSeries(exp, kind=kind) - tm.assert_sp_series_equal(res, exp) - - sparse1 = pd.SparseSeries(val1, fill_value=0, name="x", kind=kind) - sparse2 = pd.SparseSeries(val2, fill_value=0, name="y", kind=kind) - - res = pd.concat([sparse1, sparse2]) - exp = pd.concat([pd.Series(val1), pd.Series(val2)]) - exp = pd.SparseSeries(exp, fill_value=0, kind=kind) - tm.assert_sp_series_equal(res, exp, consolidate_block_indices=True) - - def test_concat_axis1(self): - val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) - val2 = np.array([3, np.nan, 4, 0, 0]) - - sparse1 = pd.SparseSeries(val1, name="x") - sparse2 = pd.SparseSeries(val2, name="y") - - res = pd.concat([sparse1, sparse2], axis=1) - exp = pd.concat([pd.Series(val1, name="x"), pd.Series(val2, name="y")], axis=1) - exp = pd.SparseDataFrame(exp) - tm.assert_sp_frame_equal(res, exp) - - def test_concat_different_fill(self): - val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) - val2 = np.array([3, np.nan, 4, 0, 0]) - - for kind in ["integer", "block"]: - sparse1 = pd.SparseSeries(val1, name="x", kind=kind) - sparse2 = pd.SparseSeries(val2, name="y", kind=kind, fill_value=0) - - with tm.assert_produces_warning( - PerformanceWarning, raise_on_extra_warnings=False - ): - res = pd.concat([sparse1, sparse2]) - exp = pd.concat([pd.Series(val1), pd.Series(val2)]) - exp = pd.SparseSeries(exp, kind=kind) - tm.assert_sp_series_equal(res, exp) - - with tm.assert_produces_warning( - PerformanceWarning, raise_on_extra_warnings=False - ): - res = pd.concat([sparse2, sparse1]) - exp = pd.concat([pd.Series(val2), pd.Series(val1)]) - exp = pd.SparseSeries(exp, kind=kind, fill_value=0) - tm.assert_sp_series_equal(res, exp) - - def 
test_concat_axis1_different_fill(self): - val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) - val2 = np.array([3, np.nan, 4, 0, 0]) - - sparse1 = pd.SparseSeries(val1, name="x") - sparse2 = pd.SparseSeries(val2, name="y", fill_value=0) - - res = pd.concat([sparse1, sparse2], axis=1) - exp = pd.concat([pd.Series(val1, name="x"), pd.Series(val2, name="y")], axis=1) - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res.to_dense(), exp) - - def test_concat_different_kind(self): - val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) - val2 = np.array([3, np.nan, 4, 0, 0]) - - sparse1 = pd.SparseSeries(val1, name="x", kind="integer") - sparse2 = pd.SparseSeries(val2, name="y", kind="block", fill_value=0) - - with tm.assert_produces_warning( - PerformanceWarning, raise_on_extra_warnings=False - ): - res = pd.concat([sparse1, sparse2]) - exp = pd.concat([pd.Series(val1), pd.Series(val2)]) - exp = pd.SparseSeries(exp, kind="integer") - tm.assert_sp_series_equal(res, exp) - - with tm.assert_produces_warning( - PerformanceWarning, raise_on_extra_warnings=False - ): - res = pd.concat([sparse2, sparse1]) - exp = pd.concat([pd.Series(val2), pd.Series(val1)]) - exp = pd.SparseSeries(exp, kind="block", fill_value=0) - tm.assert_sp_series_equal(res, exp) - - def test_concat_sparse_dense(self): - # use first input's fill_value - val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) - val2 = np.array([3, np.nan, 4, 0, 0]) - - for kind in ["integer", "block"]: - sparse = pd.SparseSeries(val1, name="x", kind=kind) - dense = pd.Series(val2, name="y") - - res = pd.concat([sparse, dense]) - exp = pd.concat([pd.Series(val1), dense]) - exp = pd.SparseSeries(exp, kind=kind) - tm.assert_sp_series_equal(res, exp) - - res = pd.concat([dense, sparse, dense]) - exp = pd.concat([dense, pd.Series(val1), dense]) - exp = exp.astype("Sparse") - tm.assert_series_equal(res, exp) - - sparse = pd.SparseSeries(val1, name="x", kind=kind, fill_value=0) - dense = pd.Series(val2, name="y") - - 
res = pd.concat([sparse, dense]) - exp = pd.concat([pd.Series(val1), dense]) - exp = exp.astype(SparseDtype(exp.dtype, 0)) - tm.assert_series_equal(res, exp) - - res = pd.concat([dense, sparse, dense]) - exp = pd.concat([dense, pd.Series(val1), dense]) - exp = exp.astype(SparseDtype(exp.dtype, 0)) - tm.assert_series_equal(res, exp) - - def test_value_counts(self): - vals = [1, 2, nan, 0, nan, 1, 2, nan, nan, 1, 2, 0, 1, 1] - dense = pd.Series(vals, name="xx") - - sparse = pd.SparseSeries(vals, name="xx") - tm.assert_series_equal(sparse.value_counts(), dense.value_counts()) - tm.assert_series_equal( - sparse.value_counts(dropna=False), dense.value_counts(dropna=False) - ) - - sparse = pd.SparseSeries(vals, name="xx", fill_value=0) - tm.assert_series_equal(sparse.value_counts(), dense.value_counts()) - tm.assert_series_equal( - sparse.value_counts(dropna=False), dense.value_counts(dropna=False) - ) - - def test_value_counts_dup(self): - vals = [1, 2, nan, 0, nan, 1, 2, nan, nan, 1, 2, 0, 1, 1] - - # numeric op may cause sp_values to include the same value as - # fill_value - dense = pd.Series(vals, name="xx") / 0.0 - sparse = pd.SparseSeries(vals, name="xx") / 0.0 - tm.assert_series_equal(sparse.value_counts(), dense.value_counts()) - tm.assert_series_equal( - sparse.value_counts(dropna=False), dense.value_counts(dropna=False) - ) - - vals = [1, 2, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 1] - - dense = pd.Series(vals, name="xx") * 0.0 - sparse = pd.SparseSeries(vals, name="xx") * 0.0 - tm.assert_series_equal(sparse.value_counts(), dense.value_counts()) - tm.assert_series_equal( - sparse.value_counts(dropna=False), dense.value_counts(dropna=False) - ) - - def test_value_counts_int(self): - vals = [1, 2, 0, 1, 2, 1, 2, 0, 1, 1] - dense = pd.Series(vals, name="xx") - - # fill_value is np.nan, but should not be included in the result - sparse = pd.SparseSeries(vals, name="xx") - tm.assert_series_equal(sparse.value_counts(), dense.value_counts()) - tm.assert_series_equal( - 
sparse.value_counts(dropna=False), dense.value_counts(dropna=False) - ) - - sparse = pd.SparseSeries(vals, name="xx", fill_value=0) - tm.assert_series_equal(sparse.value_counts(), dense.value_counts()) - tm.assert_series_equal( - sparse.value_counts(dropna=False), dense.value_counts(dropna=False) - ) - - def test_isna(self): - # GH 8276 - s = pd.SparseSeries([np.nan, np.nan, 1, 2, np.nan], name="xxx") - - res = s.isna() - exp = pd.SparseSeries( - [True, True, False, False, True], name="xxx", fill_value=True - ) - tm.assert_sp_series_equal(res, exp) - - # if fill_value is not nan, True can be included in sp_values - s = pd.SparseSeries([np.nan, 0.0, 1.0, 2.0, 0.0], name="xxx", fill_value=0.0) - res = s.isna() - assert isinstance(res, pd.SparseSeries) - exp = pd.Series([True, False, False, False, False], name="xxx") - tm.assert_series_equal(res.to_dense(), exp) - - def test_notna(self): - # GH 8276 - s = pd.SparseSeries([np.nan, np.nan, 1, 2, np.nan], name="xxx") - - res = s.notna() - exp = pd.SparseSeries( - [False, False, True, True, False], name="xxx", fill_value=False - ) - tm.assert_sp_series_equal(res, exp) - - # if fill_value is not nan, True can be included in sp_values - s = pd.SparseSeries([np.nan, 0.0, 1.0, 2.0, 0.0], name="xxx", fill_value=0.0) - res = s.notna() - assert isinstance(res, pd.SparseSeries) - exp = pd.Series([False, True, True, True, True], name="xxx") - tm.assert_series_equal(res.to_dense(), exp) - - -def _dense_series_compare(s, f): - result = f(s) - assert isinstance(result, SparseSeries) - dense_result = f(s.to_dense()) - tm.assert_series_equal(result.to_dense(), dense_result) - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") -class TestSparseSeriesAnalytics: - def setup_method(self, method): - arr, index = _test_data1() - self.bseries = SparseSeries(arr, index=index, kind="block", name="bseries") - - arr, index = _test_data1_zero() - self.zbseries = 
SparseSeries( - arr, index=index, kind="block", fill_value=0, name="zbseries" - ) - - def test_cumsum(self): - result = self.bseries.cumsum() - expected = SparseSeries(self.bseries.to_dense().cumsum()) - tm.assert_sp_series_equal(result, expected) - - result = self.zbseries.cumsum() - expected = self.zbseries.to_dense().cumsum().to_sparse() - tm.assert_series_equal(result, expected) - - axis = 1 # Series is 1-D, so only axis = 0 is valid. - msg = "No axis named {axis}".format(axis=axis) - with pytest.raises(ValueError, match=msg): - self.bseries.cumsum(axis=axis) - - def test_numpy_cumsum(self): - result = np.cumsum(self.bseries) - expected = SparseSeries(self.bseries.to_dense().cumsum()) - tm.assert_sp_series_equal(result, expected) - - result = np.cumsum(self.zbseries) - expected = self.zbseries.to_dense().cumsum().to_sparse() - tm.assert_series_equal(result, expected) - - msg = "the 'dtype' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.cumsum(self.bseries, dtype=np.int64) - - msg = "the 'out' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.cumsum(self.zbseries, out=result) - - def test_numpy_func_call(self): - # no exception should be raised even though - # numpy passes in 'axis=None' or `axis=-1' - funcs = [ - "sum", - "cumsum", - "var", - "mean", - "prod", - "cumprod", - "std", - "argsort", - "min", - "max", - ] - for func in funcs: - for series in ("bseries", "zbseries"): - getattr(np, func)(getattr(self, series)) - - def test_deprecated_numpy_func_call(self): - # NOTE: These should be add to the 'test_numpy_func_call' test above - # once the behavior of argmin/argmax is corrected. 
- funcs = ["argmin", "argmax"] - for func in funcs: - for series in ("bseries", "zbseries"): - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False - ): - getattr(np, func)(getattr(self, series)) - - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False - ): - getattr(getattr(self, series), func)() - - -@pytest.mark.parametrize( - "datetime_type", - (np.datetime64, pd.Timestamp, lambda x: datetime.strptime(x, "%Y-%m-%d")), -) -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -def test_constructor_dict_datetime64_index(datetime_type): - # GH 9456 - dates = ["1984-02-19", "1988-11-06", "1989-12-03", "1990-03-15"] - values = [42544017.198965244, 1234565, 40512335.181958228, -1] - - result = SparseSeries(dict(zip(map(datetime_type, dates), values))) - expected = SparseSeries(values, map(pd.Timestamp, dates)) - - tm.assert_sp_series_equal(result, expected) - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") -def test_to_sparse(): - # https://github.com/pandas-dev/pandas/issues/22389 - arr = pd.SparseArray([1, 2, None, 3]) - result = pd.Series(arr).to_sparse() - assert len(result) == 4 - tm.assert_sp_array_equal(result.values, arr, check_kind=False) - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -def test_deprecated_to_sparse(): - # GH 26557 - # Deprecated 0.25.0 - - ser = Series([1, np.nan, 3]) - sparse_ser = pd.SparseSeries([1, np.nan, 3]) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ser.to_sparse() - tm.assert_series_equal(result, sparse_ser) - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -def test_constructor_mismatched_raises(): - msg = "Length of passed values is 2, index implies 3" - with pytest.raises(ValueError, match=msg): - SparseSeries([1, 2], index=[1, 2, 3]) - - 
-@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -def test_block_deprecated(): - s = SparseSeries([1]) - with tm.assert_produces_warning(FutureWarning): - s.block diff --git a/pandas/tests/sparse/test_combine_concat.py b/pandas/tests/sparse/test_combine_concat.py index c553cd3fd1a7a..4ad1aa60e7b4f 100644 --- a/pandas/tests/sparse/test_combine_concat.py +++ b/pandas/tests/sparse/test_combine_concat.py @@ -1,10 +1,6 @@ -import itertools - import numpy as np import pytest -from pandas.errors import PerformanceWarning - import pandas as pd import pandas.util.testing as tm @@ -33,442 +29,3 @@ def test_uses_first_kind(self, kind): expected = np.array([1, 2, 1, 2, 2], dtype="int64") tm.assert_numpy_array_equal(result.sp_values, expected) assert result.kind == kind - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -class TestSparseSeriesConcat: - @pytest.mark.parametrize("kind", ["integer", "block"]) - def test_concat(self, kind): - val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) - val2 = np.array([3, np.nan, 4, 0, 0]) - - sparse1 = pd.SparseSeries(val1, name="x", kind=kind) - sparse2 = pd.SparseSeries(val2, name="y", kind=kind) - - res = pd.concat([sparse1, sparse2]) - exp = pd.concat([pd.Series(val1), pd.Series(val2)]) - exp = pd.SparseSeries(exp, kind=kind) - tm.assert_sp_series_equal(res, exp, consolidate_block_indices=True) - - sparse1 = pd.SparseSeries(val1, fill_value=0, name="x", kind=kind) - sparse2 = pd.SparseSeries(val2, fill_value=0, name="y", kind=kind) - - res = pd.concat([sparse1, sparse2]) - exp = pd.concat([pd.Series(val1), pd.Series(val2)]) - exp = pd.SparseSeries(exp, fill_value=0, kind=kind) - tm.assert_sp_series_equal(res, exp, consolidate_block_indices=True) - - def test_concat_axis1(self): - val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) - val2 = np.array([3, np.nan, 4, 0, 0]) - - sparse1 = pd.SparseSeries(val1, name="x") - sparse2 = pd.SparseSeries(val2, name="y") - - res = pd.concat([sparse1, sparse2], axis=1) - exp 
= pd.concat([pd.Series(val1, name="x"), pd.Series(val2, name="y")], axis=1) - exp = pd.SparseDataFrame(exp) - tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True) - - def test_concat_different_fill(self): - val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) - val2 = np.array([3, np.nan, 4, 0, 0]) - - for kind in ["integer", "block"]: - sparse1 = pd.SparseSeries(val1, name="x", kind=kind) - sparse2 = pd.SparseSeries(val2, name="y", kind=kind, fill_value=0) - - with tm.assert_produces_warning( - PerformanceWarning, raise_on_extra_warnings=False - ): - res = pd.concat([sparse1, sparse2]) - - exp = pd.concat([pd.Series(val1), pd.Series(val2)]) - exp = pd.SparseSeries(exp, kind=kind) - tm.assert_sp_series_equal(res, exp) - - with tm.assert_produces_warning( - PerformanceWarning, raise_on_extra_warnings=False - ): - res = pd.concat([sparse2, sparse1]) - - exp = pd.concat([pd.Series(val2), pd.Series(val1)]) - exp = pd.SparseSeries(exp, kind=kind, fill_value=0) - tm.assert_sp_series_equal(res, exp) - - def test_concat_axis1_different_fill(self): - val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) - val2 = np.array([3, np.nan, 4, 0, 0]) - - sparse1 = pd.SparseSeries(val1, name="x") - sparse2 = pd.SparseSeries(val2, name="y", fill_value=0) - - res = pd.concat([sparse1, sparse2], axis=1) - exp = pd.concat([pd.Series(val1, name="x"), pd.Series(val2, name="y")], axis=1) - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res.to_dense(), exp) - - def test_concat_different_kind(self): - val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) - val2 = np.array([3, np.nan, 4, 0, 0]) - - sparse1 = pd.SparseSeries(val1, name="x", kind="integer") - sparse2 = pd.SparseSeries(val2, name="y", kind="block") - - res = pd.concat([sparse1, sparse2]) - exp = pd.concat([pd.Series(val1), pd.Series(val2)]) - exp = pd.SparseSeries(exp, kind=sparse1.kind) - tm.assert_sp_series_equal(res, exp) - - res = pd.concat([sparse2, sparse1]) - exp = pd.concat([pd.Series(val2), 
pd.Series(val1)]) - exp = pd.SparseSeries(exp, kind=sparse2.kind) - tm.assert_sp_series_equal(res, exp, consolidate_block_indices=True) - - @pytest.mark.parametrize("kind", ["integer", "block"]) - def test_concat_sparse_dense(self, kind): - # use first input's fill_value - val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) - val2 = np.array([3, np.nan, 4, 0, 0]) - - sparse = pd.SparseSeries(val1, name="x", kind=kind) - dense = pd.Series(val2, name="y") - - res = pd.concat([sparse, dense]) - exp = pd.SparseSeries(pd.concat([pd.Series(val1), dense]), kind=kind) - tm.assert_sp_series_equal(res, exp) - - res = pd.concat([dense, sparse, dense]) - exp = pd.concat([dense, pd.Series(val1), dense]) - # XXX: changed from SparseSeries to Series[sparse] - exp = pd.Series(pd.SparseArray(exp, kind=kind), index=exp.index, name=exp.name) - tm.assert_series_equal(res, exp) - - sparse = pd.SparseSeries(val1, name="x", kind=kind, fill_value=0) - dense = pd.Series(val2, name="y") - - res = pd.concat([sparse, dense]) - # XXX: changed from SparseSeries to Series[sparse] - exp = pd.concat([pd.Series(val1), dense]) - exp = pd.Series( - pd.SparseArray(exp, kind=kind, fill_value=0), index=exp.index, name=exp.name - ) - tm.assert_series_equal(res, exp) - - res = pd.concat([dense, sparse, dense]) - exp = pd.concat([dense, pd.Series(val1), dense]) - # XXX: changed from SparseSeries to Series[sparse] - exp = pd.Series( - pd.SparseArray(exp, kind=kind, fill_value=0), index=exp.index, name=exp.name - ) - tm.assert_series_equal(res, exp) - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") -class TestSparseDataFrameConcat: - def setup_method(self, method): - - self.dense1 = pd.DataFrame( - { - "A": [0.0, 1.0, 2.0, np.nan], - "B": [0.0, 0.0, 0.0, 0.0], - "C": [np.nan, np.nan, np.nan, np.nan], - "D": [1.0, 2.0, 3.0, 4.0], - } - ) - - self.dense2 = pd.DataFrame( - { - "A": [5.0, 6.0, 7.0, 8.0], - "B": [np.nan, 0.0, 
7.0, 8.0], - "C": [5.0, 6.0, np.nan, np.nan], - "D": [np.nan, np.nan, np.nan, np.nan], - } - ) - - self.dense3 = pd.DataFrame( - { - "E": [5.0, 6.0, 7.0, 8.0], - "F": [np.nan, 0.0, 7.0, 8.0], - "G": [5.0, 6.0, np.nan, np.nan], - "H": [np.nan, np.nan, np.nan, np.nan], - } - ) - - def test_concat(self): - # fill_value = np.nan - sparse = self.dense1.to_sparse() - sparse2 = self.dense2.to_sparse() - - res = pd.concat([sparse, sparse]) - exp = pd.concat([self.dense1, self.dense1]).to_sparse() - tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True) - - res = pd.concat([sparse2, sparse2]) - exp = pd.concat([self.dense2, self.dense2]).to_sparse() - tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True) - - res = pd.concat([sparse, sparse2]) - exp = pd.concat([self.dense1, self.dense2]).to_sparse() - tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True) - - res = pd.concat([sparse2, sparse]) - exp = pd.concat([self.dense2, self.dense1]).to_sparse() - tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True) - - # fill_value = 0 - sparse = self.dense1.to_sparse(fill_value=0) - sparse2 = self.dense2.to_sparse(fill_value=0) - - res = pd.concat([sparse, sparse]) - exp = pd.concat([self.dense1, self.dense1]).to_sparse(fill_value=0) - exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True) - - res = pd.concat([sparse2, sparse2]) - exp = pd.concat([self.dense2, self.dense2]).to_sparse(fill_value=0) - exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True) - - res = pd.concat([sparse, sparse2]) - exp = pd.concat([self.dense1, self.dense2]).to_sparse(fill_value=0) - exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True) - - res = pd.concat([sparse2, sparse]) - exp = pd.concat([self.dense2, self.dense1]).to_sparse(fill_value=0) - exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(res, exp, 
consolidate_block_indices=True) - - def test_concat_different_fill_value(self): - # 1st fill_value will be used - sparse = self.dense1.to_sparse() - sparse2 = self.dense2.to_sparse(fill_value=0) - - with tm.assert_produces_warning( - PerformanceWarning, raise_on_extra_warnings=False - ): - res = pd.concat([sparse, sparse2]) - exp = pd.concat([self.dense1, self.dense2]).to_sparse() - tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True) - - with tm.assert_produces_warning( - PerformanceWarning, raise_on_extra_warnings=False - ): - res = pd.concat([sparse2, sparse]) - exp = pd.concat([self.dense2, self.dense1]).to_sparse(fill_value=0) - exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True) - - def test_concat_different_columns_sort_warns(self): - sparse = self.dense1.to_sparse() - sparse3 = self.dense3.to_sparse() - - # stacklevel is wrong since we have two FutureWarnings, - # one for depr, one for sorting. - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False - ): - res = pd.concat([sparse, sparse3]) - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False - ): - exp = pd.concat([self.dense1, self.dense3]) - - exp = exp.to_sparse() - tm.assert_sp_frame_equal(res, exp, check_kind=False) - - def test_concat_different_columns(self): - # fill_value = np.nan - sparse = self.dense1.to_sparse() - sparse3 = self.dense3.to_sparse() - - res = pd.concat([sparse, sparse3], sort=True) - exp = pd.concat([self.dense1, self.dense3], sort=True).to_sparse() - tm.assert_sp_frame_equal(res, exp, check_kind=False) - - res = pd.concat([sparse3, sparse], sort=True) - exp = pd.concat([self.dense3, self.dense1], sort=True).to_sparse() - exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(res, exp, check_kind=False) - - def test_concat_bug(self): - from pandas.core.sparse.api import SparseDtype - - x = 
pd.SparseDataFrame({"A": pd.SparseArray([np.nan, np.nan], fill_value=0)}) - y = pd.SparseDataFrame({"B": []}) - res = pd.concat([x, y], sort=False)[["A"]] - exp = pd.DataFrame( - {"A": pd.SparseArray([np.nan, np.nan], dtype=SparseDtype(float, 0))} - ) - tm.assert_frame_equal(res, exp) - - def test_concat_different_columns_buggy(self): - sparse = self.dense1.to_sparse(fill_value=0) - sparse3 = self.dense3.to_sparse(fill_value=0) - - res = pd.concat([sparse, sparse3], sort=True) - exp = pd.concat([self.dense1, self.dense3], sort=True).to_sparse(fill_value=0) - exp._default_fill_value = np.nan - - tm.assert_sp_frame_equal( - res, exp, check_kind=False, consolidate_block_indices=True - ) - - res = pd.concat([sparse3, sparse], sort=True) - exp = pd.concat([self.dense3, self.dense1], sort=True).to_sparse(fill_value=0) - exp._default_fill_value = np.nan - tm.assert_sp_frame_equal( - res, exp, check_kind=False, consolidate_block_indices=True - ) - - # different fill values - sparse = self.dense1.to_sparse() - sparse3 = self.dense3.to_sparse(fill_value=0) - # each columns keeps its fill_value, thus compare in dense - res = pd.concat([sparse, sparse3], sort=True) - exp = pd.concat([self.dense1, self.dense3], sort=True) - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res.to_dense(), exp) - - res = pd.concat([sparse3, sparse], sort=True) - exp = pd.concat([self.dense3, self.dense1], sort=True) - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res.to_dense(), exp) - - def test_concat_series(self): - # fill_value = np.nan - sparse = self.dense1.to_sparse() - sparse2 = self.dense2.to_sparse() - - for col in ["A", "D"]: - res = pd.concat([sparse, sparse2[col]]) - exp = pd.concat([self.dense1, self.dense2[col]]).to_sparse() - tm.assert_sp_frame_equal(res, exp, check_kind=False) - - res = pd.concat([sparse2[col], sparse]) - exp = pd.concat([self.dense2[col], self.dense1]).to_sparse() - tm.assert_sp_frame_equal(res, exp, check_kind=False) - - # 
fill_value = 0 - sparse = self.dense1.to_sparse(fill_value=0) - sparse2 = self.dense2.to_sparse(fill_value=0) - - for col in ["C", "D"]: - res = pd.concat([sparse, sparse2[col]]) - exp = pd.concat([self.dense1, self.dense2[col]]).to_sparse(fill_value=0) - exp._default_fill_value = np.nan - tm.assert_sp_frame_equal( - res, exp, check_kind=False, consolidate_block_indices=True - ) - - res = pd.concat([sparse2[col], sparse]) - exp = pd.concat([self.dense2[col], self.dense1]).to_sparse(fill_value=0) - exp["C"] = res["C"] - exp._default_fill_value = np.nan - tm.assert_sp_frame_equal( - res, exp, consolidate_block_indices=True, check_kind=False - ) - - def test_concat_axis1(self): - # fill_value = np.nan - sparse = self.dense1.to_sparse() - sparse3 = self.dense3.to_sparse() - - res = pd.concat([sparse, sparse3], axis=1) - exp = pd.concat([self.dense1, self.dense3], axis=1).to_sparse() - tm.assert_sp_frame_equal(res, exp) - - res = pd.concat([sparse3, sparse], axis=1) - exp = pd.concat([self.dense3, self.dense1], axis=1).to_sparse() - exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(res, exp) - - # fill_value = 0 - sparse = self.dense1.to_sparse(fill_value=0) - sparse3 = self.dense3.to_sparse(fill_value=0) - - res = pd.concat([sparse, sparse3], axis=1) - exp = pd.concat([self.dense1, self.dense3], axis=1).to_sparse(fill_value=0) - exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(res, exp) - - res = pd.concat([sparse3, sparse], axis=1) - exp = pd.concat([self.dense3, self.dense1], axis=1).to_sparse(fill_value=0) - exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(res, exp) - - # different fill values - sparse = self.dense1.to_sparse() - sparse3 = self.dense3.to_sparse(fill_value=0) - # each columns keeps its fill_value, thus compare in dense - res = pd.concat([sparse, sparse3], axis=1) - exp = pd.concat([self.dense1, self.dense3], axis=1) - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res.to_dense(), exp) - - res = 
pd.concat([sparse3, sparse], axis=1) - exp = pd.concat([self.dense3, self.dense1], axis=1) - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res.to_dense(), exp) - - @pytest.mark.parametrize( - "fill_value,sparse_idx,dense_idx", - itertools.product([None, 0, 1, np.nan], [0, 1], [1, 0]), - ) - def test_concat_sparse_dense_rows(self, fill_value, sparse_idx, dense_idx): - frames = [self.dense1, self.dense2] - sparse_frame = [ - frames[dense_idx], - frames[sparse_idx].to_sparse(fill_value=fill_value), - ] - dense_frame = [frames[dense_idx], frames[sparse_idx]] - - # This will try both directions sparse + dense and dense + sparse - for _ in range(2): - res = pd.concat(sparse_frame) - exp = pd.concat(dense_frame) - - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res.to_dense(), exp) - - sparse_frame = sparse_frame[::-1] - dense_frame = dense_frame[::-1] - - @pytest.mark.parametrize( - "fill_value,sparse_idx,dense_idx", - itertools.product([None, 0, 1, np.nan], [0, 1], [1, 0]), - ) - @pytest.mark.xfail(reason="The iloc fails and I can't make expected", strict=True) - def test_concat_sparse_dense_cols(self, fill_value, sparse_idx, dense_idx): - # See GH16874, GH18914 and #18686 for why this should be a DataFrame - from pandas.core.dtypes.common import is_sparse - - frames = [self.dense1, self.dense3] - - sparse_frame = [ - frames[dense_idx], - frames[sparse_idx].to_sparse(fill_value=fill_value), - ] - dense_frame = [frames[dense_idx], frames[sparse_idx]] - - # This will try both directions sparse + dense and dense + sparse - for _ in range(2): - res = pd.concat(sparse_frame, axis=1) - exp = pd.concat(dense_frame, axis=1) - cols = [i for (i, x) in enumerate(res.dtypes) if is_sparse(x)] - - for col in cols: - exp.iloc[:, col] = exp.iloc[:, col].astype("Sparse") - - for column in frames[dense_idx].columns: - if dense_idx == sparse_idx: - tm.assert_frame_equal(res[column], exp[column]) - else: - tm.assert_series_equal(res[column], 
exp[column]) - - tm.assert_frame_equal(res, exp) - - sparse_frame = sparse_frame[::-1] - dense_frame = dense_frame[::-1] diff --git a/pandas/tests/sparse/test_format.py b/pandas/tests/sparse/test_format.py deleted file mode 100644 index cf8734910cd19..0000000000000 --- a/pandas/tests/sparse/test_format.py +++ /dev/null @@ -1,165 +0,0 @@ -import warnings - -import numpy as np -import pytest - -from pandas.compat import is_platform_32bit, is_platform_windows - -import pandas as pd -from pandas import option_context -import pandas.util.testing as tm - -use_32bit_repr = is_platform_windows() or is_platform_32bit() - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") -class TestSparseSeriesFormatting: - @property - def dtype_format_for_platform(self): - return "" if use_32bit_repr else ", dtype=int32" - - def test_sparse_max_row(self): - s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse() - result = repr(s) - dfm = self.dtype_format_for_platform - exp = ( - "0 1.0\n1 NaN\n2 NaN\n3 3.0\n" - "4 NaN\ndtype: Sparse[float64, nan]\nBlockIndex\n" - "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dfm) - ) - assert result == exp - - def test_sparsea_max_row_truncated(self): - s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse() - dfm = self.dtype_format_for_platform - - with option_context("display.max_rows", 3): - # GH 10560 - result = repr(s) - exp = ( - "0 1.0\n ... 
\n4 NaN\n" - "Length: 5, dtype: Sparse[float64, nan]\nBlockIndex\n" - "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dfm) - ) - assert result == exp - - def test_sparse_mi_max_row(self): - idx = pd.MultiIndex.from_tuples( - [("A", 0), ("A", 1), ("B", 0), ("C", 0), ("C", 1), ("C", 2)] - ) - s = pd.Series([1, np.nan, np.nan, 3, np.nan, np.nan], index=idx).to_sparse() - result = repr(s) - dfm = self.dtype_format_for_platform - exp = ( - "A 0 1.0\n 1 NaN\nB 0 NaN\n" - "C 0 3.0\n 1 NaN\n 2 NaN\n" - "dtype: Sparse[float64, nan]\nBlockIndex\n" - "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dfm) - ) - assert result == exp - - with option_context("display.max_rows", 3, "display.show_dimensions", False): - # GH 13144 - result = repr(s) - exp = ( - "A 0 1.0\n ... \nC 2 NaN\n" - "dtype: Sparse[float64, nan]\nBlockIndex\n" - "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dfm) - ) - assert result == exp - - def test_sparse_bool(self): - # GH 13110 - s = pd.SparseSeries([True, False, False, True, False, False], fill_value=False) - result = repr(s) - dtype = "" if use_32bit_repr else ", dtype=int32" - exp = ( - "0 True\n1 False\n2 False\n" - "3 True\n4 False\n5 False\n" - "dtype: Sparse[bool, False]\nBlockIndex\n" - "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dtype) - ) - assert result == exp - - with option_context("display.max_rows", 3): - result = repr(s) - exp = ( - "0 True\n ... 
\n5 False\n" - "Length: 6, dtype: Sparse[bool, False]\nBlockIndex\n" - "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dtype) - ) - assert result == exp - - def test_sparse_int(self): - # GH 13110 - s = pd.SparseSeries([0, 1, 0, 0, 1, 0], fill_value=False) - - result = repr(s) - dtype = "" if use_32bit_repr else ", dtype=int32" - exp = ( - "0 0\n1 1\n2 0\n3 0\n4 1\n" - "5 0\ndtype: Sparse[int64, False]\nBlockIndex\n" - "Block locations: array([1, 4]{0})\n" - "Block lengths: array([1, 1]{0})".format(dtype) - ) - assert result == exp - - with option_context("display.max_rows", 3, "display.show_dimensions", False): - result = repr(s) - exp = ( - "0 0\n ..\n5 0\n" - "dtype: Sparse[int64, False]\nBlockIndex\n" - "Block locations: array([1, 4]{0})\n" - "Block lengths: array([1, 1]{0})".format(dtype) - ) - assert result == exp - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") -class TestSparseDataFrameFormatting: - def test_sparse_frame(self): - # GH 13110 - df = pd.DataFrame( - { - "A": [True, False, True, False, True], - "B": [True, False, True, False, True], - "C": [0, 0, 3, 0, 5], - "D": [np.nan, np.nan, np.nan, 1, 2], - } - ) - sparse = df.to_sparse() - assert repr(sparse) == repr(df) - - with option_context("display.max_rows", 3): - assert repr(sparse) == repr(df) - - def test_sparse_repr_after_set(self): - # GH 15488 - sdf = pd.SparseDataFrame([[np.nan, 1], [2, np.nan]]) - res = sdf.copy() - - # Ignore the warning - with pd.option_context("mode.chained_assignment", None): - sdf[0][1] = 2 # This line triggers the bug - - repr(sdf) - tm.assert_sp_frame_equal(sdf, res) - - -def test_repr_no_warning(): - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - df = pd.SparseDataFrame({"A": [1, 2]}) - s = df["A"] - - with tm.assert_produces_warning(None): - repr(df) - repr(s) diff --git a/pandas/tests/sparse/test_groupby.py 
b/pandas/tests/sparse/test_groupby.py deleted file mode 100644 index 04e49a272a77a..0000000000000 --- a/pandas/tests/sparse/test_groupby.py +++ /dev/null @@ -1,73 +0,0 @@ -import numpy as np -import pytest - -import pandas as pd -import pandas.util.testing as tm - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") -class TestSparseGroupBy: - def setup_method(self, method): - self.dense = pd.DataFrame( - { - "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], - "B": ["one", "one", "two", "three", "two", "two", "one", "three"], - "C": np.random.randn(8), - "D": np.random.randn(8), - "E": [np.nan, np.nan, 1, 2, np.nan, 1, np.nan, np.nan], - } - ) - self.sparse = self.dense.to_sparse() - - def test_first_last_nth(self): - # tests for first / last / nth - sparse_grouped = self.sparse.groupby("A") - dense_grouped = self.dense.groupby("A") - - sparse_grouped_first = sparse_grouped.first() - sparse_grouped_last = sparse_grouped.last() - sparse_grouped_nth = sparse_grouped.nth(1) - - dense_grouped_first = pd.DataFrame(dense_grouped.first().to_sparse()) - dense_grouped_last = pd.DataFrame(dense_grouped.last().to_sparse()) - dense_grouped_nth = pd.DataFrame(dense_grouped.nth(1).to_sparse()) - - tm.assert_frame_equal(sparse_grouped_first, dense_grouped_first) - tm.assert_frame_equal(sparse_grouped_last, dense_grouped_last) - tm.assert_frame_equal(sparse_grouped_nth, dense_grouped_nth) - - def test_aggfuncs(self): - sparse_grouped = self.sparse.groupby("A") - dense_grouped = self.dense.groupby("A") - - result = sparse_grouped.mean().to_sparse() - expected = dense_grouped.mean().to_sparse() - - tm.assert_frame_equal(result, expected) - - # ToDo: sparse sum includes str column - # tm.assert_frame_equal(sparse_grouped.sum(), - # dense_grouped.sum()) - - result = sparse_grouped.count().to_sparse() - expected = dense_grouped.count().to_sparse() - - tm.assert_frame_equal(result, expected) - 
- -@pytest.mark.parametrize("fill_value", [0, np.nan]) -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") -def test_groupby_includes_fill_value(fill_value): - # https://github.com/pandas-dev/pandas/issues/5078 - df = pd.DataFrame( - { - "a": [fill_value, 1, fill_value, fill_value], - "b": [fill_value, 1, fill_value, fill_value], - } - ) - sdf = df.to_sparse(fill_value=fill_value) - result = sdf.groupby("a").sum() - expected = pd.DataFrame(df.groupby("a").sum().to_sparse(fill_value=fill_value)) - tm.assert_frame_equal(result, expected, check_index_type=False) diff --git a/pandas/tests/sparse/test_indexing.py b/pandas/tests/sparse/test_indexing.py deleted file mode 100644 index ea5e939b57566..0000000000000 --- a/pandas/tests/sparse/test_indexing.py +++ /dev/null @@ -1,1058 +0,0 @@ -import numpy as np -import pytest - -import pandas as pd -from pandas.core.sparse.api import SparseDtype -import pandas.util.testing as tm - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") -class TestSparseSeriesIndexing: - def setup_method(self, method): - self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - self.sparse = self.orig.to_sparse() - - def test_getitem(self): - orig = self.orig - sparse = self.sparse - - assert sparse[0] == 1 - assert np.isnan(sparse[1]) - assert sparse[3] == 3 - - result = sparse[[1, 3, 4]] - exp = orig[[1, 3, 4]].to_sparse() - tm.assert_sp_series_equal(result, exp) - - # dense array - result = sparse[orig % 2 == 1] - exp = orig[orig % 2 == 1].to_sparse() - tm.assert_sp_series_equal(result, exp) - - # sparse array (actuary it coerces to normal Series) - result = sparse[sparse % 2 == 1] - exp = orig[orig % 2 == 1].to_sparse() - tm.assert_sp_series_equal(result, exp) - - # sparse array - result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)] - tm.assert_sp_series_equal(result, exp) - - def 
test_getitem_slice(self): - orig = self.orig - sparse = self.sparse - - tm.assert_sp_series_equal(sparse[:2], orig[:2].to_sparse()) - tm.assert_sp_series_equal(sparse[4:2], orig[4:2].to_sparse()) - tm.assert_sp_series_equal(sparse[::2], orig[::2].to_sparse()) - tm.assert_sp_series_equal(sparse[-5:], orig[-5:].to_sparse()) - - def test_getitem_int_dtype(self): - # GH 8292 - s = pd.SparseSeries([0, 1, 2, 3, 4, 5, 6], name="xxx") - res = s[::2] - exp = pd.SparseSeries([0, 2, 4, 6], index=[0, 2, 4, 6], name="xxx") - tm.assert_sp_series_equal(res, exp) - assert res.dtype == SparseDtype(np.int64) - - s = pd.SparseSeries([0, 1, 2, 3, 4, 5, 6], fill_value=0, name="xxx") - res = s[::2] - exp = pd.SparseSeries( - [0, 2, 4, 6], index=[0, 2, 4, 6], fill_value=0, name="xxx" - ) - tm.assert_sp_series_equal(res, exp) - assert res.dtype == SparseDtype(np.int64) - - def test_getitem_fill_value(self): - orig = pd.Series([1, np.nan, 0, 3, 0]) - sparse = orig.to_sparse(fill_value=0) - - assert sparse[0] == 1 - assert np.isnan(sparse[1]) - assert sparse[2] == 0 - assert sparse[3] == 3 - - result = sparse[[1, 3, 4]] - exp = orig[[1, 3, 4]].to_sparse(fill_value=0) - tm.assert_sp_series_equal(result, exp) - - # dense array - result = sparse[orig % 2 == 1] - exp = orig[orig % 2 == 1].to_sparse(fill_value=0) - tm.assert_sp_series_equal(result, exp) - - # sparse array (actuary it coerces to normal Series) - result = sparse[sparse % 2 == 1] - exp = orig[orig % 2 == 1].to_sparse(fill_value=0) - tm.assert_sp_series_equal(result, exp) - - # sparse array - result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)] - tm.assert_sp_series_equal(result, exp) - - def test_getitem_ellipsis(self): - # GH 9467 - s = pd.SparseSeries([1, np.nan, 2, 0, np.nan]) - tm.assert_sp_series_equal(s[...], s) - - s = pd.SparseSeries([1, np.nan, 2, 0, np.nan], fill_value=0) - tm.assert_sp_series_equal(s[...], s) - - def test_getitem_slice_fill_value(self): - orig = pd.Series([1, np.nan, 0, 3, 0]) - sparse = 
orig.to_sparse(fill_value=0) - tm.assert_sp_series_equal(sparse[:2], orig[:2].to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse[4:2], orig[4:2].to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse[::2], orig[::2].to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse[-5:], orig[-5:].to_sparse(fill_value=0)) - - def test_loc(self): - orig = self.orig - sparse = self.sparse - - assert sparse.loc[0] == 1 - assert np.isnan(sparse.loc[1]) - - result = sparse.loc[[1, 3, 4]] - exp = orig.loc[[1, 3, 4]].to_sparse() - tm.assert_sp_series_equal(result, exp) - - # exceeds the bounds - result = sparse.reindex([1, 3, 4, 5]) - exp = orig.reindex([1, 3, 4, 5]).to_sparse() - tm.assert_sp_series_equal(result, exp) - # padded with NaN - assert np.isnan(result[-1]) - - # dense array - result = sparse.loc[orig % 2 == 1] - exp = orig.loc[orig % 2 == 1].to_sparse() - tm.assert_sp_series_equal(result, exp) - - # sparse array (actuary it coerces to normal Series) - result = sparse.loc[sparse % 2 == 1] - exp = orig.loc[orig % 2 == 1].to_sparse() - tm.assert_sp_series_equal(result, exp) - - # sparse array - result = sparse.loc[pd.SparseArray(sparse % 2 == 1, dtype=bool)] - tm.assert_sp_series_equal(result, exp) - - def test_loc_index(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list("ABCDE")) - sparse = orig.to_sparse() - - assert sparse.loc["A"] == 1 - assert np.isnan(sparse.loc["B"]) - - result = sparse.loc[["A", "C", "D"]] - exp = orig.loc[["A", "C", "D"]].to_sparse() - tm.assert_sp_series_equal(result, exp) - - # dense array - result = sparse.loc[orig % 2 == 1] - exp = orig.loc[orig % 2 == 1].to_sparse() - tm.assert_sp_series_equal(result, exp) - - # sparse array (actuary it coerces to normal Series) - result = sparse.loc[sparse % 2 == 1] - exp = orig.loc[orig % 2 == 1].to_sparse() - tm.assert_sp_series_equal(result, exp) - - # sparse array - result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)] - tm.assert_sp_series_equal(result, exp) - - 
def test_loc_index_fill_value(self): - orig = pd.Series([1, np.nan, 0, 3, 0], index=list("ABCDE")) - sparse = orig.to_sparse(fill_value=0) - - assert sparse.loc["A"] == 1 - assert np.isnan(sparse.loc["B"]) - - result = sparse.loc[["A", "C", "D"]] - exp = orig.loc[["A", "C", "D"]].to_sparse(fill_value=0) - tm.assert_sp_series_equal(result, exp) - - # dense array - result = sparse.loc[orig % 2 == 1] - exp = orig.loc[orig % 2 == 1].to_sparse(fill_value=0) - tm.assert_sp_series_equal(result, exp) - - # sparse array (actuary it coerces to normal Series) - result = sparse.loc[sparse % 2 == 1] - exp = orig.loc[orig % 2 == 1].to_sparse(fill_value=0) - tm.assert_sp_series_equal(result, exp) - - def test_loc_slice(self): - orig = self.orig - sparse = self.sparse - tm.assert_sp_series_equal(sparse.loc[2:], orig.loc[2:].to_sparse()) - - def test_loc_slice_index_fill_value(self): - orig = pd.Series([1, np.nan, 0, 3, 0], index=list("ABCDE")) - sparse = orig.to_sparse(fill_value=0) - - tm.assert_sp_series_equal( - sparse.loc["C":], orig.loc["C":].to_sparse(fill_value=0) - ) - - def test_loc_slice_fill_value(self): - orig = pd.Series([1, np.nan, 0, 3, 0]) - sparse = orig.to_sparse(fill_value=0) - tm.assert_sp_series_equal(sparse.loc[2:], orig.loc[2:].to_sparse(fill_value=0)) - - def test_iloc(self): - orig = self.orig - sparse = self.sparse - - assert sparse.iloc[3] == 3 - assert np.isnan(sparse.iloc[2]) - - result = sparse.iloc[[1, 3, 4]] - exp = orig.iloc[[1, 3, 4]].to_sparse() - tm.assert_sp_series_equal(result, exp) - - result = sparse.iloc[[1, -2, -4]] - exp = orig.iloc[[1, -2, -4]].to_sparse() - tm.assert_sp_series_equal(result, exp) - - with pytest.raises(IndexError): - sparse.iloc[[1, 3, 5]] - - def test_iloc_fill_value(self): - orig = pd.Series([1, np.nan, 0, 3, 0]) - sparse = orig.to_sparse(fill_value=0) - - assert sparse.iloc[3] == 3 - assert np.isnan(sparse.iloc[1]) - assert sparse.iloc[4] == 0 - - result = sparse.iloc[[1, 3, 4]] - exp = orig.iloc[[1, 3, 
4]].to_sparse(fill_value=0) - tm.assert_sp_series_equal(result, exp) - - def test_iloc_slice(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() - tm.assert_sp_series_equal(sparse.iloc[2:], orig.iloc[2:].to_sparse()) - - def test_iloc_slice_fill_value(self): - orig = pd.Series([1, np.nan, 0, 3, 0]) - sparse = orig.to_sparse(fill_value=0) - tm.assert_sp_series_equal( - sparse.iloc[2:], orig.iloc[2:].to_sparse(fill_value=0) - ) - - def test_at(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() - assert sparse.at[0] == orig.at[0] - assert np.isnan(sparse.at[1]) - assert np.isnan(sparse.at[2]) - assert sparse.at[3] == orig.at[3] - assert np.isnan(sparse.at[4]) - - orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list("abcde")) - sparse = orig.to_sparse() - assert sparse.at["a"] == orig.at["a"] - assert np.isnan(sparse.at["b"]) - assert np.isnan(sparse.at["c"]) - assert sparse.at["d"] == orig.at["d"] - assert np.isnan(sparse.at["e"]) - - def test_at_fill_value(self): - orig = pd.Series([1, np.nan, 0, 3, 0], index=list("abcde")) - sparse = orig.to_sparse(fill_value=0) - assert sparse.at["a"] == orig.at["a"] - assert np.isnan(sparse.at["b"]) - assert sparse.at["c"] == orig.at["c"] - assert sparse.at["d"] == orig.at["d"] - assert sparse.at["e"] == orig.at["e"] - - def test_iat(self): - orig = self.orig - sparse = self.sparse - - assert sparse.iat[0] == orig.iat[0] - assert np.isnan(sparse.iat[1]) - assert np.isnan(sparse.iat[2]) - assert sparse.iat[3] == orig.iat[3] - assert np.isnan(sparse.iat[4]) - - assert np.isnan(sparse.iat[-1]) - assert sparse.iat[-5] == orig.iat[-5] - - def test_iat_fill_value(self): - orig = pd.Series([1, np.nan, 0, 3, 0]) - sparse = orig.to_sparse() - assert sparse.iat[0] == orig.iat[0] - assert np.isnan(sparse.iat[1]) - assert sparse.iat[2] == orig.iat[2] - assert sparse.iat[3] == orig.iat[3] - assert sparse.iat[4] == orig.iat[4] - - assert sparse.iat[-1] == orig.iat[-1] - 
assert sparse.iat[-5] == orig.iat[-5] - - def test_get(self): - s = pd.SparseSeries([1, np.nan, np.nan, 3, np.nan]) - assert s.get(0) == 1 - assert np.isnan(s.get(1)) - assert s.get(5) is None - - s = pd.SparseSeries([1, np.nan, 0, 3, 0], index=list("ABCDE")) - assert s.get("A") == 1 - assert np.isnan(s.get("B")) - assert s.get("C") == 0 - assert s.get("XX") is None - - s = pd.SparseSeries([1, np.nan, 0, 3, 0], index=list("ABCDE"), fill_value=0) - assert s.get("A") == 1 - assert np.isnan(s.get("B")) - assert s.get("C") == 0 - assert s.get("XX") is None - - def test_take(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list("ABCDE")) - sparse = orig.to_sparse() - - tm.assert_sp_series_equal(sparse.take([0]), orig.take([0]).to_sparse()) - tm.assert_sp_series_equal( - sparse.take([0, 1, 3]), orig.take([0, 1, 3]).to_sparse() - ) - tm.assert_sp_series_equal( - sparse.take([-1, -2]), orig.take([-1, -2]).to_sparse() - ) - - def test_take_fill_value(self): - orig = pd.Series([1, np.nan, 0, 3, 0], index=list("ABCDE")) - sparse = orig.to_sparse(fill_value=0) - - tm.assert_sp_series_equal( - sparse.take([0]), orig.take([0]).to_sparse(fill_value=0) - ) - - exp = orig.take([0, 1, 3]).to_sparse(fill_value=0) - tm.assert_sp_series_equal(sparse.take([0, 1, 3]), exp) - - exp = orig.take([-1, -2]).to_sparse(fill_value=0) - tm.assert_sp_series_equal(sparse.take([-1, -2]), exp) - - def test_reindex(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list("ABCDE")) - sparse = orig.to_sparse() - - res = sparse.reindex(["A", "E", "C", "D"]) - exp = orig.reindex(["A", "E", "C", "D"]).to_sparse() - tm.assert_sp_series_equal(res, exp) - - # all missing & fill_value - res = sparse.reindex(["B", "E", "C"]) - exp = orig.reindex(["B", "E", "C"]).to_sparse() - tm.assert_sp_series_equal(res, exp) - - orig = pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan], index=list("ABCDE")) - sparse = orig.to_sparse() - - res = sparse.reindex(["A", "E", "C", "D"]) - exp = 
orig.reindex(["A", "E", "C", "D"]).to_sparse() - tm.assert_sp_series_equal(res, exp) - - def test_fill_value_reindex(self): - orig = pd.Series([1, np.nan, 0, 3, 0], index=list("ABCDE")) - sparse = orig.to_sparse(fill_value=0) - - res = sparse.reindex(["A", "E", "C", "D"]) - exp = orig.reindex(["A", "E", "C", "D"]).to_sparse(fill_value=0) - tm.assert_sp_series_equal(res, exp) - - # includes missing and fill_value - res = sparse.reindex(["A", "B", "C"]) - exp = orig.reindex(["A", "B", "C"]).to_sparse(fill_value=0) - tm.assert_sp_series_equal(res, exp) - - # all missing - orig = pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan], index=list("ABCDE")) - sparse = orig.to_sparse(fill_value=0) - - res = sparse.reindex(["A", "E", "C", "D"]) - exp = orig.reindex(["A", "E", "C", "D"]).to_sparse(fill_value=0) - tm.assert_sp_series_equal(res, exp) - - # all fill_value - orig = pd.Series([0.0, 0.0, 0.0, 0.0, 0.0], index=list("ABCDE")) - sparse = orig.to_sparse(fill_value=0) - - def test_fill_value_reindex_coerces_float_int(self): - orig = pd.Series([1, np.nan, 0, 3, 0], index=list("ABCDE")) - sparse = orig.to_sparse(fill_value=0) - - res = sparse.reindex(["A", "E", "C", "D"]) - exp = orig.reindex(["A", "E", "C", "D"]).to_sparse(fill_value=0) - tm.assert_sp_series_equal(res, exp) - - def test_reindex_fill_value(self): - floats = pd.Series([1.0, 2.0, 3.0]).to_sparse() - result = floats.reindex([1, 2, 3], fill_value=0) - expected = pd.Series([2.0, 3.0, 0], index=[1, 2, 3]).to_sparse() - tm.assert_sp_series_equal(result, expected) - - def test_reindex_nearest(self): - s = pd.Series(np.arange(10, dtype="float64")).to_sparse() - target = [0.1, 0.9, 1.5, 2.0] - actual = s.reindex(target, method="nearest") - expected = pd.Series(np.around(target), target).to_sparse() - tm.assert_sp_series_equal(expected, actual) - - actual = s.reindex(target, method="nearest", tolerance=0.2) - expected = pd.Series([0, 1, np.nan, 2], target).to_sparse() - tm.assert_sp_series_equal(expected, actual) - - 
actual = s.reindex(target, method="nearest", tolerance=[0.3, 0.01, 0.4, 3]) - expected = pd.Series([0, np.nan, np.nan, 2], target).to_sparse() - tm.assert_sp_series_equal(expected, actual) - - @pytest.mark.parametrize("kind", ["integer", "block"]) - @pytest.mark.parametrize("fill", [True, False, np.nan]) - def tests_indexing_with_sparse(self, kind, fill): - # see gh-13985 - arr = pd.SparseArray([1, 2, 3], kind=kind) - indexer = pd.SparseArray([True, False, True], fill_value=fill, dtype=bool) - - expected = arr[indexer] - result = pd.SparseArray([1, 3], kind=kind) - tm.assert_sp_array_equal(result, expected) - - s = pd.SparseSeries(arr, index=["a", "b", "c"], dtype=np.float64) - expected = pd.SparseSeries( - [1, 3], - index=["a", "c"], - kind=kind, - dtype=SparseDtype(np.float64, s.fill_value), - ) - - tm.assert_sp_series_equal(s[indexer], expected) - tm.assert_sp_series_equal(s.loc[indexer], expected) - tm.assert_sp_series_equal(s.iloc[indexer], expected) - - indexer = pd.SparseSeries(indexer, index=["a", "b", "c"]) - tm.assert_sp_series_equal(s[indexer], expected) - tm.assert_sp_series_equal(s.loc[indexer], expected) - - msg = "iLocation based boolean indexing cannot use an indexable as a mask" - with pytest.raises(ValueError, match=msg): - s.iloc[indexer] - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -class TestSparseSeriesMultiIndexing(TestSparseSeriesIndexing): - def setup_method(self, method): - # Mi with duplicated values - idx = pd.MultiIndex.from_tuples( - [("A", 0), ("A", 1), ("B", 0), ("C", 0), ("C", 1)] - ) - self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=idx) - self.sparse = self.orig.to_sparse() - - def test_getitem_multi(self): - orig = self.orig - sparse = self.sparse - - assert sparse[0] == orig[0] - assert np.isnan(sparse[1]) - assert sparse[3] == orig[3] - - tm.assert_sp_series_equal(sparse["A"], orig["A"].to_sparse()) - tm.assert_sp_series_equal(sparse["B"], orig["B"].to_sparse()) - - result = sparse[[1, 3, 4]] - 
exp = orig[[1, 3, 4]].to_sparse() - tm.assert_sp_series_equal(result, exp) - - # dense array - result = sparse[orig % 2 == 1] - exp = orig[orig % 2 == 1].to_sparse() - tm.assert_sp_series_equal(result, exp) - - # sparse array (actuary it coerces to normal Series) - result = sparse[sparse % 2 == 1] - exp = orig[orig % 2 == 1].to_sparse() - tm.assert_sp_series_equal(result, exp) - - # sparse array - result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)] - tm.assert_sp_series_equal(result, exp) - - def test_getitem_multi_tuple(self): - orig = self.orig - sparse = self.sparse - - assert sparse["C", 0] == orig["C", 0] - assert np.isnan(sparse["A", 1]) - assert np.isnan(sparse["B", 0]) - - def test_getitems_slice_multi(self): - orig = self.orig - sparse = self.sparse - - tm.assert_sp_series_equal(sparse[2:], orig[2:].to_sparse()) - tm.assert_sp_series_equal(sparse.loc["B":], orig.loc["B":].to_sparse()) - tm.assert_sp_series_equal(sparse.loc["C":], orig.loc["C":].to_sparse()) - - tm.assert_sp_series_equal(sparse.loc["A":"B"], orig.loc["A":"B"].to_sparse()) - tm.assert_sp_series_equal(sparse.loc[:"B"], orig.loc[:"B"].to_sparse()) - - def test_loc(self): - # need to be override to use different label - orig = self.orig - sparse = self.sparse - - tm.assert_sp_series_equal(sparse.loc["A"], orig.loc["A"].to_sparse()) - tm.assert_sp_series_equal(sparse.loc["B"], orig.loc["B"].to_sparse()) - - result = sparse.loc[[1, 3, 4]] - exp = orig.loc[[1, 3, 4]].to_sparse() - tm.assert_sp_series_equal(result, exp) - - # exceeds the bounds - result = sparse.loc[[1, 3, 4, 5]] - exp = orig.loc[[1, 3, 4, 5]].to_sparse() - tm.assert_sp_series_equal(result, exp) - - # single element list (GH 15447) - result = sparse.loc[["A"]] - exp = orig.loc[["A"]].to_sparse() - tm.assert_sp_series_equal(result, exp) - - # dense array - result = sparse.loc[orig % 2 == 1] - exp = orig.loc[orig % 2 == 1].to_sparse() - tm.assert_sp_series_equal(result, exp) - - # sparse array (actuary it coerces to normal 
Series) - result = sparse.loc[sparse % 2 == 1] - exp = orig.loc[orig % 2 == 1].to_sparse() - tm.assert_sp_series_equal(result, exp) - - # sparse array - result = sparse.loc[pd.SparseArray(sparse % 2 == 1, dtype=bool)] - tm.assert_sp_series_equal(result, exp) - - def test_loc_multi_tuple(self): - orig = self.orig - sparse = self.sparse - - assert sparse.loc["C", 0] == orig.loc["C", 0] - assert np.isnan(sparse.loc["A", 1]) - assert np.isnan(sparse.loc["B", 0]) - - def test_loc_slice(self): - orig = self.orig - sparse = self.sparse - tm.assert_sp_series_equal(sparse.loc["A":], orig.loc["A":].to_sparse()) - tm.assert_sp_series_equal(sparse.loc["B":], orig.loc["B":].to_sparse()) - tm.assert_sp_series_equal(sparse.loc["C":], orig.loc["C":].to_sparse()) - - tm.assert_sp_series_equal(sparse.loc["A":"B"], orig.loc["A":"B"].to_sparse()) - tm.assert_sp_series_equal(sparse.loc[:"B"], orig.loc[:"B"].to_sparse()) - - def test_reindex(self): - # GH 15447 - orig = self.orig - sparse = self.sparse - - res = sparse.reindex([("A", 0), ("C", 1)]) - exp = orig.reindex([("A", 0), ("C", 1)]).to_sparse() - tm.assert_sp_series_equal(res, exp) - - # On specific level: - res = sparse.reindex(["A", "C", "B"], level=0) - exp = orig.reindex(["A", "C", "B"], level=0).to_sparse() - tm.assert_sp_series_equal(res, exp) - - # single element list (GH 15447) - res = sparse.reindex(["A"], level=0) - exp = orig.reindex(["A"], level=0).to_sparse() - tm.assert_sp_series_equal(res, exp) - - with pytest.raises(TypeError): - # Incomplete keys are not accepted for reindexing: - sparse.reindex(["A", "C"]) - - # "copy" argument: - res = sparse.reindex(sparse.index, copy=True) - exp = orig.reindex(orig.index, copy=True).to_sparse() - tm.assert_sp_series_equal(res, exp) - assert sparse is not res - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") -class 
TestSparseDataFrameIndexing: - def test_getitem(self): - orig = pd.DataFrame( - [[1, np.nan, np.nan], [2, 3, np.nan], [np.nan, np.nan, 4], [0, np.nan, 5]], - columns=list("xyz"), - ) - sparse = orig.to_sparse() - - tm.assert_sp_series_equal(sparse["x"], orig["x"].to_sparse()) - tm.assert_sp_frame_equal(sparse[["x"]], orig[["x"]].to_sparse()) - tm.assert_sp_frame_equal(sparse[["z", "x"]], orig[["z", "x"]].to_sparse()) - - tm.assert_sp_frame_equal( - sparse[[True, False, True, True]], - orig[[True, False, True, True]].to_sparse(), - ) - - tm.assert_sp_frame_equal(sparse.iloc[[1, 2]], orig.iloc[[1, 2]].to_sparse()) - - def test_getitem_fill_value(self): - orig = pd.DataFrame( - [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], - columns=list("xyz"), - ) - sparse = orig.to_sparse(fill_value=0) - - result = sparse[["z"]] - expected = orig[["z"]].to_sparse(fill_value=0) - tm.assert_sp_frame_equal(result, expected, check_fill_value=False) - - tm.assert_sp_series_equal(sparse["y"], orig["y"].to_sparse(fill_value=0)) - - exp = orig[["x"]].to_sparse(fill_value=0) - exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(sparse[["x"]], exp) - - exp = orig[["z", "x"]].to_sparse(fill_value=0) - exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(sparse[["z", "x"]], exp) - - indexer = [True, False, True, True] - exp = orig[indexer].to_sparse(fill_value=0) - exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(sparse[indexer], exp) - - exp = orig.iloc[[1, 2]].to_sparse(fill_value=0) - exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(sparse.iloc[[1, 2]], exp) - - def test_loc(self): - orig = pd.DataFrame( - [[1, np.nan, np.nan], [2, 3, np.nan], [np.nan, np.nan, 4]], - columns=list("xyz"), - ) - sparse = orig.to_sparse() - - assert sparse.loc[0, "x"] == 1 - assert np.isnan(sparse.loc[1, "z"]) - assert sparse.loc[2, "z"] == 4 - - # have to specify `kind='integer'`, since we construct a - # new SparseArray here, and the default sparse 
type is - # integer there, but block in SparseSeries - tm.assert_sp_series_equal(sparse.loc[0], orig.loc[0].to_sparse(kind="integer")) - tm.assert_sp_series_equal(sparse.loc[1], orig.loc[1].to_sparse(kind="integer")) - tm.assert_sp_series_equal( - sparse.loc[2, :], orig.loc[2, :].to_sparse(kind="integer") - ) - tm.assert_sp_series_equal( - sparse.loc[2, :], orig.loc[2, :].to_sparse(kind="integer") - ) - tm.assert_sp_series_equal(sparse.loc[:, "y"], orig.loc[:, "y"].to_sparse()) - tm.assert_sp_series_equal(sparse.loc[:, "y"], orig.loc[:, "y"].to_sparse()) - - result = sparse.loc[[1, 2]] - exp = orig.loc[[1, 2]].to_sparse() - tm.assert_sp_frame_equal(result, exp) - - result = sparse.loc[[1, 2], :] - exp = orig.loc[[1, 2], :].to_sparse() - tm.assert_sp_frame_equal(result, exp) - - result = sparse.loc[:, ["x", "z"]] - exp = orig.loc[:, ["x", "z"]].to_sparse() - tm.assert_sp_frame_equal(result, exp) - - result = sparse.loc[[0, 2], ["x", "z"]] - exp = orig.loc[[0, 2], ["x", "z"]].to_sparse() - tm.assert_sp_frame_equal(result, exp) - - # exceeds the bounds - result = sparse.reindex([1, 3, 4, 5]) - exp = orig.reindex([1, 3, 4, 5]).to_sparse() - tm.assert_sp_frame_equal(result, exp) - - # dense array - result = sparse.loc[orig.x % 2 == 1] - exp = orig.loc[orig.x % 2 == 1].to_sparse() - tm.assert_sp_frame_equal(result, exp) - - # sparse array (actuary it coerces to normal Series) - result = sparse.loc[sparse.x % 2 == 1] - exp = orig.loc[orig.x % 2 == 1].to_sparse() - tm.assert_sp_frame_equal(result, exp) - - # sparse array - result = sparse.loc[pd.SparseArray(sparse.x % 2 == 1, dtype=bool)] - tm.assert_sp_frame_equal(result, exp) - - def test_loc_index(self): - orig = pd.DataFrame( - [[1, np.nan, np.nan], [2, 3, np.nan], [np.nan, np.nan, 4]], - index=list("abc"), - columns=list("xyz"), - ) - sparse = orig.to_sparse() - - assert sparse.loc["a", "x"] == 1 - assert np.isnan(sparse.loc["b", "z"]) - assert sparse.loc["c", "z"] == 4 - - tm.assert_sp_series_equal( - 
sparse.loc["a"], orig.loc["a"].to_sparse(kind="integer") - ) - tm.assert_sp_series_equal( - sparse.loc["b"], orig.loc["b"].to_sparse(kind="integer") - ) - tm.assert_sp_series_equal( - sparse.loc["b", :], orig.loc["b", :].to_sparse(kind="integer") - ) - tm.assert_sp_series_equal( - sparse.loc["b", :], orig.loc["b", :].to_sparse(kind="integer") - ) - - tm.assert_sp_series_equal(sparse.loc[:, "z"], orig.loc[:, "z"].to_sparse()) - tm.assert_sp_series_equal(sparse.loc[:, "z"], orig.loc[:, "z"].to_sparse()) - - result = sparse.loc[["a", "b"]] - exp = orig.loc[["a", "b"]].to_sparse() - tm.assert_sp_frame_equal(result, exp) - - result = sparse.loc[["a", "b"], :] - exp = orig.loc[["a", "b"], :].to_sparse() - tm.assert_sp_frame_equal(result, exp) - - result = sparse.loc[:, ["x", "z"]] - exp = orig.loc[:, ["x", "z"]].to_sparse() - tm.assert_sp_frame_equal(result, exp) - - result = sparse.loc[["c", "a"], ["x", "z"]] - exp = orig.loc[["c", "a"], ["x", "z"]].to_sparse() - tm.assert_sp_frame_equal(result, exp) - - # dense array - result = sparse.loc[orig.x % 2 == 1] - exp = orig.loc[orig.x % 2 == 1].to_sparse() - tm.assert_sp_frame_equal(result, exp) - - # sparse array (actuary it coerces to normal Series) - result = sparse.loc[sparse.x % 2 == 1] - exp = orig.loc[orig.x % 2 == 1].to_sparse() - tm.assert_sp_frame_equal(result, exp) - - # sparse array - result = sparse.loc[pd.SparseArray(sparse.x % 2 == 1, dtype=bool)] - tm.assert_sp_frame_equal(result, exp) - - def test_loc_slice(self): - orig = pd.DataFrame( - [[1, np.nan, np.nan], [2, 3, np.nan], [np.nan, np.nan, 4]], - columns=list("xyz"), - ) - sparse = orig.to_sparse() - tm.assert_sp_frame_equal(sparse.loc[2:], orig.loc[2:].to_sparse()) - - def test_iloc(self): - orig = pd.DataFrame([[1, np.nan, np.nan], [2, 3, np.nan], [np.nan, np.nan, 4]]) - sparse = orig.to_sparse() - - assert sparse.iloc[1, 1] == 3 - assert np.isnan(sparse.iloc[2, 0]) - - tm.assert_sp_series_equal(sparse.iloc[0], orig.loc[0].to_sparse(kind="integer")) - 
tm.assert_sp_series_equal(sparse.iloc[1], orig.loc[1].to_sparse(kind="integer")) - tm.assert_sp_series_equal( - sparse.iloc[2, :], orig.iloc[2, :].to_sparse(kind="integer") - ) - tm.assert_sp_series_equal( - sparse.iloc[2, :], orig.iloc[2, :].to_sparse(kind="integer") - ) - tm.assert_sp_series_equal(sparse.iloc[:, 1], orig.iloc[:, 1].to_sparse()) - tm.assert_sp_series_equal(sparse.iloc[:, 1], orig.iloc[:, 1].to_sparse()) - - result = sparse.iloc[[1, 2]] - exp = orig.iloc[[1, 2]].to_sparse() - tm.assert_sp_frame_equal(result, exp) - - result = sparse.iloc[[1, 2], :] - exp = orig.iloc[[1, 2], :].to_sparse() - tm.assert_sp_frame_equal(result, exp) - - result = sparse.iloc[:, [1, 0]] - exp = orig.iloc[:, [1, 0]].to_sparse() - tm.assert_sp_frame_equal(result, exp) - - result = sparse.iloc[[2], [1, 0]] - exp = orig.iloc[[2], [1, 0]].to_sparse() - tm.assert_sp_frame_equal(result, exp) - - with pytest.raises(IndexError): - sparse.iloc[[1, 3, 5]] - - def test_iloc_slice(self): - orig = pd.DataFrame( - [[1, np.nan, np.nan], [2, 3, np.nan], [np.nan, np.nan, 4]], - columns=list("xyz"), - ) - sparse = orig.to_sparse() - tm.assert_sp_frame_equal(sparse.iloc[2:], orig.iloc[2:].to_sparse()) - - def test_at(self): - orig = pd.DataFrame( - [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], - index=list("ABCD"), - columns=list("xyz"), - ) - sparse = orig.to_sparse() - assert sparse.at["A", "x"] == orig.at["A", "x"] - assert np.isnan(sparse.at["B", "z"]) - assert np.isnan(sparse.at["C", "y"]) - assert sparse.at["D", "x"] == orig.at["D", "x"] - - def test_at_fill_value(self): - orig = pd.DataFrame( - [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], - index=list("ABCD"), - columns=list("xyz"), - ) - sparse = orig.to_sparse(fill_value=0) - assert sparse.at["A", "x"] == orig.at["A", "x"] - assert np.isnan(sparse.at["B", "z"]) - assert np.isnan(sparse.at["C", "y"]) - assert sparse.at["D", "x"] == orig.at["D", "x"] - - def test_iat(self): - orig = 
pd.DataFrame( - [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], - index=list("ABCD"), - columns=list("xyz"), - ) - sparse = orig.to_sparse() - assert sparse.iat[0, 0] == orig.iat[0, 0] - assert np.isnan(sparse.iat[1, 2]) - assert np.isnan(sparse.iat[2, 1]) - assert sparse.iat[2, 0] == orig.iat[2, 0] - - assert np.isnan(sparse.iat[-1, -2]) - assert sparse.iat[-1, -1] == orig.iat[-1, -1] - - def test_iat_fill_value(self): - orig = pd.DataFrame( - [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], - index=list("ABCD"), - columns=list("xyz"), - ) - sparse = orig.to_sparse(fill_value=0) - assert sparse.iat[0, 0] == orig.iat[0, 0] - assert np.isnan(sparse.iat[1, 2]) - assert np.isnan(sparse.iat[2, 1]) - assert sparse.iat[2, 0] == orig.iat[2, 0] - - assert np.isnan(sparse.iat[-1, -2]) - assert sparse.iat[-1, -1] == orig.iat[-1, -1] - - def test_take(self): - orig = pd.DataFrame( - [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], - columns=list("xyz"), - ) - sparse = orig.to_sparse() - - tm.assert_sp_frame_equal(sparse.take([0]), orig.take([0]).to_sparse()) - tm.assert_sp_frame_equal(sparse.take([0, 1]), orig.take([0, 1]).to_sparse()) - tm.assert_sp_frame_equal(sparse.take([-1, -2]), orig.take([-1, -2]).to_sparse()) - - def test_take_fill_value(self): - orig = pd.DataFrame( - [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], - columns=list("xyz"), - ) - sparse = orig.to_sparse(fill_value=0) - - exp = orig.take([0]).to_sparse(fill_value=0) - exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(sparse.take([0]), exp) - - exp = orig.take([0, 1]).to_sparse(fill_value=0) - exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(sparse.take([0, 1]), exp) - - exp = orig.take([-1, -2]).to_sparse(fill_value=0) - exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(sparse.take([-1, -2]), exp) - - def test_reindex(self): - orig = pd.DataFrame( - [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, 
np.nan, 5]], - index=list("ABCD"), - columns=list("xyz"), - ) - sparse = orig.to_sparse() - - res = sparse.reindex(["A", "C", "B"]) - exp = orig.reindex(["A", "C", "B"]).to_sparse() - tm.assert_sp_frame_equal(res, exp) - - orig = pd.DataFrame( - [ - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], - ], - index=list("ABCD"), - columns=list("xyz"), - ) - sparse = orig.to_sparse() - - res = sparse.reindex(["A", "C", "B"]) - exp = orig.reindex(["A", "C", "B"]).to_sparse() - tm.assert_sp_frame_equal(res, exp) - - def test_reindex_fill_value(self): - orig = pd.DataFrame( - [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], - index=list("ABCD"), - columns=list("xyz"), - ) - sparse = orig.to_sparse(fill_value=0) - - res = sparse.reindex(["A", "C", "B"]) - exp = orig.reindex(["A", "C", "B"]).to_sparse(fill_value=0) - tm.assert_sp_frame_equal(res, exp) - - # all missing - orig = pd.DataFrame( - [ - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], - ], - index=list("ABCD"), - columns=list("xyz"), - ) - sparse = orig.to_sparse(fill_value=0) - - res = sparse.reindex(["A", "C", "B"]) - exp = orig.reindex(["A", "C", "B"]).to_sparse(fill_value=0) - tm.assert_sp_frame_equal(res, exp) - - # all fill_value - orig = pd.DataFrame( - [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], - index=list("ABCD"), - columns=list("xyz"), - dtype=np.int, - ) - sparse = orig.to_sparse(fill_value=0) - - res = sparse.reindex(["A", "C", "B"]) - exp = orig.reindex(["A", "C", "B"]).to_sparse(fill_value=0) - tm.assert_sp_frame_equal(res, exp) - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -class TestMultitype: - def setup_method(self, method): - self.cols = ["string", "int", "float", "object"] - - self.string_series = pd.SparseSeries(["a", "b", "c"]) - self.int_series = pd.SparseSeries([1, 2, 3]) - self.float_series = pd.SparseSeries([1.1, 1.2, 1.3]) - 
self.object_series = pd.SparseSeries([[], {}, set()]) - self.sdf = pd.SparseDataFrame( - { - "string": self.string_series, - "int": self.int_series, - "float": self.float_series, - "object": self.object_series, - } - ) - self.sdf = self.sdf[self.cols] - self.ss = pd.SparseSeries(["a", 1, 1.1, []], index=self.cols) - - def test_frame_basic_dtypes(self): - for _, row in self.sdf.iterrows(): - assert row.dtype == SparseDtype(object) - tm.assert_sp_series_equal( - self.sdf["string"], self.string_series, check_names=False - ) - tm.assert_sp_series_equal(self.sdf["int"], self.int_series, check_names=False) - tm.assert_sp_series_equal( - self.sdf["float"], self.float_series, check_names=False - ) - tm.assert_sp_series_equal( - self.sdf["object"], self.object_series, check_names=False - ) - - def test_frame_indexing_single(self): - tm.assert_sp_series_equal( - self.sdf.iloc[0], - pd.SparseSeries(["a", 1, 1.1, []], index=self.cols), - check_names=False, - ) - tm.assert_sp_series_equal( - self.sdf.iloc[1], - pd.SparseSeries(["b", 2, 1.2, {}], index=self.cols), - check_names=False, - ) - tm.assert_sp_series_equal( - self.sdf.iloc[2], - pd.SparseSeries(["c", 3, 1.3, set()], index=self.cols), - check_names=False, - ) - - def test_frame_indexing_multiple(self): - tm.assert_sp_frame_equal(self.sdf, self.sdf[:]) - tm.assert_sp_frame_equal(self.sdf, self.sdf.loc[:]) - tm.assert_sp_frame_equal( - self.sdf.iloc[[1, 2]], - pd.SparseDataFrame( - { - "string": self.string_series.iloc[[1, 2]], - "int": self.int_series.iloc[[1, 2]], - "float": self.float_series.iloc[[1, 2]], - "object": self.object_series.iloc[[1, 2]], - }, - index=[1, 2], - )[self.cols], - ) - tm.assert_sp_frame_equal( - self.sdf[["int", "string"]], - pd.SparseDataFrame({"int": self.int_series, "string": self.string_series}), - ) - - def test_series_indexing_single(self): - for i, idx in enumerate(self.cols): - assert self.ss.iloc[i] == self.ss[idx] - tm.assert_class_equal(self.ss.iloc[i], self.ss[idx], obj="series 
index") - - assert self.ss["string"] == "a" - assert self.ss["int"] == 1 - assert self.ss["float"] == 1.1 - assert self.ss["object"] == [] - - def test_series_indexing_multiple(self): - tm.assert_sp_series_equal( - self.ss.loc[["string", "int"]], - pd.SparseSeries(["a", 1], index=["string", "int"]), - ) - tm.assert_sp_series_equal( - self.ss.loc[["string", "object"]], - pd.SparseSeries(["a", []], index=["string", "object"]), - ) diff --git a/pandas/tests/sparse/test_pivot.py b/pandas/tests/sparse/test_pivot.py deleted file mode 100644 index 880c1c55f9f79..0000000000000 --- a/pandas/tests/sparse/test_pivot.py +++ /dev/null @@ -1,59 +0,0 @@ -import numpy as np -import pytest - -import pandas as pd -import pandas.util.testing as tm - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") -@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") -class TestPivotTable: - def setup_method(self, method): - rs = np.random.RandomState(0) - self.dense = pd.DataFrame( - { - "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], - "B": ["one", "one", "two", "three", "two", "two", "one", "three"], - "C": rs.randn(8), - "D": rs.randn(8), - "E": [np.nan, np.nan, 1, 2, np.nan, 1, np.nan, np.nan], - } - ) - self.sparse = self.dense.to_sparse() - - def test_pivot_table(self): - res_sparse = pd.pivot_table(self.sparse, index="A", columns="B", values="C") - res_dense = pd.pivot_table(self.dense, index="A", columns="B", values="C") - tm.assert_frame_equal(res_sparse, res_dense) - - res_sparse = pd.pivot_table(self.sparse, index="A", columns="B", values="E") - res_dense = pd.pivot_table(self.dense, index="A", columns="B", values="E") - tm.assert_frame_equal(res_sparse, res_dense) - - res_sparse = pd.pivot_table( - self.sparse, index="A", columns="B", values="E", aggfunc="mean" - ) - res_dense = pd.pivot_table( - self.dense, index="A", columns="B", values="E", aggfunc="mean" - ) - 
tm.assert_frame_equal(res_sparse, res_dense) - - def test_pivot_table_with_nans(self): - res_sparse = pd.pivot_table( - self.sparse, index="A", columns="B", values="E", aggfunc="sum" - ) - res_dense = pd.pivot_table( - self.dense, index="A", columns="B", values="E", aggfunc="sum" - ) - tm.assert_frame_equal(res_sparse, res_dense) - - def test_pivot_table_multi(self): - res_sparse = pd.pivot_table( - self.sparse, index="A", columns="B", values=["D", "E"] - ) - res_dense = pd.pivot_table( - self.dense, index="A", columns="B", values=["D", "E"] - ) - res_dense = res_dense.apply(lambda x: x.astype("Sparse[float64]")) - tm.assert_frame_equal(res_sparse, res_dense) diff --git a/pandas/tests/sparse/test_reshape.py b/pandas/tests/sparse/test_reshape.py deleted file mode 100644 index bb5232f065a04..0000000000000 --- a/pandas/tests/sparse/test_reshape.py +++ /dev/null @@ -1,43 +0,0 @@ -import numpy as np -import pytest - -import pandas as pd -import pandas.util.testing as tm - - -@pytest.fixture -def sparse_df(): - return pd.SparseDataFrame({0: {0: 1}, 1: {1: 1}, 2: {2: 1}}) # eye - - -@pytest.fixture -def multi_index3(): - return pd.MultiIndex.from_tuples([(0, 0), (1, 1), (2, 2)]) - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -def test_sparse_frame_stack(sparse_df, multi_index3): - ss = sparse_df.stack() - expected = pd.SparseSeries(np.ones(3), index=multi_index3) - tm.assert_sp_series_equal(ss, expected) - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -def test_sparse_frame_unstack(sparse_df): - mi = pd.MultiIndex.from_tuples([(0, 0), (1, 0), (1, 2)]) - sparse_df.index = mi - arr = np.array([[1, np.nan, np.nan], [np.nan, 1, np.nan], [np.nan, np.nan, 1]]) - unstacked_df = pd.DataFrame(arr, index=mi).unstack() - unstacked_sdf = sparse_df.unstack() - - tm.assert_numpy_array_equal(unstacked_df.values, unstacked_sdf.values) - - -@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") -def test_sparse_series_unstack(sparse_df, multi_index3): 
- frame = pd.SparseSeries(np.ones(3), index=multi_index3).unstack() - - arr = np.array([1, np.nan, np.nan]) - arrays = {i: pd.SparseArray(np.roll(arr, i)) for i in range(3)} - expected = pd.DataFrame(arrays) - tm.assert_frame_equal(frame, expected) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index ab9eb0ac76e78..af726caa52e88 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1332,8 +1332,6 @@ def assert_frame_equal( _check_isinstance(left, right, DataFrame) if check_frame_type: - # ToDo: There are some tests using rhs is SparseDataFrame - # lhs is DataFrame. Should use assert_class_equal in future assert isinstance(left, type(right)) # assert_class_equal(left, right, obj=obj) @@ -1557,142 +1555,6 @@ def assert_sp_array_equal( assert_numpy_array_equal(left.to_dense(), right.to_dense(), check_dtype=check_dtype) -def assert_sp_series_equal( - left, - right, - check_dtype=True, - exact_indices=True, - check_series_type=True, - check_names=True, - check_kind=True, - check_fill_value=True, - consolidate_block_indices=False, - obj="SparseSeries", -): - """Check that the left and right SparseSeries are equal. - - Parameters - ---------- - left : SparseSeries - right : SparseSeries - check_dtype : bool, default True - Whether to check the Series dtype is identical. - exact_indices : bool, default True - check_series_type : bool, default True - Whether to check the SparseSeries class is identical. - check_names : bool, default True - Whether to check the SparseSeries name attribute. - check_kind : bool, default True - Whether to just the kind of the sparse index for each column. - check_fill_value : bool, default True - Whether to check that left.fill_value matches right.fill_value - consolidate_block_indices : bool, default False - Whether to consolidate contiguous blocks for sparse arrays with - a BlockIndex. Some operations, e.g. concat, will end up with - block indices that could be consolidated. 
Setting this to true will - create a new BlockIndex for that array, with consolidated - block indices. - obj : str, default 'SparseSeries' - Specify the object name being compared, internally used to show - the appropriate assertion message. - """ - _check_isinstance(left, right, pd.SparseSeries) - - if check_series_type: - assert_class_equal(left, right, obj=obj) - - assert_index_equal(left.index, right.index, obj="{obj}.index".format(obj=obj)) - - assert_sp_array_equal( - left.values, - right.values, - check_kind=check_kind, - check_fill_value=check_fill_value, - consolidate_block_indices=consolidate_block_indices, - ) - - if check_names: - assert_attr_equal("name", left, right) - if check_dtype: - assert_attr_equal("dtype", left, right) - - assert_numpy_array_equal(np.asarray(left.values), np.asarray(right.values)) - - -def assert_sp_frame_equal( - left, - right, - check_dtype=True, - exact_indices=True, - check_frame_type=True, - check_kind=True, - check_fill_value=True, - consolidate_block_indices=False, - obj="SparseDataFrame", -): - """Check that the left and right SparseDataFrame are equal. - - Parameters - ---------- - left : SparseDataFrame - right : SparseDataFrame - check_dtype : bool, default True - Whether to check the Series dtype is identical. - exact_indices : bool, default True - SparseSeries SparseIndex objects must be exactly the same, - otherwise just compare dense representations. - check_frame_type : bool, default True - Whether to check the SparseDataFrame class is identical. - check_kind : bool, default True - Whether to just the kind of the sparse index for each column. - check_fill_value : bool, default True - Whether to check that left.fill_value matches right.fill_value - consolidate_block_indices : bool, default False - Whether to consolidate contiguous blocks for sparse arrays with - a BlockIndex. Some operations, e.g. concat, will end up with - block indices that could be consolidated. 
Setting this to true will - create a new BlockIndex for that array, with consolidated - block indices. - obj : str, default 'SparseDataFrame' - Specify the object name being compared, internally used to show - the appropriate assertion message. - """ - _check_isinstance(left, right, pd.SparseDataFrame) - - if check_frame_type: - assert_class_equal(left, right, obj=obj) - - assert_index_equal(left.index, right.index, obj="{obj}.index".format(obj=obj)) - assert_index_equal(left.columns, right.columns, obj="{obj}.columns".format(obj=obj)) - - if check_fill_value: - assert_attr_equal("default_fill_value", left, right, obj=obj) - - for col, series in left.items(): - assert col in right - # trade-off? - - if exact_indices: - assert_sp_series_equal( - series, - right[col], - check_dtype=check_dtype, - check_kind=check_kind, - check_fill_value=check_fill_value, - consolidate_block_indices=consolidate_block_indices, - ) - else: - assert_series_equal( - series.to_dense(), right[col].to_dense(), check_dtype=check_dtype - ) - - # do I care? - # assert(left.default_kind == right.default_kind) - - for col in right: - assert col in left - - # ----------------------------------------------------------------------------- # Others @@ -2876,30 +2738,6 @@ def _constructor_sliced(self): return SubclassedSeries -class SubclassedSparseSeries(pd.SparseSeries): - _metadata = ["testattr"] - - @property - def _constructor(self): - return SubclassedSparseSeries - - @property - def _constructor_expanddim(self): - return SubclassedSparseDataFrame - - -class SubclassedSparseDataFrame(pd.SparseDataFrame): - _metadata = ["testattr"] - - @property - def _constructor(self): - return SubclassedSparseDataFrame - - @property - def _constructor_sliced(self): - return SubclassedSparseSeries - - class SubclassedCategorical(Categorical): @property def _constructor(self):