From 694cb20aa5cfcfab9cf43ac3681fb7a8ca6c00b9 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 18 Mar 2020 17:24:23 -0700 Subject: [PATCH] CLN: remove _ndarray_values (#32768) --- doc/source/development/internals.rst | 10 ++---- doc/source/reference/extensions.rst | 1 - pandas/core/arrays/base.py | 17 ---------- pandas/core/arrays/categorical.py | 13 ++------ pandas/core/arrays/datetimelike.py | 4 --- pandas/core/arrays/integer.py | 12 ------- pandas/core/base.py | 17 ---------- pandas/core/indexes/base.py | 31 ++++++++----------- pandas/core/indexes/datetimelike.py | 2 +- pandas/core/indexes/extension.py | 4 --- pandas/core/series.py | 22 ++++++------- pandas/tests/base/test_conversion.py | 28 ----------------- pandas/tests/indexes/common.py | 10 +----- .../indexes/interval/test_constructors.py | 4 +-- .../tests/indexes/interval/test_interval.py | 2 +- .../tests/indexes/period/test_constructors.py | 4 +-- pandas/tests/indexes/period/test_period.py | 6 ++-- pandas/tests/reductions/test_reductions.py | 4 +-- 18 files changed, 37 insertions(+), 154 deletions(-) diff --git a/doc/source/development/internals.rst b/doc/source/development/internals.rst index 4ad045a91b5fe..8f1c3d5d818c2 100644 --- a/doc/source/development/internals.rst +++ b/doc/source/development/internals.rst @@ -89,16 +89,10 @@ pandas extends NumPy's type system with custom types, like ``Categorical`` or datetimes with a timezone, so we have multiple notions of "values". For 1-D containers (``Index`` classes and ``Series``) we have the following convention: -* ``cls._ndarray_values`` is *always* a NumPy ``ndarray``. Ideally, - ``_ndarray_values`` is cheap to compute. For example, for a ``Categorical``, - this returns the codes, not the array of objects. * ``cls._values`` refers is the "best possible" array. This could be an - ``ndarray``, ``ExtensionArray``, or in ``Index`` subclass (note: we're in the - process of removing the index subclasses here so that it's always an - ``ndarray`` or ``ExtensionArray``). + ``ndarray`` or ``ExtensionArray``. -So, for example, ``Series[category]._values`` is a ``Categorical``, while -``Series[category]._ndarray_values`` is the underlying codes. +So, for example, ``Series[category]._values`` is a ``Categorical``. .. _ref-subclassing-pandas: diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 78fdfbfd28144..4c0763e091b75 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -37,7 +37,6 @@ objects. api.extensions.ExtensionArray._from_factorized api.extensions.ExtensionArray._from_sequence api.extensions.ExtensionArray._from_sequence_of_strings - api.extensions.ExtensionArray._ndarray_values api.extensions.ExtensionArray._reduce api.extensions.ExtensionArray._values_for_argsort api.extensions.ExtensionArray._values_for_factorize diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c42c1539daa5a..ab24beb0da4fc 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -93,7 +93,6 @@ class ExtensionArray: _from_factorized _from_sequence _from_sequence_of_strings - _ndarray_values _reduce _values_for_argsort _values_for_factorize @@ -1046,22 +1045,6 @@ def _concat_same_type( # of objects _can_hold_na = True - @property - def _ndarray_values(self) -> np.ndarray: - """ - Internal pandas method for lossy conversion to a NumPy ndarray. - - This method is not part of the pandas interface. - - The expectation is that this is cheap to compute, and is primarily - used for interacting with our indexers. - - Returns - ------- - array : ndarray - """ - return np.array(self) - def _reduce(self, name, skipna=True, **kwargs): """ Return a scalar result of performing the reduction operation. diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 497a9893e6c66..bfccc6f244219 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -451,10 +451,6 @@ def dtype(self) -> CategoricalDtype: """ return self._dtype - @property - def _ndarray_values(self) -> np.ndarray: - return self.codes - @property def _constructor(self) -> Type["Categorical"]: return Categorical @@ -2567,12 +2563,7 @@ def _get_codes_for_values(values, categories): """ dtype_equal = is_dtype_equal(values.dtype, categories.dtype) - if dtype_equal: - # To prevent erroneous dtype coercion in _get_data_algo, retrieve - # the underlying numpy array. gh-22702 - values = getattr(values, "_ndarray_values", values) - categories = getattr(categories, "_ndarray_values", categories) - elif is_extension_array_dtype(categories.dtype) and is_object_dtype(values): + if is_extension_array_dtype(categories.dtype) and is_object_dtype(values): # Support inferring the correct extension dtype from an array of # scalar objects. e.g. # Categorical(array[Period, Period], categories=PeriodIndex(...)) @@ -2582,7 +2573,7 @@ def _get_codes_for_values(values, categories): # exception raised in _from_sequence values = ensure_object(values) categories = ensure_object(categories) - else: + elif not dtype_equal: values = ensure_object(values) categories = ensure_object(categories) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 7510bfd1f67ad..7cf50ff2b88af 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -456,10 +456,6 @@ def asi8(self) -> np.ndarray: # do not cache or you'll create a memory leak return self._data.view("i8") - @property - def _ndarray_values(self): - return self._data - # ---------------------------------------------------------------- # Rendering Methods diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index fb33840ad757c..f2880c5cbee42 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -478,18 +478,6 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: data = self.to_numpy(dtype=dtype, **kwargs) return astype_nansafe(data, dtype, copy=False) - @property - def _ndarray_values(self) -> np.ndarray: - """ - Internal pandas method for lossy conversion to a NumPy ndarray. - - This method is not part of the pandas interface. - - The expectation is that this is cheap to compute, and is primarily - used for interacting with our indexers. - """ - return self._data - def _values_for_factorize(self) -> Tuple[np.ndarray, float]: # TODO: https://github.com/pandas-dev/pandas/issues/30037 # use masked algorithms, rather than object-dtype / np.nan. diff --git a/pandas/core/base.py b/pandas/core/base.py index bf2ed02c57a29..9281d2f72b409 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -855,23 +855,6 @@ def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default, **kwargs): result[self.isna()] = na_value return result - @property - def _ndarray_values(self) -> np.ndarray: - """ - The data as an ndarray, possibly losing information. - - The expectation is that this is cheap to compute, and is primarily - used for interacting with our indexers. - - - categorical -> codes - """ - if is_extension_array_dtype(self): - return self.array._ndarray_values - # As a mixin, we depend on the mixing class having values. - # Special mixin syntax may be developed in the future: - # https://github.com/python/typing/issues/246 - return self.values # type: ignore - @property def empty(self): return not self.size diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 162d69d957669..4fa771dfbcf82 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -464,8 +464,7 @@ def _simple_new(cls, values, name: Label = None): # _index_data is a (temporary?) fix to ensure that the direct data # manipulation we do in `_libs/reduction.pyx` continues to work. # We need access to the actual ndarray, since we're messing with - # data buffers and strides. We don't re-use `_ndarray_values`, since - # we actually set this value too. + # data buffers and strides. result._index_data = values result._name = name result._cache = {} @@ -625,7 +624,8 @@ def ravel(self, order="C"): -------- numpy.ndarray.ravel """ - return self._ndarray_values.ravel(order=order) + values = self._get_engine_target() + return values.ravel(order=order) def view(self, cls=None): @@ -3846,29 +3846,24 @@ def _values(self) -> Union[ExtensionArray, np.ndarray]: """ The best array representation. - This is an ndarray or ExtensionArray. This differs from - ``_ndarray_values``, which always returns an ndarray. + This is an ndarray or ExtensionArray. - Both ``_values`` and ``_ndarray_values`` are consistent between - ``Series`` and ``Index`` (except for datetime64[ns], which returns - a DatetimeArray for _values on the Index, but ndarray[M8ns] on the - Series). + ``_values`` are consistent between``Series`` and ``Index``. It may differ from the public '.values' method. - index | values | _values | _ndarray_values | - ----------------- | --------------- | ------------- | --------------- | - Index | ndarray | ndarray | ndarray | - CategoricalIndex | Categorical | Categorical | ndarray[int] | - DatetimeIndex | ndarray[M8ns] | DatetimeArray | ndarray[M8ns] | - DatetimeIndex[tz] | ndarray[M8ns] | DatetimeArray | ndarray[M8ns] | - PeriodIndex | ndarray[object] | PeriodArray | ndarray[int] | - IntervalIndex | IntervalArray | IntervalArray | ndarray[object] | + index | values | _values | + ----------------- | --------------- | ------------- | + Index | ndarray | ndarray | + CategoricalIndex | Categorical | Categorical | + DatetimeIndex | ndarray[M8ns] | DatetimeArray | + DatetimeIndex[tz] | ndarray[M8ns] | DatetimeArray | + PeriodIndex | ndarray[object] | PeriodArray | + IntervalIndex | IntervalArray | IntervalArray | See Also -------- values - _ndarray_values """ return self._data diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 9c55d2de946a8..2f641a3d4c111 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -179,7 +179,7 @@ def sort_values(self, return_indexer=False, ascending=True): sorted_index = self.take(_as) return sorted_index, _as else: - # NB: using asi8 instead of _ndarray_values matters in numpy 1.18 + # NB: using asi8 instead of _data matters in numpy 1.18 # because the treatment of NaT has been changed to put NaT last # instead of first. sorted_values = np.sort(self.asi8) diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 6d5f0dbb830f9..6851aeec0ca40 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -228,10 +228,6 @@ def __iter__(self): def __array__(self, dtype=None) -> np.ndarray: return np.asarray(self._data, dtype=dtype) - @property - def _ndarray_values(self) -> np.ndarray: - return self._data._ndarray_values - def _get_engine_target(self) -> np.ndarray: return self._data._values_for_argsort() diff --git a/pandas/core/series.py b/pandas/core/series.py index 21477cce48e63..006a98a6cddcb 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -550,21 +550,17 @@ def _values(self): timedelta64 dtypes), while ``.array`` ensures to always return an ExtensionArray. - Differs from ``._ndarray_values``, as that ensures to always return a - numpy array (it will call ``_ndarray_values`` on the ExtensionArray, if - the Series was backed by an ExtensionArray). - Overview: - dtype | values | _values | array | _ndarray_values | - ----------- | ------------- | ------------- | ------------- | --------------- | - Numeric | ndarray | ndarray | PandasArray | ndarray | - Category | Categorical | Categorical | Categorical | ndarray[int] | - dt64[ns] | ndarray[M8ns] | DatetimeArray | DatetimeArray | ndarray[M8ns] | - dt64[ns tz] | ndarray[M8ns] | DatetimeArray | DatetimeArray | ndarray[M8ns] | - td64[ns] | ndarray[m8ns] | TimedeltaArray| ndarray[m8ns] | ndarray[m8ns] | - Period | ndarray[obj] | PeriodArray | PeriodArray | ndarray[int] | - Nullable | EA | EA | EA | ndarray | + dtype | values | _values | array | + ----------- | ------------- | ------------- | ------------- | + Numeric | ndarray | ndarray | PandasArray | + Category | Categorical | Categorical | Categorical | + dt64[ns] | ndarray[M8ns] | DatetimeArray | DatetimeArray | + dt64[ns tz] | ndarray[M8ns] | DatetimeArray | DatetimeArray | + td64[ns] | ndarray[m8ns] | TimedeltaArray| ndarray[m8ns] | + Period | ndarray[obj] | PeriodArray | PeriodArray | + Nullable | EA | EA | EA | """ return self._data.internal_values() diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 46fd1551e6170..59f9103072fe9 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -220,34 +220,6 @@ def test_values_consistent(array, expected_type, dtype): tm.assert_equal(l_values, r_values) -@pytest.mark.parametrize( - "array, expected", - [ - (np.array([0, 1], dtype=np.int64), np.array([0, 1], dtype=np.int64)), - (np.array(["0", "1"]), np.array(["0", "1"], dtype=object)), - (pd.Categorical(["a", "a"]), np.array([0, 0], dtype="int8")), - ( - pd.DatetimeIndex(["2017-01-01T00:00:00"]), - np.array(["2017-01-01T00:00:00"], dtype="M8[ns]"), - ), - ( - pd.DatetimeIndex(["2017-01-01T00:00:00"], tz="US/Eastern"), - np.array(["2017-01-01T05:00:00"], dtype="M8[ns]"), - ), - (pd.TimedeltaIndex([10 ** 10]), np.array([10 ** 10], dtype="m8[ns]")), - ( - pd.PeriodIndex(["2017", "2018"], freq="D"), - np.array([17167, 17532], dtype=np.int64), - ), - ], -) -def test_ndarray_values(array, expected): - l_values = pd.Series(array)._ndarray_values - r_values = pd.Index(array)._ndarray_values - tm.assert_numpy_array_equal(l_values, r_values) - tm.assert_numpy_array_equal(l_values, expected) - - @pytest.mark.parametrize("arr", [np.array([1, 2, 3])]) def test_numpy_array(arr): ser = pd.Series(arr) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index c13385c135e9f..43f696e0b13db 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -313,16 +313,11 @@ def test_ensure_copied_data(self, indices): result = result.tz_localize("UTC").tz_convert(indices.tz) tm.assert_index_equal(indices, result) - tm.assert_numpy_array_equal( - indices._ndarray_values, result._ndarray_values, check_same="copy" - ) if isinstance(indices, PeriodIndex): # .values an object array of Period, thus copied result = index_type(ordinal=indices.asi8, copy=False, **init_kwargs) - tm.assert_numpy_array_equal( - indices._ndarray_values, result._ndarray_values, check_same="same" - ) + tm.assert_numpy_array_equal(indices.asi8, result.asi8, check_same="same") elif isinstance(indices, IntervalIndex): # checked in test_interval.py pass @@ -331,9 +326,6 @@ def test_ensure_copied_data(self, indices): tm.assert_numpy_array_equal( indices.values, result.values, check_same="same" ) - tm.assert_numpy_array_equal( - indices._ndarray_values, result._ndarray_values, check_same="same" - ) def test_memory_usage(self, indices): indices._engine.clear_mapping() diff --git a/pandas/tests/indexes/interval/test_constructors.py b/pandas/tests/indexes/interval/test_constructors.py index 837c124db2bed..fa881df8139c6 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -91,7 +91,7 @@ def test_constructor_nan(self, constructor, breaks, closed): assert result.closed == closed assert result.dtype.subtype == expected_subtype - tm.assert_numpy_array_equal(result._ndarray_values, expected_values) + tm.assert_numpy_array_equal(np.array(result), expected_values) @pytest.mark.parametrize( "breaks", @@ -114,7 +114,7 @@ def test_constructor_empty(self, constructor, breaks, closed): assert result.empty assert result.closed == closed assert result.dtype.subtype == expected_subtype - tm.assert_numpy_array_equal(result._ndarray_values, expected_values) + tm.assert_numpy_array_equal(np.array(result), expected_values) @pytest.mark.parametrize( "breaks", diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index c2b209c810af9..efdd3fc9907a2 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -147,7 +147,7 @@ def test_ensure_copied_data(self, closed): ) # by-definition make a copy - result = IntervalIndex(index._ndarray_values, copy=False) + result = IntervalIndex(np.array(index), copy=False) tm.assert_numpy_array_equal( index.left.values, result.left.values, check_same="copy" ) diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index b5ff83ec7514d..cb2140d0b4025 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -147,9 +147,9 @@ def test_constructor_fromarraylike(self): msg = "freq not specified and cannot be inferred" with pytest.raises(ValueError, match=msg): - PeriodIndex(idx._ndarray_values) + PeriodIndex(idx.asi8) with pytest.raises(ValueError, match=msg): - PeriodIndex(list(idx._ndarray_values)) + PeriodIndex(list(idx.asi8)) msg = "'Period' object is not iterable" with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 03f0be3f368cb..df2f85cd7f1e2 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -161,7 +161,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.to_numpy(), exp) exp = np.array([], dtype=np.int64) - tm.assert_numpy_array_equal(idx._ndarray_values, exp) + tm.assert_numpy_array_equal(idx.asi8, exp) idx = PeriodIndex(["2011-01", NaT], freq="M") @@ -169,7 +169,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.to_numpy(), exp) exp = np.array([492, -9223372036854775808], dtype=np.int64) - tm.assert_numpy_array_equal(idx._ndarray_values, exp) + tm.assert_numpy_array_equal(idx.asi8, exp) idx = PeriodIndex(["2011-01-01", NaT], freq="D") @@ -177,7 +177,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.to_numpy(), exp) exp = np.array([14975, -9223372036854775808], dtype=np.int64) - tm.assert_numpy_array_equal(idx._ndarray_values, exp) + tm.assert_numpy_array_equal(idx.asi8, exp) def test_period_index_length(self): pi = period_range(freq="A", start="1/1/2001", end="12/1/2009") diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 211d0d52d8357..abd99aadfb484 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -55,9 +55,7 @@ def test_ops(self, opname, obj): if not isinstance(obj, PeriodIndex): expected = getattr(obj.values, opname)() else: - expected = pd.Period( - ordinal=getattr(obj._ndarray_values, opname)(), freq=obj.freq - ) + expected = pd.Period(ordinal=getattr(obj.asi8, opname)(), freq=obj.freq) try: assert result == expected except TypeError: