From 7d8cd683460e21c938fab3c4597efa04f75cb2b1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 21 Dec 2018 06:19:48 -0800 Subject: [PATCH] implement _index_data parts of #24024 (#24379) --- pandas/_libs/reduction.pyx | 13 ++++++++++--- pandas/core/indexes/base.py | 6 ++++++ pandas/core/indexes/datetimes.py | 2 ++ pandas/core/indexes/period.py | 2 ++ pandas/core/indexes/timedeltas.py | 2 ++ 5 files changed, 22 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 6f892c928805ed..a61295f781901a 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -265,7 +265,10 @@ cdef class SeriesBinGrouper: cached_typ = self.typ(vslider.buf, index=cached_ityp, name=name) else: - object.__setattr__(cached_ityp, '_data', islider.buf) + # See the comment in indexes/base.py about _index_data. + # We need this for EA-backed indexes that have a reference + # to a 1-d ndarray like datetime / timedelta / period. + object.__setattr__(cached_ityp, '_index_data', islider.buf) cached_ityp._engine.clear_mapping() object.__setattr__( cached_typ._data._block, 'values', vslider.buf) @@ -569,8 +572,11 @@ cdef class BlockSlider: util.set_array_not_contiguous(x) self.nblocks = len(self.blocks) + # See the comment in indexes/base.py about _index_data. + # We need this for EA-backed indexes that have a reference to a 1-d + # ndarray like datetime / timedelta / period. self.idx_slider = Slider( - self.frame.index.values, self.dummy.index.values) + self.frame.index._index_data, self.dummy.index._index_data) self.base_ptrs = malloc(sizeof(char*) * len(self.blocks)) for i, block in enumerate(self.blocks): @@ -594,7 +600,8 @@ cdef class BlockSlider: # move and set the index self.idx_slider.move(start, end) - object.__setattr__(self.index, '_data', self.idx_slider.buf) + + object.__setattr__(self.index, '_index_data', self.idx_slider.buf) self.index._engine.clear_mapping() cdef reset(self): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index cc6f182fadce6f..a2cf88fa9cb1a2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -519,6 +519,12 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): result = object.__new__(cls) result._data = values + # _index_data is a (temporary?) fix to ensure that the direct data + # manipulation we do in `_libs/reduction.pyx` continues to work. + # We need access to the actual ndarray, since we're messing with + # data buffers and strides. We don't re-use `_ndarray_values`, since + # we actually set this value too. + result._index_data = values result.name = name for k, v in compat.iteritems(kwargs): setattr(result, k, v) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 1c966ab58e8c4f..0e4132524045c5 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -269,6 +269,8 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None): result = super(DatetimeIndex, cls)._simple_new(values, freq, tz) result.name = name + # For groupby perf. See note in indexes/base about _index_data + result._index_data = result._data result._reset_identity() return result diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 7ece1eaf547c86..17666cd651a509 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -235,6 +235,8 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs): raise TypeError("PeriodIndex._simple_new only accepts PeriodArray") result = object.__new__(cls) result._data = values + # For groupby perf. See note in indexes/base about _index_data + result._index_data = values._data result.name = name result._reset_identity() return result diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 5d52696992c308..e6c714683979fa 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -199,6 +199,8 @@ def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE): result = super(TimedeltaIndex, cls)._simple_new(values, freq) result.name = name + # For groupby perf. See note in indexes/base about _index_data + result._index_data = result._data result._reset_identity() return result