implement _index_data parts of #24024 (#24379)

pandas-dev · Dec 21, 2018 · 04a0eac
1 parent 5d134ec
commit 04a0eac
Show file tree

Hide file tree

Showing 5 changed files with 22 additions and 3 deletions.
diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
@@ -265,7 +265,10 @@ cdef class SeriesBinGrouper:
                     cached_typ = self.typ(vslider.buf, index=cached_ityp,
                                           name=name)
                 else:
-                    object.__setattr__(cached_ityp, '_data', islider.buf)
+                    # See the comment in indexes/base.py about _index_data.
+                    # We need this for EA-backed indexes that have a reference
+                    # to a 1-d ndarray like datetime / timedelta / period.
+                    object.__setattr__(cached_ityp, '_index_data', islider.buf)
                     cached_ityp._engine.clear_mapping()
                     object.__setattr__(
                         cached_typ._data._block, 'values', vslider.buf)
@@ -569,8 +572,11 @@ cdef class BlockSlider:
             util.set_array_not_contiguous(x)
 
         self.nblocks = len(self.blocks)
+        # See the comment in indexes/base.py about _index_data.
+        # We need this for EA-backed indexes that have a reference to a 1-d
+        # ndarray like datetime / timedelta / period.
         self.idx_slider = Slider(
-            self.frame.index.values, self.dummy.index.values)
+            self.frame.index._index_data, self.dummy.index._index_data)
 
         self.base_ptrs = <char**>malloc(sizeof(char*) * len(self.blocks))
         for i, block in enumerate(self.blocks):
@@ -594,7 +600,8 @@ cdef class BlockSlider:
 
         # move and set the index
         self.idx_slider.move(start, end)
-        object.__setattr__(self.index, '_data', self.idx_slider.buf)
+
+        object.__setattr__(self.index, '_index_data', self.idx_slider.buf)
         self.index._engine.clear_mapping()
 
     cdef reset(self):

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -519,6 +519,12 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs):
 
         result = object.__new__(cls)
         result._data = values
+        # _index_data is a (temporary?) fix to ensure that the direct data
+        # manipulation we do in `_libs/reduction.pyx` continues to work.
+        # We need access to the actual ndarray, since we're messing with
+        # data buffers and strides. We don't re-use `_ndarray_values`, since
+        # we actually set this value too.
+        result._index_data = values
         result.name = name
         for k, v in compat.iteritems(kwargs):
             setattr(result, k, v)

diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
@@ -269,6 +269,8 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None):
 
         result = super(DatetimeIndex, cls)._simple_new(values, freq, tz)
         result.name = name
+        # For groupby perf. See note in indexes/base about _index_data
+        result._index_data = result._data
         result._reset_identity()
         return result
 

diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
@@ -235,6 +235,8 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs):
             raise TypeError("PeriodIndex._simple_new only accepts PeriodArray")
         result = object.__new__(cls)
         result._data = values
+        # For groupby perf. See note in indexes/base about _index_data
+        result._index_data = values._data
         result.name = name
         result._reset_identity()
         return result

diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py
@@ -199,6 +199,8 @@ def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE):
 
         result = super(TimedeltaIndex, cls)._simple_new(values, freq)
         result.name = name
+        # For groupby perf. See note in indexes/base about _index_data
+        result._index_data = result._data
         result._reset_identity()
         return result