Merge pull request pandas-dev#6690 from jreback/resample_bin

BUG: Bug in resample with extra bins when using an evenly divisible freq (GH4076)
jsexauer · Mar 23, 2014 · 7ffa655 · 7ffa655
2 parents 83b1ce4 + 4214a17
commit 7ffa655
Show file tree

Hide file tree

Showing 5 changed files with 55 additions and 20 deletions.
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -185,7 +185,7 @@ Improvements to existing features
 - Performance improvement when converting ``DatetimeIndex`` to floating ordinals
   using ``DatetimeConverter`` (:issue:`6636`)
 - Performance improvement for  ``DataFrame.shift`` (:issue: `5609`)
-  
+
 .. _release.bug_fixes-0.14.0:
 
 Bug Fixes
@@ -270,6 +270,7 @@ Bug Fixes
 - Bug in compat with ``np.compress``, surfaced in (:issue:`6658`)
 - Bug in binary operations with a rhs of a Series not aligning (:issue:`6681`)
 - Bug in ``DataFrame.to_stata`` which incorrectly handles nan values and ignores 'with_index' keyword argument (:issue:`6685`)
+- Bug in ``resample`` producing extra bins when using an evenly divisible frequency (:issue:`4076`)
 
 pandas 0.13.1
 -------------

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -1486,25 +1486,20 @@ def get_iterator(self, data, axis=0):
         Generator yielding sequence of (name, subsetted object)
         for each group
         """
-        if axis == 0:
-            start = 0
-            for edge, label in zip(self.bins, self.binlabels):
-                yield label, data[start:edge]
-                start = edge
-
-            if start < len(data):
-                yield self.binlabels[-1], data[start:]
+        if isinstance(data, NDFrame):
+            slicer = lambda start,edge: data._slice(slice(start,edge),axis=axis)
+            length = len(data.axes[axis])
         else:
-            start = 0
-            for edge, label in zip(self.bins, self.binlabels):
-                inds = lrange(start, edge)
-                yield label, data.take(inds, axis=axis)
-                start = edge
-
-            n = len(data.axes[axis])
-            if start < n:
-                inds = lrange(start, n)
-                yield self.binlabels[-1], data.take(inds, axis=axis)
+            slicer = lambda start,edge: data[slice(start,edge)]
+            length = len(data)
+
+        start = 0
+        for edge, label in zip(self.bins, self.binlabels):
+            yield label, slicer(start,edge)
+            start = edge
+
+        if start < length:
+            yield self.binlabels[-1], slicer(start,None)
 
     def apply(self, f, data, axis=0):
         result_keys = []

diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -2864,7 +2864,8 @@ def test_groupby_with_timegrouper(self):
         df = df.set_index(['Date'])
 
         expected = DataFrame({ 'Quantity' : np.nan },
-                             index=date_range('20130901 13:00:00','20131205 13:00:00',freq='5D',name='Date'))
+                             index=date_range('20130901 13:00:00','20131205 13:00:00',
+                                              freq='5D',name='Date',closed='left'))
         expected.iloc[[0,6,18],0] = np.array([24.,6.,9.],dtype='float64')
 
         result1 = df.resample('5D',how=sum)

diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py
@@ -185,6 +185,12 @@ def _get_time_bins(self, ax):
             elif not trimmed:
                 labels = labels[:-1]
 
+        # if we end up with more labels than bins
+        # adjust the labels
+        # GH4076
+        if len(bins) < len(labels):
+            labels = labels[:len(bins)]
+
         return binner, bins, labels
 
     def _adjust_bin_edges(self, binner, ax_values):

diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py
@@ -1087,6 +1087,38 @@ def test_resample_doesnt_truncate(self):
         result = series.resample('D')
         self.assertEquals(result.index[0], dates[0])
 
+    def test_evenly_divisible_with_no_extra_bins(self):
+        # GH4076
+        # when the frequency is evenly divisible, extra bins were sometimes created
+
+        df = DataFrame(np.random.randn(9, 3), index=date_range('2000-1-1', periods=9))
+        result = df.resample('5D')
+        expected = pd.concat([df.iloc[0:5].mean(),df.iloc[5:].mean()],axis=1).T
+        expected.index = [Timestamp('2000-1-1'),Timestamp('2000-1-6')]
+        assert_frame_equal(result,expected)
+
+        index = date_range(start='2001-5-4', periods=28)
+        df = DataFrame(
+            [{'REST_KEY': 1, 'DLY_TRN_QT': 80, 'DLY_SLS_AMT': 90,
+              'COOP_DLY_TRN_QT': 30, 'COOP_DLY_SLS_AMT': 20}] * 28 +
+            [{'REST_KEY': 2, 'DLY_TRN_QT': 70, 'DLY_SLS_AMT': 10,
+              'COOP_DLY_TRN_QT': 50, 'COOP_DLY_SLS_AMT': 20}] * 28,
+            index=index.append(index)).sort()
+
+        index = date_range('2001-5-4',periods=4,freq='7D')
+        expected = DataFrame(
+            [{'REST_KEY': 14, 'DLY_TRN_QT': 14, 'DLY_SLS_AMT': 14,
+              'COOP_DLY_TRN_QT': 14, 'COOP_DLY_SLS_AMT': 14}] * 4,
+            index=index).unstack().swaplevel(1,0).sortlevel()
+        result = df.resample('7D', how='count')
+        assert_series_equal(result,expected)
+
+        expected = DataFrame(
+            [{'REST_KEY': 21, 'DLY_TRN_QT': 1050, 'DLY_SLS_AMT': 700,
+              'COOP_DLY_TRN_QT': 560, 'COOP_DLY_SLS_AMT': 280}] * 4,
+            index=index)
+        result = df.resample('7D', how='sum')
+        assert_frame_equal(result,expected)
 
 class TestTimeGrouper(tm.TestCase):