ENH: initial version of convert_to_annual for pandas, #736

pandas-dev · Apr 6, 2012 · 5b4be0b · 5b4be0b
1 parent 570a03a
commit 5b4be0b
Show file tree

Hide file tree

Showing 6 changed files with 154 additions and 5 deletions.
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
@@ -130,9 +130,6 @@ def get_result(self):
         return DataFrame(values, index=index, columns=columns)
 
     def get_new_values(self):
-        return self._reshape_values(self.values)
-
-    def _reshape_values(self, values):
         values = self.values
         # place the values
         length, width = self.full_shape
@@ -148,7 +145,7 @@ def _reshape_values(self, values):
         new_values.fill(np.nan)
 
         # is there a simpler / faster way of doing this?
-        for i in xrange(self.values.shape[1]):
+        for i in xrange(values.shape[1]):
             chunk = new_values[:, i * width : (i + 1) * width]
             mask_chunk = new_mask[:, i * width : (i + 1) * width]
 
@@ -200,6 +197,8 @@ def get_new_index(self):
 
         return new_index
 
+
+
 def pivot(self, index=None, columns=None, values=None):
     """
     See DataFrame.pivot

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -839,7 +839,8 @@ def copy(self, order='C'):
         -------
         cp : Series
         """
-        return Series(self.values.copy(order), index=self.index, name=self.name)
+        return Series(self.values.copy(order), index=self.index,
+                      name=self.name)
 
     def to_dict(self):
         """

diff --git a/pandas/tseries/__init__.py b/pandas/tseries/__init__.py
diff --git a/pandas/tseries/tests/__init__.py b/pandas/tseries/tests/__init__.py
diff --git a/pandas/tseries/tests/test_tools.py b/pandas/tseries/tests/test_tools.py
@@ -0,0 +1,64 @@
+import nose
+import unittest
+
+import numpy as np
+
+from pandas import Series, date_range
+import pandas.util.testing as tm
+
+from pandas.tseries.tools import convert_to_annual, isleapyear
+
+class TestConvertAnnual(unittest.TestCase):
+    """
+    Tests for the pandas port of scikits.timeseries' convert_to_annual.
+    """
+    def test_daily(self):
+        rng = date_range('1/1/2000', '12/31/2004', freq='D')
+        ts = Series(np.random.randn(len(rng)), index=rng)
+
+        annual = convert_to_annual(ts, 'D')
+
+        # expected 1-based column: the day of year on a leap-year calendar
+        doy = np.array(ts.index.dayofyear)  # copy; never mutate index data
+        doy[~isleapyear(ts.index.year) & (doy >= 60)] += 1
+
+        for i in range(1, 367):
+            subset = ts[doy == i]
+            subset.index = [x.year for x in subset.index]
+            tm.assert_series_equal(annual[i].dropna(), subset)
+
+        # check leap days
+        leaps = ts[(ts.index.month == 2) & (ts.index.day == 29)]
+        day = leaps.index.dayofyear[0]
+        leaps.index = leaps.index.year
+        tm.assert_series_equal(annual[day].dropna(), leaps)
+
+    def test_weekly(self):
+        pass
+
+    def test_monthly(self):
+        rng = date_range('1/1/2000', '12/31/2004', freq='M')
+        ts = Series(np.random.randn(len(rng)), index=rng)
+
+        annual = convert_to_annual(ts, 'M')
+
+        month = ts.index.month
+
+        for i in range(1, 13):
+            subset = ts[month == i]
+            subset.index = [x.year for x in subset.index]
+            tm.assert_series_equal(annual[i].dropna(), subset)
+
+    def test_interval_monthly(self):
+        pass
+
+    def test_interval_daily(self):
+        pass
+
+    def test_interval_weekly(self):
+        pass
+
+if __name__ == '__main__':
+    nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],
+                   exit=False)
+
diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py
@@ -0,0 +1,85 @@
+from pandas.core.frame import DataFrame
+import pandas.core.nanops as nanops
+
+import numpy as np
+
+def convert_to_annual(series, freq=None):
+    """
+    Group a series by years, taking leap years into account.
+
+    The output has as many rows as distinct years in the original series,
+    and as many columns as there are periods in a leap year at the given
+    frequency: 366 for daily data, 12 for monthly data. The first column
+    corresponds to the start of the year and the last column to its end.
+    Slots without an observation are filled with NaN.
+
+    For daily data the columns follow a leap-year calendar: the 59th column
+    always corresponds to Feb. 28th, the 61st column to Mar. 1st, and the
+    60th column (Feb. 29th) is left as NaN for non-leap years.
+
+    Parameters
+    ----------
+    series : TimeSeries
+        Its index must provide ``year`` plus ``dayofyear`` or ``month``.
+    freq : string or None, default None
+        Frequency of ``series``, case-insensitive: 'D' for daily data,
+        'M' or 'BM' for monthly data.
+
+    Returns
+    -------
+    annual : DataFrame
+        One row per distinct year (sorted); columns are labelled 1..366
+        for daily data and 1..12 for monthly data. Integer input is
+        upcast to float64 so that empty slots can hold NaN.
+
+    Raises
+    ------
+    NotImplementedError
+        If ``freq`` is None or any other unsupported frequency.
+    """
+    index = series.index
+    year = np.asarray(index.year)
+    # Row of each observation = rank of its year among the sorted distinct
+    # years. Unlike ``year - years.min()``, this stays within the
+    # len(years) rows when the series skips whole years.
+    years, row = np.unique(year, return_inverse=True)
+
+    if freq is not None:
+        freq = freq.upper()
+
+    if freq == 'D':
+        width = 366
+        offset = np.asarray(index.dayofyear) - 1
+        # Non-leap years skip the Feb. 29th slot. ``~`` negates the boolean
+        # mask; unary minus is rejected for boolean arrays by NumPy.
+        offset[~isleapyear(year) & (offset >= 59)] += 1
+
+        columns = range(1, 367)
+        # todo: strings like 1/1, 1/25, etc.?
+    elif freq in ('M', 'BM'):
+        width = 12
+        offset = np.asarray(index.month) - 1
+        columns = range(1, 13)
+    else:
+        raise NotImplementedError(freq)
+
+    # Integers cannot represent NaN, so integer data is stored as float64.
+    is_int = np.issubdtype(series.dtype, np.integer)
+    values = np.empty((len(years), width), float if is_int else series.dtype)
+    values.fill(np.nan)
+    values.put(row * width + offset, series.values)
+    return DataFrame(values, index=years, columns=columns)
+
+def isleapyear(year):
+    """
+    Test whether each given year is a Gregorian leap year.
+
+    ``year`` is an integer or a sequence of integers; the result is a
+    boolean (scalar or array) of matching shape, True for leap years.
+    """
+    yrs = np.asarray(year)
+    every_fourth = yrs % 4 == 0
+    not_century = yrs % 100 > 0
+    every_400th = yrs % 400 == 0
+    return every_400th | (every_fourth & not_century)
+