pivot hourly using a new result shape

timmie · Nov 1, 2012 · fcde169 · fcde169
1 parent a3d10ba
commit fcde169
Show file tree

Hide file tree

Showing 3 changed files with 262 additions and 0 deletions.
diff --git a/pandas/tseries/pivot.py b/pandas/tseries/pivot.py
@@ -0,0 +1,207 @@
+import numpy as np
+
+from pandas.core.frame import DataFrame
+import pandas.core.nanops as nanops
+from pandas.tseries.util import isleapyear
+from pandas.tseries.index import date_range
+
+def pivot_annual_h(series, freq=None, dt_index=False):
+    """
+    Pivot an hourly series into one column per year, aligned on leap years.
+
+    The output has one row per hour of a leap year (366 * 24 = 8784 rows)
+    and one column per distinct year found in ``series.index``.  The first
+    row corresponds to Jan. 1st, 00:00 and the last row to Dec. 31st, 23:00.
+    For non-leap years the 24 rows belonging to Feb. 29th stay NaN, so a
+    given row always refers to the same calendar hour: with the default
+    1-based integer row labels, row 1416 (59 * 24) is always Feb. 28th,
+    23:00 and row 1441 (60 * 24 + 1) is always Mar. 1st, 00:00.
+
+    Every value is placed according to its own timestamp (day of year and
+    hour), so years that are only partially covered by ``series`` simply
+    leave the uncovered hours as NaN.
+
+    Parameters
+    ----------
+    series : TimeSeries
+        Series with a datetime index (``year``, ``dayofyear`` and ``hour``
+        are read from ``series.index``).
+    freq : string or None, default None
+        Frequency alias.  If None, ``series.index.freqstr`` is used.  Only
+        hourly data ('H') is implemented.
+    dt_index : bool, default False
+        If True, the rows are labelled with the hourly datetime range
+        returned by ``default_rng()`` instead of the integers 1..8784.
+        CAUTION: the year of these timestamps is only a placeholder.
+
+    Returns
+    -------
+    annual : DataFrame
+        Rows are hours of a leap year, columns are years.
+
+    Raises
+    ------
+    NotImplementedError
+        For every frequency other than 'H'.  For 'D' and 'M' the message
+        points to ``pandas.tseries.util.pivot_annual``.
+    """
+    #TODO: test like original pandas and the position of first and last value in arrays
+    #TODO: use pivot_annual for D & M and minute in the same fashion
+    if freq is None:
+        freq = series.index.freqstr
+    if freq is not None:
+        freq = freq.upper()
+
+    if freq in ('D', 'M'):
+        raise NotImplementedError(
+            "%s: use pandas.tseries.util.pivot_annual" % freq)
+    if freq != 'H':
+        raise NotImplementedError(freq)
+
+    hours_per_day = 24
+    # number of hours in a leap year
+    total_hoy_leap = year_length('H')
+    # 0-based position of Feb. 29th, 00:00 within a leap year (Jan + Feb 1-28)
+    feb29_start = (31 + 28) * hours_per_day
+
+    index = series.index
+    index_years = index.year
+    years = nanops.unique1d(index_years)
+
+    # integer and boolean arrays cannot hold NaN, fall back to float there
+    dtype = series.dtype
+    if dtype.kind in 'iub':
+        dtype = np.float64
+
+    values = np.empty((total_hoy_leap, len(years)), dtype=dtype)
+    values.fill(np.nan)
+
+    # 0-based hour-of-year position of every observation in its own year
+    positions = ((np.asarray(index.dayofyear) - 1) * hours_per_day
+                 + np.asarray(index.hour))
+    series_values = series.values
+
+    for col, yr in enumerate(years):
+        in_year = index_years == yr
+        rows = positions[in_year]
+        if not isleapyear(yr):
+            # skip the Feb. 29th slot: shift everything from Mar. 1st on
+            rows = np.where(rows >= feb29_start, rows + hours_per_day, rows)
+        values[rows, col] = series_values[in_year]
+
+    if dt_index:
+        row_index = default_rng()
+    else:
+        row_index = range(1, total_hoy_leap + 1)
+
+    return DataFrame(values, index=row_index, columns=years)
+
+
+### timeseries pivoting helper
+
+def last_col2front(df, col_no=1):
+    """Return ``df`` with its trailing column(s) rotated to the front.
+
+    ``col_no`` is the number of columns taken from the end; their relative
+    order is kept, as is the order of the remaining columns.
+    """
+    names = df.columns.tolist()
+    tail, head = names[-col_no:], names[:-col_no]
+    return df[tail + head]
+
+
+def extended_info(df, time_cols=True, aggreg=True, aggreg_func=None,
+                  datetime_index=False):
+    """Add row-wise statistics and calendar columns to a timeseries pivot.
+
+    Works on a copy of ``df``; the input frame is not modified.  ``df`` must
+    have as many rows as the range returned by ``default_rng()``, because
+    the calendar columns are taken from that range.
+
+    Parameters
+    ----------
+    df : DataFrame
+        Pivot whose columns are the data columns to summarise.
+    time_cols : bool, default True
+        Currently unused; the calendar columns are always added.
+    aggreg : bool, default True
+        If True, append the columns 'mean', 'sum', 'min', 'max' and 'std',
+        each computed across the original data columns of every row.
+    aggreg_func : default None
+        Currently unused.
+    datetime_index : bool, default False
+        Currently unused.
+
+    Returns
+    -------
+    df_extended : DataFrame
+        Copy of ``df`` with 'doy', 'month', 'day' and 'hour' as the first
+        four columns, followed by the data columns and, if requested, the
+        aggregate columns.  'hour' is 1-based (``rng.hour + 1``).
+    """
+    df_extended = df.copy()
+    # aggregate only over the original data columns, not over columns
+    # added below
+    cols = df_extended.columns
+    #TODO: add standard aggregation
+    #TODO: make function be set by argument
+    #TODO: is there no a SM describe function?
+    #TODO: Maybe use http://pandas.pydata.org/pandas-docs/dev/basics.html#summarizing-data-describe
+    if aggreg:
+        data = df_extended[cols]
+        df_extended['mean'] = data.mean(axis=1)
+        df_extended['sum'] = data.sum(axis=1)
+        df_extended['min'] = data.min(axis=1)
+        df_extended['max'] = data.max(axis=1)
+        # the standard deviation gets its own column instead of
+        # overwriting 'max'
+        df_extended['std'] = data.std(axis=1)
+
+    #add some metadata
+    #TODO: add function to make index a datetime with the argument above using the rng below
+    #TODO: convert the range to lower frequencies and reuse the function.
+    rng = default_rng()
+    df_extended['doy'] = rng.dayofyear
+    df_extended['month'] = rng.month
+    df_extended['day'] = rng.day
+    df_extended['hour'] = rng.hour + 1
+    # move the four calendar columns just appended to the front
+    df_extended = last_col2front(df_extended, col_no=4)
+
+    return df_extended
+
+###Timeseries convenience / helper functions
+
+
+def year_length(freq, leap=True):
+    """Return the number of periods of frequency ``freq`` in one year.
+
+    Parameters
+    ----------
+    freq : string
+        Frequency alias, compared case-insensitively.  Supported are
+        'D' (daily), 'H' (hourly), 'T' / 'MIN' (minutely) and
+        'S' (secondly).
+    leap : bool, default True
+        If True the length of a leap year (366 days) is returned,
+        otherwise the length of a regular year (365 days).
+
+    Returns
+    -------
+    length : int
+
+    Raises
+    ------
+    NotImplementedError
+        If ``freq`` is not one of the supported aliases.
+    """
+    periods_per_day = {
+        'D': 1,
+        'H': 24,
+        'T': 24 * 60,
+        'MIN': 24 * 60,
+        'S': 24 * 60 * 60,
+    }
+
+    key = freq.upper() if hasattr(freq, 'upper') else freq
+    if key not in periods_per_day:
+        raise NotImplementedError(freq)
+
+    days_of_year = 366 if leap else 365
+    return periods_per_day[key] * days_of_year
+
+def default_rng(freq='H', leap=True):
+    """Create the default hourly date range covering one reference year.
+
+    Parameters
+    ----------
+    freq : string, default 'H'
+        NOTE: accepted for interface compatibility but currently ignored;
+        the range is always hourly.
+    leap : bool, default True
+        If True the range covers the leap year 2012 (366 * 24 periods),
+        otherwise the regular year 2011 (365 * 24 periods).
+
+    Returns
+    -------
+    rng : DatetimeIndex
+    """
+    if leap:
+        start = '1/1/2012'
+    else:
+        start = '1/1/2011'
+
+    # the period count has to follow ``leap``; otherwise the non-leap range
+    # would spill 24 hours into the following year
+    periods = year_length(freq='H', leap=leap)
+    rng = date_range(start, periods=periods, freq='H')
+
+    return rng
diff --git a/pandas/tseries/tests/test_util.py b/pandas/tseries/tests/test_util.py
@@ -7,11 +7,64 @@
 import pandas.util.testing as tm
 
 from pandas.tseries.util import pivot_annual, isleapyear
+from pandas.tseries import pivot
 
 class TestPivotAnnual(unittest.TestCase):
     """
     New pandas of scikits.timeseries pivot_annual
     """
+    def test_hourly(self):
+        """Check the shape, the boundaries and the leap-day handling of
+        ``pivot.pivot_annual_h`` for an hourly series."""
+        # 18 complete years starting in 1994, four of them leap years
+        # (1996, 2000, 2004, 2008): 18 * 8760 + 4 * 24 hourly values
+        rng_hourly = date_range('1/1/1994', periods=(18 * 8760 + 4 * 24),
+                                freq='H')
+        data_hourly = np.random.randint(100, high=350, size=rng_hourly.size)
+        data_hourly = data_hourly.astype('float64')
+        ts_hourly = Series(data_hourly, index=rng_hourly)
+
+        annual = pivot.pivot_annual_h(ts_hourly, dt_index=True)
+
+        def get_mdh(dt_index, pos):
+            # (month, day, hour) of the timestamp at position ``pos``
+            return (dt_index.month[pos], dt_index.day[pos],
+                    dt_index.hour[pos])
+
+        ### general
+        # first and last row carry the same month/day/hour as the first and
+        # last timestamp of the series
+        last_row = annual.index.size - 1
+        assert get_mdh(ts_hourly.index, 0) == get_mdh(annual.index, 0)
+        assert get_mdh(ts_hourly.index, -1) == get_mdh(annual.index, last_row)
+        # first value of the series sits in the first row of the first column
+        assert ts_hourly.values[0] == annual.values[0, 0]
+        # last value of the series sits in the last row of the last column
+        assert ts_hourly.values[-1] == annual.values[-1, -1]
+
+        ### index
+        # one row per hour of a leap year
+        assert annual.index.size == 8784
+
+        ### leap
+        # leap year 1996: the values of Feb. 29th are carried over unchanged
+        ser96_leap = ts_hourly[(ts_hourly.index.year == 1996) &
+                               (ts_hourly.index.month == 2) &
+                               (ts_hourly.index.day == 29)]
+
+        df96 = annual[1996]
+        df96_leap = df96[(df96.index.month == 2) & (df96.index.day == 29)]
+        # compare values only: the pivot index carries a placeholder year
+        np.testing.assert_array_equal(ser96_leap.values, df96_leap.values)
+
+        # non-leap year 1994: all values of Feb. 29th are NaN
+        nan_arr = np.empty(24)
+        nan_arr.fill(np.nan)
+        df94 = annual[1994]
+        df94_noleap = df94[(df94.index.month == 2) & (df94.index.day == 29)]
+        np.testing.assert_equal(df94_noleap.values, nan_arr)
+        ### extended functionality
+
+
+
+
     def test_daily(self):
         rng = date_range('1/1/2000', '12/31/2004', freq='D')
         ts = Series(np.random.randn(len(rng)), index=rng)
@@ -33,6 +86,7 @@ def test_daily(self):
         leaps.index = leaps.index.year
         tm.assert_series_equal(annual[day].dropna(), leaps)
 
+
     def test_weekly(self):
         pass
 

diff --git a/pandas/tseries/util.py b/pandas/tseries/util.py
@@ -2,6 +2,7 @@
 
 from pandas.core.frame import DataFrame
 import pandas.core.nanops as nanops
+from pandas.tseries.util import isleapyear
 
 def pivot_annual(series, freq=None):
     """