Skip to content

Commit

Permalink
pivot hourly using a new result shape
Browse files Browse the repository at this point in the history
  • Loading branch information
timmie committed Nov 1, 2012
1 parent a3d10ba commit fcde169
Show file tree
Hide file tree
Showing 3 changed files with 262 additions and 0 deletions.
207 changes: 207 additions & 0 deletions pandas/tseries/pivot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
import numpy as np

from pandas.core.frame import DataFrame
import pandas.core.nanops as nanops
from pandas.tseries.util import isleapyear
from pandas.tseries.index import date_range

def pivot_annual_h(series, freq=None, dt_index=False):
    """
    Group an hourly series by years, taking leap years into account.

    The output has one column per distinct year in the original series and
    as many rows as there are hours in a leap year (366 * 24 = 8784).
    The first row corresponds to Jan. 1st, 00:00 and the last row to
    Dec. 31st, 23:00.
    Entries corresponding to Feb. 29th are NaN for non-leap years, so a
    given row always refers to the same calendar day and hour: with the
    default integer labels, row 59 * 24 is Feb. 28th, 23:00, row
    60 * 24 + 1 is Mar. 1st, 00:00, and the 24 rows in between are only
    filled for leap years.
    Every value is placed according to its own timestamp, so hours that are
    missing from the series (e.g. an incomplete first or last year) stay
    NaN instead of shifting the remaining values.

    Parameters
    ----------
    series : TimeSeries
        Series with a DatetimeIndex holding hourly data.
    freq : string or None, default None
        Frequency alias, upper-cased before use. When None, the frequency
        of ``series.index`` is used. Only hourly ('H') data is implemented.
    dt_index : bool, default False
        If True, label the rows with the hourly date range returned by
        ``default_rng()`` instead of the integers 1..8784.
        CAUTION: the year of that index is only a placeholder.

    Returns
    -------
    annual : DataFrame
        Integer input is promoted to float64 because NaN marks missing
        entries.

    Raises
    ------
    NotImplementedError
        If the frequency is not hourly. For 'D' and 'M' the message points
        to ``pandas.tseries.util.pivot_annual``.
    """
    #TODO: test like original pandas and the position of first and last
    #      value in arrays
    if freq is not None:
        freq = freq.upper()
    else:
        freq = series.index.freq

    if not (freq == 'H'):
        #TODO: use pivot_annual for D & M and minute in the same fashion
        if freq == 'D' or freq == 'M':
            raise NotImplementedError(
                "%s: use pandas.tseries.util.pivot_annual" % (freq,))
        raise NotImplementedError(freq)

    hours_per_day = 24
    # number of hours in a leap year; the frequency is known to be hourly
    # here, so do not depend on the index carrying a frequency string
    total_hoy_leap = year_length('H')
    # zero-based position of Feb. 29th, 00:00 within a leap year
    # (31 days of January + 28 days of February)
    feb29_start = (31 + 28) * hours_per_day

    index_years = np.asarray(series.index.year)
    years = nanops.unique1d(index_years)

    # NaN is the missing-value marker, which integer arrays cannot hold
    if series.dtype.kind in 'fcO':
        dtype = series.dtype
    else:
        dtype = np.float64
    values = np.empty((total_hoy_leap, len(years)), dtype=dtype)
    values.fill(np.nan)

    for col, yr in enumerate(years):
        # select data for the respective year
        ser_sel = series[index_years == yr]
        sel_index = ser_sel.index

        # zero-based hour of the year of every selected timestamp
        pos = ((np.asarray(sel_index.dayofyear) - 1) * hours_per_day
               + np.asarray(sel_index.hour))
        if not isleapyear(yr):
            # leave the Feb. 29th slot empty: everything from Mar. 1st
            # onwards moves down by one day
            pos = np.where(pos >= feb29_start, pos + hours_per_day, pos)

        values[pos, col] = ser_sel.values

    if dt_index:
        # assign a datetime index, CAUTION: the year is definitely wrong!
        row_labels = default_rng()
    else:
        # hours of the year, counted from 1
        row_labels = np.arange(1, total_hoy_leap + 1)

    return DataFrame(values, index=row_labels, columns=years)


### timeseries pivoting helper

def last_col2front(df, col_no=1):
    """Return ``df`` with its last ``col_no`` columns moved to the front.

    The relative order inside the moved group and inside the remaining
    columns is kept. Increase ``col_no`` to move more columns.
    """
    ordered = df.columns.tolist()
    # split into the trailing group and everything before it
    tail, head = ordered[-col_no:], ordered[:-col_no]
    return df[tail + head]


def extended_info(df, time_cols=True, aggreg=True, aggreg_func=None,
                  datetime_index=False):
    """Add row-wise statistics and calendar columns to a timeseries pivot.

    Works on a copy; ``df`` itself is not modified.

    Parameters
    ----------
    df : DataFrame
        Pivot with one row per entry of ``default_rng()`` (presumably the
        output of ``pivot_annual_h`` -- confirm against callers).
    time_cols : bool, default True
        Currently unused; the calendar columns are always added.
    aggreg : bool, default True
        If True, append the columns 'mean', 'sum', 'min', 'max' and 'std',
        each computed per row across the original columns of ``df``.
    aggreg_func : default None
        Currently unused.
    datetime_index : bool, default False
        Currently unused.

    Returns
    -------
    DataFrame
        Copy of ``df`` whose first four columns are 'doy', 'month', 'day'
        and 'hour' (hour counted 1..24), followed by the original columns
        and, if requested, the statistics.
    """
    df_extended = df.copy()
    # perform the aggregation only on the data columns, not on the
    # statistics that are appended below
    cols = df_extended.columns
    #TODO: add standard aggregation
    #TODO: make function be set by argument
    #TODO: is there not a describe function?
    #TODO: Maybe use http://pandas.pydata.org/pandas-docs/dev/basics.html#summarizing-data-describe
    if aggreg:
        data = df_extended[cols]
        df_extended['mean'] = data.mean(axis=1)
        df_extended['sum'] = data.sum(axis=1)
        df_extended['min'] = data.min(axis=1)
        df_extended['max'] = data.max(axis=1)
        # the standard deviation gets its own column instead of
        # overwriting 'max'
        df_extended['std'] = data.std(axis=1)

    # add some metadata
    #TODO: add function to make index a datetime with the argument above
    #      using the rng below
    #TODO: convert the range to lower frequencies and reuse the function.
    rng = default_rng()
    df_extended['doy'] = rng.dayofyear
    df_extended['month'] = rng.month
    df_extended['day'] = rng.day
    # hours are reported as 1..24 rather than 0..23
    df_extended['hour'] = rng.hour + 1
    # move the four calendar columns in front of the data
    df_extended = last_col2front(df_extended, col_no=4)

    return df_extended

###Timeseries convenience / helper functions


def year_length(freq, leap=True):
    """Return the number of periods of frequency ``freq`` in one year.

    Parameters
    ----------
    freq : string
        Frequency alias; 'H' (hourly) and 'D' (daily) are supported.
    leap : bool, default True
        Count a leap year (366 days) if True, otherwise a common year
        (365 days).

    Returns
    -------
    int

    Raises
    ------
    NotImplementedError
        If ``freq`` is not one of the supported aliases.
    """
    # periods per day for every supported frequency; to be expanded
    periods_per_day = {'D': 1, 'H': 24}
    if freq not in periods_per_day:
        raise NotImplementedError(freq)

    days_of_year = 366 if leap else 365
    return periods_per_day[freq] * days_of_year

def default_rng(freq='H', leap=True):
    """Create a default date range covering exactly one year.

    Parameters
    ----------
    freq : string, default 'H'
        Frequency alias of the range; must be supported by ``year_length``.
    leap : bool, default True
        If True the range covers the leap year 2012, otherwise the common
        year 2011. The year only serves as a placeholder.

    Returns
    -------
    DatetimeIndex
        Range starting on Jan. 1st with as many periods as the chosen year
        has at frequency ``freq``.
    """
    start = '1/1/2012' if leap else '1/1/2011'
    # the period count has to match the kind of year, otherwise the
    # non-leap range would run into the following year
    periods = year_length(freq, leap=leap)
    return date_range(start, periods=periods, freq=freq)
54 changes: 54 additions & 0 deletions pandas/tseries/tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,64 @@
import pandas.util.testing as tm

from pandas.tseries.util import pivot_annual, isleapyear
from pandas.tseries import pivot

class TestPivotAnnual(unittest.TestCase):
"""
New pandas of scikits.timeseries pivot_annual
"""
def test_hourly(self):
    """Pivot 18 complete years of hourly data; check layout and leap days."""
    # 1994-2011: 18 years of 8760 hours plus 4 leap days (96, 00, 04, 08)
    rng_hourly = date_range('1/1/1994', periods=(18 * 8760 + 4 * 24),
                            freq='H')
    data_hourly = np.random.randint(100, high=350, size=rng_hourly.size)
    data_hourly = data_hourly.astype('float64')
    ts_hourly = Series(data_hourly, index=rng_hourly)

    # the same pivot twice: labelled by hour of year (1..8784) and by a
    # datetime index, because the checks below need one or the other
    annual = pivot.pivot_annual_h(ts_hourly)
    annual_dt = pivot.pivot_annual_h(ts_hourly, dt_index=True)

    ### general
    def get_mdh(dt_idx, position):
        # (month, day, hour) of the timestamp at ``position``
        return (dt_idx.month[position], dt_idx.day[position],
                dt_idx.hour[position])

    # do the first dates of the ts match the first rows of the pivot?
    assert get_mdh(ts_hourly.index, 1) == get_mdh(annual_dt.index, 1)
    # is the last date of the ts identical with the date of the last row?
    assert get_mdh(ts_hourly.index, -1) == get_mdh(
        annual_dt.index, annual_dt.index.size - 1)
    # first value of the ts identical with first row of the first column?
    assert ts_hourly[0] == annual.ix[1].values[0]
    # last value of the ts identical with last row of the last column?
    assert ts_hourly[-1] == annual.ix[annual.index.size].values[-1]

    ### index
    ## test if index has the right length
    assert annual.index[-1] == 8784

    ### leap
    ## test leap offset
    # leap year: 1996 - are the values of the ts and of the pivot column
    # identical on Feb. 29th?
    ser96_leap = ts_hourly[(ts_hourly.index.year == 1996) &
                           (ts_hourly.index.month == 2) &
                           (ts_hourly.index.day == 29)]

    df96 = annual_dt[1996]
    df96_leap = df96[(df96.index.month == 2) & (df96.index.day == 29)]
    # compare the values only: the pivot index carries a placeholder year
    np.testing.assert_array_equal(ser96_leap.values, df96_leap.values)

    # non-leap year: 1994 - are all values NaN for Feb. 29th?
    nan_arr = np.empty(24)
    nan_arr.fill(np.nan)
    df94 = annual_dt[1994]
    df94_noleap = df94[(df94.index.month == 2) & (df94.index.day == 29)]
    np.testing.assert_equal(df94_noleap.values, nan_arr)
    ### extended functionality
    #TODO: cover pivot.extended_info




def test_daily(self):
rng = date_range('1/1/2000', '12/31/2004', freq='D')
ts = Series(np.random.randn(len(rng)), index=rng)
Expand All @@ -33,6 +86,7 @@ def test_daily(self):
leaps.index = leaps.index.year
tm.assert_series_equal(annual[day].dropna(), leaps)


def test_weekly(self):
    # Placeholder: pivoting weekly data is not covered by a test yet.
    pass

Expand Down
1 change: 1 addition & 0 deletions pandas/tseries/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from pandas.core.frame import DataFrame
import pandas.core.nanops as nanops
from pandas.tseries.util import isleapyear

def pivot_annual(series, freq=None):
"""
Expand Down

0 comments on commit fcde169

Please sign in to comment.