Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: fixes to rolling_window functions with center=True #3161

Closed
wants to merge 10 commits into from
17 changes: 11 additions & 6 deletions pandas/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1104,8 +1104,11 @@ def roll_skew(ndarray[double_t] input, int win, int minp):

R = sqrt(B)

output[i] = ((sqrt(nobs * (nobs - 1.)) * C) /
((nobs-2) * R * R * R))
if R != 0 and nobs > 2:
output[i] = ((sqrt(nobs * (nobs - 1.)) * C) /
((nobs-2) * R * R * R))
else:
output[i] = NaN
else:
output[i] = NaN

Expand Down Expand Up @@ -1173,10 +1176,12 @@ def roll_kurt(ndarray[double_t] input,
R = R * A
D = xxxx / nobs - R - 6*B*A*A - 4*C*A

K = (nobs * nobs - 1.)*D/(B*B) - 3*((nobs-1.)**2)
K = K / ((nobs - 2.)*(nobs-3.))

output[i] = K
if B != 0 and nobs > 3:
K = (nobs * nobs - 1.)*D/(B*B) - 3*((nobs-1.)**2)
K = K / ((nobs - 2.)*(nobs-3.))
output[i] = K
else:
output[i] = NaN
else:
output[i] = NaN

Expand Down
119 changes: 103 additions & 16 deletions pandas/stats/moments.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@
freq : None or string alias / date offset object, default=None
Frequency to conform to before computing statistic
time_rule is a legacy alias for freq
pad_val: Value to use for parts of the window which fall outside the array
bounds. (applies only for center=True, currently)

Returns
-------
Expand Down Expand Up @@ -251,7 +253,7 @@ def rolling_corr_pairwise(df, window, min_periods=None):


def _rolling_moment(arg, window, func, minp, axis=0, freq=None,
center=False, time_rule=None, **kwargs):
center=False, time_rule=None, pad_val=np.NaN, **kwargs):
"""
Rolling statistical measure using supplied function. Designed to be
used with passed-in Cython array-based functions.
Expand All @@ -275,19 +277,105 @@ def _rolling_moment(arg, window, func, minp, axis=0, freq=None,
"""
arg = _conv_timerule(arg, freq, time_rule)
calc = lambda x: func(x, window, minp=minp, **kwargs)
# strips the pandas object into a callback and an ndarray
# the callback takes back an ndarray and reconstitutes the
# pandas object with the new data
return_hook, values = _process_data_structure(arg)
# actually calculate the moment. Faster way to do this?
if values.ndim > 1:
result = np.apply_along_axis(calc, axis, values)
else:
result = calc(values)
def calc(x,axis):
_calc = lambda x: func(x, window, minp=minp, **kwargs)
if x.ndim > 1:
return np.apply_along_axis(_calc, axis, x)
else:
return _calc(x)

def fwd(x):
"""
reshapes a 1d/2d ndarray into a 2d array so the processing can
be done on a fixed case.
"""
v = x.view()
if v.ndim > 1:
v = v if axis == 1 else v.T
else:
v = v.reshape((1,len(v)))
return v

rs = return_hook(result)
def bkwd(x):
if arg.ndim > 1:
x = x if axis == 1 else x.T
else:
x = x[0]
return x

def pad(values,nahead,pos="head",window=window,axis=axis,pad_val=pad_val):
v = values

tip = np.empty((v.shape[0],2*nahead+1+nahead+1),dtype=v.dtype)
if pos == "head":
tip[:,:nahead+1] = pad_val
tip[:,nahead+1:] = v[:,:(2*nahead+1)]
elif pos == "tail":
tip[:,-(nahead+1):] = pad_val
tip[:,:-(nahead+1)] = v[:,-(2*nahead+1):]
else:
raise NotImplementedError()

return tip

# window = 5 /4
# print pad(mkdf(10,5).values,window=5)
# print pad(mkdf(10,5).values,window=5,axis=1)
# print pad(mkdf(10,5).irow(0).values,window=5)
result = calc(values,axis)
if center:
rs = _center_window(rs, window, axis)
return rs


# GH2953, fixup edges
if window > 2:
result = _center_window(result, window, axis)
result = fwd(result)
values = fwd(values)
# with the data always in a consistent alignment
# we can always apply the func along the same axis=1
# and eliminate special cases

# there's an ambiguity in what constitutes
# the "center" when window is even
# we Just close ranks with numpy, so a window of len 4 [0..3]
# 2 is the "center" slot

if window % 2 == 0 :
nahead = (window-1)//2 or 1
else:
nahead = (window)//2
tip = pad(values,nahead,'head')
head = calc(tip,axis=1)[:,-(nahead+1):][:,:nahead+1]

tip = pad(values,nahead,'tail')
tail = calc(tip,axis=1)[:,-(nahead+1):][:,:nahead]

result[:,-(nahead):] = tail
result[:,:nahead+1] = head

if minp > 0:
ld = minp - nahead-1
rd = ld-1 if window % 2 == 0 else ld
if ld > 0:
result[:,:ld] = NaN
if rd >0:
result[:,-(rd):] = NaN

result =bkwd(result)

# rebuild the correct pandas object using the new data
return return_hook(result)
# TODO: test window=2
# from pandas.stats import moments as mom
# print list(mom.rolling_mean(Series(np.ones(10)),3,min_periods=1,pad_val=0,center=True).values)
# print list(mom.rolling_mean(Series(np.ones(10)),4,min_periods=1,pad_val=0,center=True).values)
# print list(mom.rolling_mean(Series(np.ones(10)),5,min_periods=1,pad_val=0,center=True).values)
# mom.rolling_mean(DataFrame(np.ones((3,10))),3,axis=0,min_periods=1,pad_val=0,center=True)
# mom.rolling_mean(DataFrame(np.ones((3,10))),3,axis=1,min_periods=1,pad_val=0,center=True)
# mom.rolling_mean(Series(range(25)),3,axis=0,min_periods=1,pad_val=0,center=True)
def _center_window(rs, window, axis):
if axis > rs.ndim-1:
raise ValueError("Requested axis is larger then no. of argument dimensions")
Expand Down Expand Up @@ -497,14 +585,13 @@ def _rolling_func(func, desc, check_minp=_use_window):
@Substitution(desc, _unary_arg, _type_of_input)
@Appender(_doc_template)
@wraps(func)

def f(arg, window, min_periods=None, freq=None, center=False,
time_rule=None, **kwargs):
def call_cython(arg, window, minp, **kwds):
minp = check_minp(minp, window)
return func(arg, window, minp, **kwds)
return _rolling_moment(arg, window, call_cython, min_periods,
time_rule=None, pad_val=np.NaN, **kwargs):
min_periods = check_minp(min_periods, window)
return _rolling_moment(arg, window, func, min_periods,
freq=freq, center=center,
time_rule=time_rule, **kwargs)
time_rule=time_rule,pad_val=pad_val, **kwargs)

return f

Expand Down
142 changes: 111 additions & 31 deletions pandas/stats/tests/test_moments.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,23 +396,30 @@ def _check_ndarray(self, func, static_comp, window=50,
assert_almost_equal(result[-1], static_comp(arr[10:-10]))

if has_center:
win = 20
if has_min_periods:
result = func(arr, 20, min_periods=15, center=True)
expected = func(arr, 20, min_periods=15)
result = func(arr, win, min_periods=15, center=True)
expected = func(arr, win, min_periods=15)
else:
result = func(arr, 20, center=True)
expected = func(arr, 20)
result = func(arr, win, center=True)
expected = func(arr, win)

assert_almost_equal(result[1], expected[10])
if fill_value is None:
self.assert_(np.isnan(result[-9:]).all())
else:
self.assert_((result[-9:] == 0).all())
# fill_Value only specified by rolling_Count
# which assumes the old 0 append at end
# behavior, no longer true

# if fill_value is None:
# self.assert_(np.isnan(result[-9:]).all())
# else:
# self.assert_((result[-9:] == 0).all())


if has_min_periods:
self.assert_(np.isnan(expected[23]))
self.assert_(np.isnan(result[14]))
self.assert_(np.isnan(expected[-5]))
self.assert_(np.isnan(result[-14]))
self.assert_(np.isnan(expected[-5]))

def _check_structures(self, func, static_comp,
has_min_periods=True, has_time_rule=True,
Expand Down Expand Up @@ -450,29 +457,87 @@ def _check_structures(self, func, static_comp,
assert_almost_equal(frame_result.xs(last_date),
trunc_frame.apply(static_comp))

if has_center:
if has_min_periods:
minp = 10
series_xp = func(self.series, 25, min_periods=minp).shift(-12)
frame_xp = func(self.frame, 25, min_periods=minp).shift(-12)

series_rs = func(self.series, 25, min_periods=minp,
center=True)
frame_rs = func(self.frame, 25, min_periods=minp,
center=True)

else:
series_xp = func(self.series, 25).shift(-12)
frame_xp = func(self.frame, 25).shift(-12)

series_rs = func(self.series, 25, center=True)
frame_rs = func(self.frame, 25, center=True)

if fill_value is not None:
series_xp = series_xp.fillna(fill_value)
frame_xp = frame_xp.fillna(fill_value)
assert_series_equal(series_xp, series_rs)
assert_frame_equal(frame_xp, frame_rs)
def test_centered_rolling_moment(self):

def participating(i,win):
return [x for x in range(i-win//2,i+(win+1)//2)]

# validate
self.assertEqual(participating(0,3),[-1,0,1])
self.assertEqual(participating(1,3),[0,1,2])
self.assertEqual(participating(0,4),[-2,-1,0,1])
self.assertEqual(participating(1,4),[-1,0,1,2])

def get_v(s,f,i,win,minp,pad_val=np.nan):
_is = np.array(participating(i,win))
in_range_mask = np.array([ x>=0 and x< len(s) for x in _is ])
def f_(i,data):
# print( data)
vals = np.array( list(data) )
if minp:
return f(vals,win,min_periods=minp)[-1]
else:
return f(vals,win)[-1]

if all(in_range_mask): # middle
return f_(i,s.take(_is))

elif minp and sum(in_range_mask) < minp:
return np.NaN
return "minp_nan"
else:
lpad = np.sum([_is<0])
rpad = np.sum([_is>= len(s)])

_in_is = np.ma.array(_is, mask=~in_range_mask).compressed()
vals = np.array([pad_val] * lpad + list(s.take(_in_is)) + [pad_val]* rpad)
return f_(i,vals)
return "edge"

def build_series(data,func,win,minp,pad_val=np.nan):
return Series(( [get_v(data,func,i,win,minp,pad_val=pad_val) for i in range(len(data))] ))

N,K = 20,5
for win in range(3,N-1):
for minp in range(1,win+1):
func = mom.rolling_mean

# self.series = Series(np.ones(N))
self.series = Series(randn(N))
series_xp =build_series(self.series,func,win,minp,pad_val=0)

# frame_xp = DataFrame(np.ones((N,K)), copy=True)
self.frame= DataFrame(randn(N,K), copy=True)
f = lambda i: func(self.frame.icol(i), win, min_periods=minp, pad_val=0, center=True)
data =[f(i) for i in range(len(self.frame.columns))]
frame_xp = DataFrame(np.array(data).T)
series_rs = func(self.series, win, min_periods=minp, pad_val=0, center=True)
frame_rs = func(self.frame, win, min_periods=minp, pad_val=0,center=True)

assert_series_equal(series_xp, series_rs)
assert_frame_equal(frame_xp, frame_rs)

N,K = 20,5
minp=None
for win in range(3,N-1):
func = mom.rolling_count
# self.series = Series(np.ones(N))
self.series = Series(randn(N))
series_xp =build_series(self.series,func,win,minp)

# frame_xp = DataFrame(np.ones((N,K)), copy=True)
self.frame= DataFrame(randn(N,K), copy=True)

f = lambda i: func(self.frame.icol(i), win, center=True)
data =[f(i) for i in range(len(self.frame.columns))]
frame_xp = DataFrame(np.array(data).T)

series_rs = func(self.series, win,center=True)
frame_rs = func(self.frame, win,center=True)

assert_series_equal(series_xp, series_rs)
assert_frame_equal(frame_xp, frame_rs)

def test_legacy_time_rule_arg(self):
from StringIO import StringIO
Expand Down Expand Up @@ -768,6 +833,21 @@ def _check_expanding(self, func, static_comp, has_min_periods=True,
preserve_nan=preserve_nan)
self._check_expanding_structures(func)

def test_rolling_mean_edges(self):
# GH2803
# actually, covers edge handling more generally
def movingaverage(interval, window_size):
window = np.ones(int(window_size))/float(window_size)
return np.convolve(interval, window, 'same')

nitems = 25
for win in range(1,nitems,1):
ser = Series(range(nitems))
df = DataFrame(index=range(len(ser)))
df['rm'] = mom.rolling_mean(ser, win, center=True, min_periods=1,pad_val=0)
df['ma'] = movingaverage(ser, win)
tm.assert_almost_equal(df['rm'] , df['ma'])

if __name__ == '__main__':
import nose
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
Expand Down