Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance increase rolling min max #19549

Merged
merged 22 commits into from
Feb 14, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion asv_bench/benchmarks/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,26 @@ class Methods(object):

def setup(self, constructor, window, dtype, method):
N = 10**5
arr = np.random.random(N).astype(dtype)
arr = (100 * np.random.random(N)).astype(dtype)
self.roll = getattr(pd, constructor)(arr).rolling(window)

def time_rolling(self, constructor, window, dtype, method):
getattr(self.roll, method)()

class VariableWindowMethods(Methods):
sample_time = 0.2
params = (['DataFrame', 'Series'],
['50s', '1h', '1d'],
['int', 'float'],
['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt',
'sum'])
param_names = ['contructor', 'window', 'dtype', 'method']

def setup(self, constructor, window, dtype, method):
N = 10**5
arr = (100 * np.random.random(N)).astype(dtype)
index = pd.date_range('2017-01-01', periods=N, freq='5s')
self.roll = getattr(pd, constructor)(arr, index=index).rolling(window)

class Pairwise(object):

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -645,6 +645,7 @@ Performance Improvements
- Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`)
- Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`)
- Improved performance of :func:`DataFrameGroupBy.rank` (:issue:`15779`)
- Improved performance of variable ``.rolling()`` on ``.min()`` and ``.max()`` (:issue:`19521`)

.. _whatsnew_0230.docs:

Expand Down
15 changes: 15 additions & 0 deletions pandas/_libs/src/headers/cmath
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#ifndef _PANDAS_MATH_H_
#define _PANDAS_MATH_H_

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add commets here on why we have this file (so the next person doesn't go thru the same as you :>)

// In older versions of Visual Studio there wasn't a std::signbit defined
// This defines it using _copysign
#if defined(_MSC_VER) && (_MSC_VER < 1800)
#include <cmath>
namespace std {
__inline int signbit(double num) { return _copysign(1.0, num) < 0; }
}
#else
#include <cmath>
#endif

#endif
11 changes: 0 additions & 11 deletions pandas/_libs/src/headers/math.h

This file was deleted.

54 changes: 36 additions & 18 deletions pandas/_libs/window.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

cimport cython
from cython cimport Py_ssize_t
from libcpp.deque cimport deque

from libc.stdlib cimport malloc, free

Expand All @@ -12,7 +13,7 @@ from numpy cimport ndarray, double_t, int64_t, float64_t
cnp.import_array()


cdef extern from "../src/headers/math.h":
cdef extern from "../src/headers/cmath" namespace "std":
int signbit(double) nogil
double sqrt(double x) nogil

Expand Down Expand Up @@ -1222,8 +1223,9 @@ cdef _roll_min_max(ndarray[numeric] input, int64_t win, int64_t minp,
cdef:
numeric ai
bint is_variable, should_replace
int64_t s, e, N, i, j, removed
int64_t N, i, removed, window_i
Py_ssize_t nobs = 0
deque Q[int64_t]
ndarray[int64_t] starti, endi
ndarray[numeric, ndim=1] output
cdef:
Expand All @@ -1242,32 +1244,48 @@ cdef _roll_min_max(ndarray[numeric] input, int64_t win, int64_t minp,

output = np.empty(N, dtype=input.dtype)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a description of the algorithm and a link (if available)

Q = deque[int64_t]()

if is_variable:

with nogil:

for i in range(N):
s = starti[i]
e = endi[i]
# This is using a modified version of the C++ code in this
# SO post: http://bit.ly/2nOoHlY
# The original impl didn't deal with variable window sizes
# So the code was optimized for that

r = input[s]
nobs = 0
for j in range(s, e):
for i from starti[0] <= i < endi[0]:
ai = init_mm(input[i], &nobs, is_max)

# adds, death at the i offset
ai = init_mm(input[j], &nobs, is_max)
if is_max:
while not Q.empty() and ai >= input[Q.back()]:
Q.pop_back()
else:
while not Q.empty() and ai <= input[Q.back()]:
Q.pop_back()
Q.push_back(i)

if is_max:
if ai > r:
r = ai
else:
if ai < r:
r = ai
for i from endi[0] <= i < N:
output[i-1] = calc_mm(minp, nobs, input[Q.front()])

output[i] = calc_mm(minp, nobs, r)
ai = init_mm(input[i], &nobs, is_max)

else:
if is_max:
while not Q.empty() and ai >= input[Q.back()]:
Q.pop_back()
else:
while not Q.empty() and ai <= input[Q.back()]:
Q.pop_back()

while not Q.empty() and Q.front() <= i - (endi[i] - starti[i]):
Q.pop_front()

Q.push_back(i)

output[N-1] = calc_mm(minp, nobs, input[Q.front()])

else:
# setup the rings of death!
ring = <numeric *>malloc(win * sizeof(numeric))
death = <int64_t *>malloc(win * sizeof(int64_t))
Expand Down
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,7 +617,8 @@ def pxd(name):
'pyxfile': '_libs/testing'},
'_libs.window': {
'pyxfile': '_libs/window',
'pxdfiles': ['_libs/skiplist', '_libs/src/util']},
'pxdfiles': ['_libs/skiplist', '_libs/src/util'],
'language': 'c++'},
'_libs.writers': {
'pyxfile': '_libs/writers',
'pxdfiles': ['_libs/src/util']},
Expand All @@ -640,11 +641,11 @@ def pxd(name):
sources=sources,
depends=data.get('depends', []),
include_dirs=include,
language=data.get('language', 'c'),
extra_compile_args=extra_compile_args)

extensions.append(obj)


# ----------------------------------------------------------------------
# msgpack

Expand Down