From 3b9bff8d90eab34880bb95e583cc66126613b7c8 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 8 Dec 2019 11:54:05 -0800 Subject: [PATCH 01/44] Add numba to import_optional_dependencies --- doc/source/getting_started/install.rst | 1 + pandas/compat/_optional.py | 1 + 2 files changed, 2 insertions(+) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 9f3ab22496ae7..ae15ad3a5d2c3 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -255,6 +255,7 @@ gcsfs 0.2.2 Google Cloud Storage access html5lib HTML parser for read_html (see :ref:`note `) lxml 3.8.0 HTML parser for read_html (see :ref:`note `) matplotlib 2.2.2 Visualization +numba 0.46.0 Alternative execution engine for rolling operations openpyxl 2.4.8 Reading / writing for xlsx files pandas-gbq 0.8.0 Google Big Query access psycopg2 PostgreSQL engine for sqlalchemy diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 0be201daea425..9650ba39bf46a 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -27,6 +27,7 @@ "xlrd": "1.1.0", "xlwt": "1.2.0", "xlsxwriter": "0.9.8", + "numba": "0.46.0", } From 9a302bff89e1ef00e9db6d994d24fb644cabe4f4 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 8 Dec 2019 12:45:03 -0800 Subject: [PATCH 02/44] Start adding keywords --- pandas/core/window/rolling.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 9f804584f532a..e8a6ac5cc9a93 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1246,9 +1246,15 @@ def count(self): objects instead. If you are just applying a NumPy reduction function this will achieve much better performance. - - *args, **kwargs - Arguments and keyword arguments to be passed into func. 
+ args : tuple, default None + Positional arguments to be passed into func + kwargs : dict, default None + Keyword arguments to be passed into func + engine : str, default 'cython' + Execution engine for the applied function. + * ``'cython'`` : Runs rolling apply through C-extensions from cython. + * ``'numba'`` : Runn rolling apply through JIT compiled code from numba. + Only available when ``raw`` is set to ``True``. Returns ------- @@ -1262,15 +1268,23 @@ def count(self): """ ) - def apply(self, func, raw=False, args=(), kwargs={}): + def apply(self, func, raw=False, args=None, kwargs=None, engine='cython'): from pandas import Series kwargs.pop("_level", None) kwargs.pop("floor", None) window = self._get_window() offset = _offset(window, self.center) + if args is None: + args = () + if kwargs is None: + kwargs = {} + if engine not in {'cython', 'numba'}: + raise ValueError("engine must be either 'numba' or 'cython'") if not is_bool(raw): raise ValueError("raw parameter must be `True` or `False`") + if raw is False and engine == 'numba': + raise ValueError("raw must be `True` when using the numba engine") window_func = partial( self._get_cython_func_type("roll_generic"), From 0e9a600a6929fe58211ff57578d2777e19ef19a9 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 8 Dec 2019 19:47:58 -0800 Subject: [PATCH 03/44] Modify apply for numba and cython --- pandas/core/window/rolling.py | 119 +++++++++++++++++++++++++++++++--- 1 file changed, 109 insertions(+), 10 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index e8a6ac5cc9a93..06b945e9136ce 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1253,8 +1253,14 @@ def count(self): engine : str, default 'cython' Execution engine for the applied function. * ``'cython'`` : Runs rolling apply through C-extensions from cython. - * ``'numba'`` : Runn rolling apply through JIT compiled code from numba. 
+ * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. Only available when ``raw`` is set to ``True``. + engine_kwargs : dict, default None + Arguments to specify for the execution engine. + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` Returns ------- @@ -1268,8 +1274,15 @@ def count(self): """ ) - def apply(self, func, raw=False, args=None, kwargs=None, engine='cython'): - from pandas import Series + def apply( + self, + func, + raw=False, + args=None, + kwargs=None, + engine="cython", + engine_kwargs=None, + ): kwargs.pop("_level", None) kwargs.pop("floor", None) @@ -1279,12 +1292,30 @@ def apply(self, func, raw=False, args=None, kwargs=None, engine='cython'): args = () if kwargs is None: kwargs = {} - if engine not in {'cython', 'numba'}: - raise ValueError("engine must be either 'numba' or 'cython'") if not is_bool(raw): raise ValueError("raw parameter must be `True` or `False`") - if raw is False and engine == 'numba': - raise ValueError("raw must be `True` when using the numba engine") + + if engine == "cython": + if engine_kwargs is not None: + raise ValueError("cython engine does not accept engine_kwargs") + apply_func = self._generate_cython_apply_func( + args, kwargs, raw, offset, func + ) + elif engine == "numba": + if raw is False: + raise ValueError("raw must be `True` when using the numba engine") + apply_func = self._generate_numba_apply_func( + args, kwargs, func, engine_kwargs + ) + else: + raise ValueError("engine must be either 'numba' or 'cython'") + + # TODO: Why do we always pass center=False? 
+ # name=func for WindowGroupByMixin._apply + return self._apply(apply_func, center=False, floor=0, name=func) + + def _generate_cython_apply_func(self, args, kwargs, raw, offset, func): + from pandas import Series window_func = partial( self._get_cython_func_type("roll_generic"), @@ -1300,9 +1331,77 @@ def apply_func(values, begin, end, min_periods, raw=raw): values = Series(values, index=self.obj.index) return window_func(values, begin, end, min_periods) - # TODO: Why do we always pass center=False? - # name=func for WindowGroupByMixin._apply - return self._apply(apply_func, center=False, floor=0, name=func) + return apply_func + + def _generate_numba_apply_func(self, args, kwargs, func, engine_kwargs): + numba = import_optional_dependency("numba") + + if engine_kwargs is None: + engine_kwargs = {"nopython": True, "nogil": False, "parallel": False} + + nopython = engine_kwargs.get("nopython", True) + nogil = engine_kwargs.get("nogil", False) + # Maybe raise something here about 32 bit compat, if not compat.is_platform_32bit() + parallel = engine_kwargs.get("parallel", False) + + if kwargs and nopython: + raise ValueError( + "numba does not support kwargs with nopython=True: " + "https://github.com/numba/numba/issues/2916" + ) + + if parallel: + loop_range = numba.prange + else: + loop_range = range + + def make_rolling_apply(func): + """ + 1. jit the user's function + 2. Return a rolling apply function with the jitted function inline + + Configurations specified in engine_kwargs apply to both the user's + function _AND_ the rolling apply function. 
+ """ + + @numba.generated_jit(nopython=nopython) + def numba_func(window, *_args): + if getattr(np, func.__name__, False) is func: + + def impl(window, *_args): + return func(window, *_args) + + return impl + else: + jf = numba.jit(func, nopython=nopython) + + def impl(window, *_args): + return jf(window, *_args) + + return impl + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def roll_apply( + values: np.ndarray, + begin: np.ndarray, + end: np.ndarray, + minimum_periods: int, + ): + result = np.empty(len(begin)) + for i in loop_range(len(result)): + start = begin[i] + stop = end[i] + window = values[start:stop] + count_nan = np.sum(np.isnan(window)) + if len(window) - count_nan >= minimum_periods: + result[i] = numba_func(window, *args) + else: + result[i] = np.nan + return result + + return roll_apply + + return make_rolling_apply(func) def sum(self, *args, **kwargs): nv.validate_window_func("sum", args, kwargs) From dbb2a9b360f71371ffc2ab475d04de03da6bca71 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 8 Dec 2019 20:23:01 -0800 Subject: [PATCH 04/44] Add numba as optional dependency --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index 2b171d097a693..937f80c5cd2b3 100644 --- a/environment.yml +++ b/environment.yml @@ -72,6 +72,7 @@ dependencies: - matplotlib>=2.2.2 # pandas.plotting, Series.plot, DataFrame.plot - numexpr>=2.6.8 - scipy>=1.1 + - numba>=0.46.0 # optional for io - beautifulsoup4>=4.6.0 # pandas.read_html From f0e9a4dd342d1d146719aee64b128308df895311 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 8 Dec 2019 22:05:26 -0800 Subject: [PATCH 05/44] Add premil tests --- pandas/core/window/rolling.py | 29 +++++++++++++++++++++-------- pandas/tests/window/test_api.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 8 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 06b945e9136ce..e27ca21e9bea7 100644 
--- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1283,15 +1283,14 @@ def apply( engine="cython", engine_kwargs=None, ): - - kwargs.pop("_level", None) - kwargs.pop("floor", None) - window = self._get_window() - offset = _offset(window, self.center) if args is None: args = () if kwargs is None: kwargs = {} + kwargs.pop("_level", None) + kwargs.pop("floor", None) + window = self._get_window() + offset = _offset(window, self.center) if not is_bool(raw): raise ValueError("raw parameter must be `True` or `False`") @@ -1341,7 +1340,6 @@ def _generate_numba_apply_func(self, args, kwargs, func, engine_kwargs): nopython = engine_kwargs.get("nopython", True) nogil = engine_kwargs.get("nogil", False) - # Maybe raise something here about 32 bit compat, if not compat.is_platform_32bit() parallel = engine_kwargs.get("parallel", False) if kwargs and nopython: @@ -2047,8 +2045,23 @@ def count(self): @Substitution(name="rolling") @Appender(_shared_docs["apply"]) - def apply(self, func, raw=False, args=(), kwargs={}): - return super().apply(func, raw=raw, args=args, kwargs=kwargs) + def apply( + self, + func, + raw=False, + args=None, + kwargs=None, + engine="cython", + engine_kwargs=None, + ): + return super().apply( + func, + raw=raw, + args=args, + kwargs=kwargs, + engine=engine, + engine_kwargs=engine_kwargs, + ) @Substitution(name="rolling") @Appender(_shared_docs["sum"]) diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 5085576cc96f0..a099e6731e2dd 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -342,3 +342,31 @@ def test_multiple_agg_funcs(self, func, window_size, expected_vals): ) tm.assert_frame_equal(result, expected) + + +class TestEngine: + def test_invalid_engine(self): + with pytest.raises( + ValueError, match="engine must be either 'numba' or 'cython'" + ): + Series(range(1)).rolling(1).apply(lambda x: x, engine="foo") + + def test_invalid_engine_kwargs_cython(self): + 
with pytest.raises( + ValueError, match="cython engine does not accept engine_kwargs" + ): + Series(range(1)).rolling(1).apply( + lambda x: x, engine="cython", engine_kwargs={"nopython": False} + ) + + def test_invalid_raw_numba(self): + with pytest.raises( + ValueError, match="raw must be `True` when using the numba engine" + ): + Series(range(1)).rolling(1).apply(lambda x: x, raw=False, engine="numba") + + def test_invalid_kwargs_nopython(self): + with pytest.raises(ValueError, match="numba does not support kwargs with"): + Series(range(1)).rolling(1).apply( + lambda x: x, kwargs={"a": 1}, engine="numba", raw=True + ) From cb976cf9f8d3ac8cffae1b1f4b644baa7e3b059d Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 10 Dec 2019 23:15:32 -0800 Subject: [PATCH 06/44] Add numba to requirements-dev, type and reorder signature in apply --- pandas/core/window/rolling.py | 20 ++++++++++---------- requirements-dev.txt | 3 ++- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index ad9f194dddbe9..5b1f17ad1d3a6 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1239,10 +1239,6 @@ def count(self): objects instead. If you are just applying a NumPy reduction function this will achieve much better performance. - args : tuple, default None - Positional arguments to be passed into func - kwargs : dict, default None - Keyword arguments to be passed into func engine : str, default 'cython' Execution engine for the applied function. * ``'cython'`` : Runs rolling apply through C-extensions from cython. @@ -1254,6 +1250,10 @@ def count(self): * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` and ``parallel``. 
The default ``engine_kwargs`` for the ``'numba'`` engine is ``{'nopython': True, 'nogil': False, 'parallel': False}`` + args : tuple, default None + Positional arguments to be passed into func + kwargs : dict, default None + Keyword arguments to be passed into func Returns ------- @@ -1269,12 +1269,12 @@ def count(self): def apply( self, - func, - raw=False, - args=None, - kwargs=None, - engine="cython", - engine_kwargs=None, + func: Callable, + raw: bool = False, + engine: str = "cython", + engine_kwargs: Optional[Dict] = None, + args: Optional[Tuple] = None, + kwargs: Optional[Dict] = None, ): if args is None: args = () diff --git a/requirements-dev.txt b/requirements-dev.txt index 5f67726a3e476..a829b323a4e06 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -64,4 +64,5 @@ xlsxwriter xlwt odfpy pyreadstat -git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master \ No newline at end of file +git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master +numba>=0.46.0 \ No newline at end of file From 45420bb59ccd6906f21737156b750f574aea8537 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 10 Dec 2019 23:25:02 -0800 Subject: [PATCH 07/44] Move numba routines to its own file --- pandas/core/window/numba_.py | 77 +++++++++++++++++++++++++++++++++++ pandas/core/window/rolling.py | 74 +-------------------------------- 2 files changed, 79 insertions(+), 72 deletions(-) create mode 100644 pandas/core/window/numba_.py diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py new file mode 100644 index 0000000000000..bacefd51724f3 --- /dev/null +++ b/pandas/core/window/numba_.py @@ -0,0 +1,77 @@ +from typing import Callable, Dict, Optional, Tuple + +import numpy as np + +from pandas.compat._optional import import_optional_dependency + + +def _generate_numba_apply_func( + args: Tuple, kwargs: Dict, func: Callable, engine_kwargs: Optional[Dict] +): + numba = import_optional_dependency("numba") + + if engine_kwargs is None: + 
engine_kwargs = {"nopython": True, "nogil": False, "parallel": False} + + nopython = engine_kwargs.get("nopython", True) + nogil = engine_kwargs.get("nogil", False) + parallel = engine_kwargs.get("parallel", False) + + if kwargs and nopython: + raise ValueError( + "numba does not support kwargs with nopython=True: " + "https://github.com/numba/numba/issues/2916" + ) + + if parallel: + loop_range = numba.prange + else: + loop_range = range + + def make_rolling_apply(func): + """ + 1. jit the user's function + 2. Return a rolling apply function with the jitted function inline + + Configurations specified in engine_kwargs apply to both the user's + function _AND_ the rolling apply function. + """ + + @numba.generated_jit(nopython=nopython) + def numba_func(window, *_args): + if getattr(np, func.__name__, False) is func: + + def impl(window, *_args): + return func(window, *_args) + + return impl + else: + jf = numba.jit(func, nopython=nopython) + + def impl(window, *_args): + return jf(window, *_args) + + return impl + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def roll_apply( + values: np.ndarray, + begin: np.ndarray, + end: np.ndarray, + minimum_periods: int, + ): + result = np.empty(len(begin)) + for i in loop_range(len(result)): + start = begin[i] + stop = end[i] + window = values[start:stop] + count_nan = np.sum(np.isnan(window)) + if len(window) - count_nan >= minimum_periods: + result[i] = numba_func(window, *args) + else: + result[i] = np.nan + return result + + return roll_apply + + return make_rolling_apply(func) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 5b1f17ad1d3a6..30b9fa5792e90 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -54,6 +54,7 @@ FixedWindowIndexer, VariableWindowIndexer, ) +from pandas.core.window.numba_ import _generate_numba_apply_func class _Window(PandasObject, ShallowMixin, SelectionMixin): @@ -1296,9 +1297,7 @@ def apply( elif engine == 
"numba": if raw is False: raise ValueError("raw must be `True` when using the numba engine") - apply_func = self._generate_numba_apply_func( - args, kwargs, func, engine_kwargs - ) + apply_func = _generate_numba_apply_func(args, kwargs, func, engine_kwargs) else: raise ValueError("engine must be either 'numba' or 'cython'") @@ -1325,75 +1324,6 @@ def apply_func(values, begin, end, min_periods, raw=raw): return apply_func - def _generate_numba_apply_func(self, args, kwargs, func, engine_kwargs): - numba = import_optional_dependency("numba") - - if engine_kwargs is None: - engine_kwargs = {"nopython": True, "nogil": False, "parallel": False} - - nopython = engine_kwargs.get("nopython", True) - nogil = engine_kwargs.get("nogil", False) - parallel = engine_kwargs.get("parallel", False) - - if kwargs and nopython: - raise ValueError( - "numba does not support kwargs with nopython=True: " - "https://github.com/numba/numba/issues/2916" - ) - - if parallel: - loop_range = numba.prange - else: - loop_range = range - - def make_rolling_apply(func): - """ - 1. jit the user's function - 2. Return a rolling apply function with the jitted function inline - - Configurations specified in engine_kwargs apply to both the user's - function _AND_ the rolling apply function. 
- """ - - @numba.generated_jit(nopython=nopython) - def numba_func(window, *_args): - if getattr(np, func.__name__, False) is func: - - def impl(window, *_args): - return func(window, *_args) - - return impl - else: - jf = numba.jit(func, nopython=nopython) - - def impl(window, *_args): - return jf(window, *_args) - - return impl - - @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) - def roll_apply( - values: np.ndarray, - begin: np.ndarray, - end: np.ndarray, - minimum_periods: int, - ): - result = np.empty(len(begin)) - for i in loop_range(len(result)): - start = begin[i] - stop = end[i] - window = values[start:stop] - count_nan = np.sum(np.isnan(window)) - if len(window) - count_nan >= minimum_periods: - result[i] = numba_func(window, *args) - else: - result[i] = np.nan - return result - - return roll_apply - - return make_rolling_apply(func) - def sum(self, *args, **kwargs): nv.validate_window_func("sum", args, kwargs) window_func = self._get_cython_func_type("roll_sum") From 17851cf1a7c77a83b59dc629cdcc642341e0d5bd Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 10 Dec 2019 23:30:37 -0800 Subject: [PATCH 08/44] Adjust signature in top level function as well --- pandas/core/window/rolling.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 30b9fa5792e90..9ff61721a2abb 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1972,18 +1972,18 @@ def apply( self, func, raw=False, - args=None, - kwargs=None, engine="cython", engine_kwargs=None, + args=None, + kwargs=None, ): return super().apply( func, raw=raw, - args=args, - kwargs=kwargs, engine=engine, engine_kwargs=engine_kwargs, + args=args, + kwargs=kwargs, ) @Substitution(name="rolling") From 9619f8d675d2273a501a916176976fc97ea18166 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 11 Dec 2019 11:56:57 -0800 Subject: [PATCH 09/44] Generate requirements-dev.txt 
using script --- requirements-dev.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index a829b323a4e06..93a5a6b32fad6 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -47,6 +47,7 @@ jinja2 matplotlib>=2.2.2 numexpr>=2.6.8 scipy>=1.1 +numba>=0.46.0 beautifulsoup4>=4.6.0 fastparquet>=0.3.2 html5lib @@ -64,5 +65,4 @@ xlsxwriter xlwt odfpy pyreadstat -git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master -numba>=0.46.0 \ No newline at end of file +git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master \ No newline at end of file From b8908eaf15e4deca584bc77e5e76d7bd0fcba5c3 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 12 Dec 2019 22:10:56 -0800 Subject: [PATCH 10/44] Add skip test decorator, add numba to a few builds --- ci/deps/azure-36-minimum_versions.yaml | 1 + ci/deps/azure-windows-36.yaml | 1 + pandas/tests/window/test_api.py | 1 + 3 files changed, 3 insertions(+) diff --git a/ci/deps/azure-36-minimum_versions.yaml b/ci/deps/azure-36-minimum_versions.yaml index 8bf4f70d18aec..de7e011d9c7ca 100644 --- a/ci/deps/azure-36-minimum_versions.yaml +++ b/ci/deps/azure-36-minimum_versions.yaml @@ -17,6 +17,7 @@ dependencies: - beautifulsoup4=4.6.0 - bottleneck=1.2.1 - jinja2=2.8 + - numba=0.46.0 - numexpr=2.6.2 - numpy=1.13.3 - openpyxl=2.5.7 diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index 2bd11c9030325..7fa9dee7445a6 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -17,6 +17,7 @@ dependencies: - bottleneck - fastparquet>=0.3.2 - matplotlib=3.0.2 + - numba - numexpr - numpy=1.15.* - openpyxl diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index a099e6731e2dd..ca61b98d0b416 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -344,6 +344,7 @@ def test_multiple_agg_funcs(self, func, window_size, expected_vals): 
tm.assert_frame_equal(result, expected) +@td.skip_if_no('numba', '0.46.0') class TestEngine: def test_invalid_engine(self): with pytest.raises( From 135f2ad9d236a5d1fba0defd1a210d4e40f20c53 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 12 Dec 2019 22:21:13 -0800 Subject: [PATCH 11/44] black --- pandas/core/window/numba_.py | 17 +++++++++-------- pandas/tests/window/test_api.py | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index bacefd51724f3..435e29b12a9ff 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -8,6 +8,15 @@ def _generate_numba_apply_func( args: Tuple, kwargs: Dict, func: Callable, engine_kwargs: Optional[Dict] ): + """ + Generate a numba jitted apply function specified by values from engine_kwargs. + + 1. jit the user's function + 2. Return a rolling apply function with the jitted function inline + + Configurations specified in engine_kwargs apply to both the user's + function _AND_ the rolling apply function. + """ numba = import_optional_dependency("numba") if engine_kwargs is None: @@ -29,14 +38,6 @@ def _generate_numba_apply_func( loop_range = range def make_rolling_apply(func): - """ - 1. jit the user's function - 2. Return a rolling apply function with the jitted function inline - - Configurations specified in engine_kwargs apply to both the user's - function _AND_ the rolling apply function. 
- """ - @numba.generated_jit(nopython=nopython) def numba_func(window, *_args): if getattr(np, func.__name__, False) is func: diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index ca61b98d0b416..518da688d72bf 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -344,7 +344,7 @@ def test_multiple_agg_funcs(self, func, window_size, expected_vals): tm.assert_frame_equal(result, expected) -@td.skip_if_no('numba', '0.46.0') +@td.skip_if_no("numba", "0.46.0") class TestEngine: def test_invalid_engine(self): with pytest.raises( From 34a5687d4670f5d7d1b0288013d08718421b3f0f Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 12 Dec 2019 22:32:21 -0800 Subject: [PATCH 12/44] don't rejit a user's jitted function --- pandas/core/window/numba_.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index 435e29b12a9ff..e02597e25c145 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -38,21 +38,27 @@ def _generate_numba_apply_func( loop_range = range def make_rolling_apply(func): - @numba.generated_jit(nopython=nopython) - def numba_func(window, *_args): - if getattr(np, func.__name__, False) is func: - def impl(window, *_args): - return func(window, *_args) + if isinstance(func, numba.targets.registry.CPUDispatcher): + # Don't jit a user passed jitted function + numba_func = func + else: - return impl - else: - jf = numba.jit(func, nopython=nopython) + @numba.generated_jit(nopython=nopython) + def numba_func(window, *_args): + if getattr(np, func.__name__, False) is func: - def impl(window, *_args): - return jf(window, *_args) + def impl(window, *_args): + return func(window, *_args) - return impl + return impl + else: + jf = numba.jit(func, nopython=nopython) + + def impl(window, *_args): + return jf(window, *_args) + + return impl @numba.jit(nopython=nopython, nogil=nogil, 
parallel=parallel) def roll_apply( From 6da8199f446ea0590c4596f746983e7a1da97277 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 12 Dec 2019 22:47:39 -0800 Subject: [PATCH 13/44] Add numba/cython comparison test --- pandas/tests/window/test_numba.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 pandas/tests/window/test_numba.py diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py new file mode 100644 index 0000000000000..eeee83635e31a --- /dev/null +++ b/pandas/tests/window/test_numba.py @@ -0,0 +1,27 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import Series +import pandas.util.testing as tm + + +@td.skip_if_no("numba", "0.46.0") +class TestApply: + @pytest.mark.parametrize("nogil", [True, False]) + @pytest.mark.parametrize("parallel", [True, False]) + @pytest.mark.parametrize("nopython", [True, False]) + def test_numba_vs_cython(self, nogil, parallel, nopython): + def f(x, *args): + return np.sqrt(x) + np.sum(args) + 1 + + engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} + args = (2,) + + s = Series(range(10)) + result = s.rolling(2).apply( + f, args=args, engine="numba", engine_kwargs=engine_kwargs, raw=True + ) + expected = s.rolling(2).apply(f, engine="cython", args=args, raw=True) + tm.assert_series_equal(result, expected) From 54e74d1094e47b3b8fd8b552d44b98c45381f63b Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 16 Dec 2019 21:31:17 -0800 Subject: [PATCH 14/44] Remove typing for now --- pandas/core/window/rolling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 86bb51f341731..be7c351dadf54 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1270,7 +1270,7 @@ def count(self): def apply( self, - func: Callable, + func, raw: bool = False, engine: str = "cython", engine_kwargs: 
Optional[Dict] = None, From 04d353098259896cac0c2eb24487db51548661e7 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 16 Dec 2019 21:42:21 -0800 Subject: [PATCH 15/44] Remove sub description for doc failures? --- pandas/core/window/rolling.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index be7c351dadf54..4e20bbf34ac31 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1241,20 +1241,18 @@ def count(self): If you are just applying a NumPy reduction function this will achieve much better performance. engine : str, default 'cython' - Execution engine for the applied function. * ``'cython'`` : Runs rolling apply through C-extensions from cython. * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. Only available when ``raw`` is set to ``True``. engine_kwargs : dict, default None - Arguments to specify for the execution engine. * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` and ``parallel``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{'nopython': True, 'nogil': False, 'parallel': False}`` + ``{'nopython': True, 'nogil': False, 'parallel': False}``. args : tuple, default None - Positional arguments to be passed into func + Positional arguments to be passed into func. kwargs : dict, default None - Keyword arguments to be passed into func + Keyword arguments to be passed into func. 
Returns ------- From 4bbf5872365d4515bed285097c915d5e6db03d12 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 16 Dec 2019 22:03:02 -0800 Subject: [PATCH 16/44] Fix test function --- pandas/tests/window/test_numba.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index eeee83635e31a..b60ca94d9a799 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -14,7 +14,10 @@ class TestApply: @pytest.mark.parametrize("nopython", [True, False]) def test_numba_vs_cython(self, nogil, parallel, nopython): def f(x, *args): - return np.sqrt(x) + np.sum(args) + 1 + arg_sum = 0 + for arg in args: + arg_sum += arg + return np.mean(x) + arg_sum engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} args = (2,) From f849bc7006be8d60f0739b173783f9d783c262ab Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 16 Dec 2019 22:16:23 -0800 Subject: [PATCH 17/44] test user predefined jit function, clarify docstring --- pandas/core/window/rolling.py | 3 ++- pandas/tests/window/conftest.py | 15 +++++++++++++++ pandas/tests/window/test_numba.py | 11 +++++++---- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 4e20bbf34ac31..8482fa52e8921 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1247,7 +1247,8 @@ def count(self): engine_kwargs : dict, default None * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel``. The default ``engine_kwargs`` for the ``'numba'`` engine is + and ``parallel`` dictionary keys. The values must either be ``True`` or ``False``. + The default ``engine_kwargs`` for the ``'numba'`` engine is ``{'nopython': True, 'nogil': False, 'parallel': False}``. 
args : tuple, default None Positional arguments to be passed into func. diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index 7ea4be25ca2a6..8f40278d95d12 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -47,3 +47,18 @@ def center(request): @pytest.fixture(params=[None, 1]) def min_periods(request): return request.param + + +@pytest.fixture(params=[True, False]) +def parallel(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def nogil(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def nopython(request): + return request.param diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index b60ca94d9a799..3a85cff9a73eb 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -9,16 +9,19 @@ @td.skip_if_no("numba", "0.46.0") class TestApply: - @pytest.mark.parametrize("nogil", [True, False]) - @pytest.mark.parametrize("parallel", [True, False]) - @pytest.mark.parametrize("nopython", [True, False]) - def test_numba_vs_cython(self, nogil, parallel, nopython): + @pytest.mark.parametrize("jit", [True, False]) + def test_numba_vs_cython(self, jit, nogil, parallel, nopython): def f(x, *args): arg_sum = 0 for arg in args: arg_sum += arg return np.mean(x) + arg_sum + if jit: + import numba + + f = numba.jit(f) + engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} args = (2,) From 0c30e48b2fb8f7864d46728c3204f49b2628f783 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 16 Dec 2019 22:21:03 -0800 Subject: [PATCH 18/44] Apply engine kwargs to function as well --- pandas/core/window/numba_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index e02597e25c145..aa0472f76068a 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -44,7 +44,7 @@ def 
make_rolling_apply(func): numba_func = func else: - @numba.generated_jit(nopython=nopython) + @numba.generated_jit(nopython=nopython, nogil=nogil, parallel=parallel) def numba_func(window, *_args): if getattr(np, func.__name__, False) is func: From c4c952ef4f096c5d19ae225c0d383dead63e8b78 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 16 Dec 2019 22:22:41 -0800 Subject: [PATCH 19/44] Clarify documentation --- pandas/core/window/rolling.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 8482fa52e8921..db832447f8830 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1232,7 +1232,8 @@ def count(self): ---------- func : function Must produce a single value from an ndarray input if ``raw=True`` - or a single value from a Series if ``raw=False``. + or a single value from a Series if ``raw=False``. Can also accept a + Numba JIT function with ``engine='numba'`` specified. raw : bool, default None * ``False`` : passes each row or column as a Series to the function. From 8645976e71a3dc34a3b488957cce4c82167651e7 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 16 Dec 2019 22:25:05 -0800 Subject: [PATCH 20/44] Clarify what engine_kwargs applies to --- pandas/core/window/rolling.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index db832447f8830..37b1ad0e0f521 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1250,7 +1250,8 @@ def count(self): * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` and ``parallel`` dictionary keys. The values must either be ``True`` or ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{'nopython': True, 'nogil': False, 'parallel': False}``.
+ ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be applied + to both the ``func`` and the ``apply`` rolling aggregation. args : tuple, default None Positional arguments to be passed into func. kwargs : dict, default None From 987c91697fc747fe7f8e951d53fbe0cf3476cbc8 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 16 Dec 2019 23:20:35 -0800 Subject: [PATCH 21/44] Start section for numba rolling apply --- doc/source/user_guide/computation.rst | 19 +++++++++++++++++++ pandas/core/window/rolling.py | 4 ++++ 2 files changed, 23 insertions(+) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 627a83b7359bb..7e618e9363c08 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -321,6 +321,11 @@ We provide a number of common statistical functions: :meth:`~Rolling.cov`, Unbiased covariance (binary) :meth:`~Rolling.corr`, Correlation (binary) +.. _stats.rolling_apply: + +Rolling Apply +~~~~~~~~~~~~~ + The :meth:`~Rolling.apply` function takes an extra ``func`` argument and performs generic rolling computations. The ``func`` argument should be a single function that produces a single value from an ndarray input. Suppose we wanted to @@ -334,6 +339,20 @@ compute the mean absolute deviation on a rolling basis: @savefig rolling_apply_ex.png s.rolling(window=60).apply(mad, raw=True).plot(style='k') +Additionally, :meth:`~Rolling.apply` can leverage `Numba `__ +if installed as an optional dependency as the execution engine of the apply aggregation using the +``engine='numba'`` and ``engine_kwargs`` arguments (``raw`` must also be set to ``True``). +Numba will be applied in potentially two routines: + +1. If ``func`` is a standard Python function, the engine will JIT the passed function. ``func`` +can also be a pre-JIT function in which case the engine will not JIT the function again. +2. 
The engine will JIT the for loop where the apply function is applied to each window. + +The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the +`numba.jit decorator `__. +These keyword arguments will be applied to *both* the passed function (if a standard Python function) +and the apply for loop. Currently only ``nogil``, ``nopython``, and ``parallel`` are supported. + .. _stats.rolling_window: Rolling windows diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 37b1ad0e0f521..c00b88956325e 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1266,6 +1266,10 @@ def count(self): -------- Series.%(name)s : Series %(name)s. DataFrame.%(name)s : DataFrame %(name)s. + + Notes + ----- + See :ref:`stats.rolling_window` for extended documentation on the Numba engine. """ ) From b775684c9be43ba203f2777c7ddd013a271718be Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 16 Dec 2019 23:28:40 -0800 Subject: [PATCH 22/44] Lint --- pandas/core/window/rolling.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index c00b88956325e..d3da2cbbbbf42 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1248,10 +1248,10 @@ def count(self): engine_kwargs : dict, default None * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or ``False``. - The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be applied - to both the ``func`` and the ``apply`` rolling aggregation. + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. 
The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be + applied to both the ``func`` and the ``apply`` rolling aggregation. args : tuple, default None Positional arguments to be passed into func. kwargs : dict, default None @@ -1266,7 +1266,7 @@ def count(self): -------- Series.%(name)s : Series %(name)s. DataFrame.%(name)s : DataFrame %(name)s. - + Notes ----- See :ref:`stats.rolling_window` for extended documentation on the Numba engine. From 2e04e602ca860872b8409c80b49a68dec37e059c Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 17 Dec 2019 00:01:53 -0800 Subject: [PATCH 23/44] clarify note --- pandas/core/window/rolling.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index d3da2cbbbbf42..8e198e60933a0 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1269,7 +1269,8 @@ def count(self): Notes ----- - See :ref:`stats.rolling_window` for extended documentation on the Numba engine. + See :ref:`stats.rolling_window` for extended documentation and performance + considerations for the Numba engine. 
""" ) From 0c140330b36ad25ff86d1b4ce3d615d050517928 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 18 Dec 2019 23:27:47 -0800 Subject: [PATCH 24/44] Add apply function cache to save compiled numba functions --- pandas/core/window/common.py | 1 + pandas/core/window/numba_.py | 10 +++++++++- pandas/core/window/rolling.py | 24 +++++++++++++++++++----- 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index c7d856e9a1e88..0e7a877cbc69b 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -70,6 +70,7 @@ def _apply( floor: int = 1, is_weighted: bool = False, name: Optional[str] = None, + use_numba_cache: Optional = False, **kwargs, ): """ diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index aa0472f76068a..d4b693b7dd988 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -6,7 +6,11 @@ def _generate_numba_apply_func( - args: Tuple, kwargs: Dict, func: Callable, engine_kwargs: Optional[Dict] + args: Tuple, + kwargs: Dict, + func: Callable, + engine_kwargs: Optional[Dict], + function_cache: Dict, ): """ Generate a numba jitted apply function specified by values from engine_kwargs. 
@@ -37,6 +41,10 @@ def _generate_numba_apply_func( else: loop_range = range + # Return an already compiled version of roll_apply if available + if func in function_cache: + return function_cache[func] + def make_rolling_apply(func): if isinstance(func, numba.targets.registry.CPUDispatcher): diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 8e198e60933a0..62d9605d30bb1 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -93,6 +93,7 @@ def __init__( self.win_freq = None self.axis = obj._get_axis_number(axis) if axis is not None else None self.validate() + self._numba_func_cache = dict() @property def _constructor(self): @@ -443,6 +444,7 @@ def _apply( floor: int = 1, is_weighted: bool = False, name: Optional[str] = None, + use_numba_cache: Optional[bool] = False, **kwargs, ): """ @@ -455,10 +457,11 @@ def _apply( func : callable function to apply center : bool require_min_periods : int - floor: int - is_weighted - name: str, + floor : int + is_weighted : bool + name : str, compatibility with groupby.rolling + use_numba_cache : bool **kwargs additional arguments for rolling function and window function @@ -533,6 +536,9 @@ def calc(x): result = calc(values) result = np.asarray(result) + if use_numba_cache: + self._numba_func_cache[name] = func + if center: result = self._center_window(result, window) @@ -1303,13 +1309,21 @@ def apply( elif engine == "numba": if raw is False: raise ValueError("raw must be `True` when using the numba engine") - apply_func = _generate_numba_apply_func(args, kwargs, func, engine_kwargs) + apply_func = _generate_numba_apply_func( + args, kwargs, func, engine_kwargs, self._numba_func_cache + ) else: raise ValueError("engine must be either 'numba' or 'cython'") # TODO: Why do we always pass center=False? 
# name=func for WindowGroupByMixin._apply - return self._apply(apply_func, center=False, floor=0, name=func) + return self._apply( + apply_func, + center=False, + floor=0, + name=func, + use_numba_cache=engine == "numba", + ) def _generate_cython_apply_func(self, args, kwargs, raw, offset, func): from pandas import Series From c7106dc47535d34b82ad388271320d762f596014 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 18 Dec 2019 23:51:36 -0800 Subject: [PATCH 25/44] Add performance example --- doc/source/user_guide/computation.rst | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 7e618e9363c08..643a4e9ca69b4 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -353,6 +353,30 @@ The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be These keyword arguments will be applied to *both* the passed function (if a standard Python function) and the apply for loop. Currently only ``nogil``, ``nopython``, and ``parallel`` are supported. +.. note:: + + In terms of performance, **the first time a function is run using the Numba engine will be slow** + as Numba will have some function compilation overhead. However, `rolling` objects will cache + the function and subsequent calls will be fast. In general, the Numba engine is performant with + a larger amount of data points (e.g. 1+ million). + +.. code-block:: ipython + + In [1]: data = pd.Series(range(1000000)) + + In [2]: roll = data.rolling(10) + + In [3]: f = lambda x: np.sum(x) + 5 + # Ran the first time, compilation time will affect performance + In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True) + 1.23 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each) + # Function is cached and performance will improve + In [5]: %timeit roll.apply(f, engine='numba', raw=True) + 188 ms ± 1.93 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each) + + In [6]: %timeit roll.apply(f, engine='cython', raw=True) + 3.92 s ± 59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + .. _stats.rolling_window: Rolling windows From 2846faf0e85e5487e1d75d66950870e941dfb976 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Fri, 20 Dec 2019 20:01:31 -0800 Subject: [PATCH 26/44] Remove whitespace --- doc/source/user_guide/computation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 643a4e9ca69b4..67b0e5c0098b4 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -363,7 +363,7 @@ and the apply for loop. Currently only ``nogil``, ``nopython``, and ``parallel`` .. code-block:: ipython In [1]: data = pd.Series(range(1000000)) - + In [2]: roll = data.rolling(10) In [3]: f = lambda x: np.sum(x) + 5 From 5a645c0a54ef6f32b4a6d59caabb7673547be90b Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sat, 21 Dec 2019 20:31:45 -0800 Subject: [PATCH 27/44] Address lint errors and separate apply tests --- doc/source/user_guide/computation.rst | 3 +- pandas/core/window/numba_.py | 5 +- pandas/tests/window/conftest.py | 5 + pandas/tests/window/test_apply.py | 114 ++++++++++++++++++ pandas/tests/window/test_moments.py | 51 -------- pandas/tests/window/test_timeseries_window.py | 30 ----- 6 files changed, 125 insertions(+), 83 deletions(-) create mode 100644 pandas/tests/window/test_apply.py diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 67b0e5c0098b4..259c274cfc369 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -366,7 +366,8 @@ and the apply for loop. 
Currently only ``nogil``, ``nopython``, and ``parallel`` In [2]: roll = data.rolling(10) - In [3]: f = lambda x: np.sum(x) + 5 + In [3]: def f(x): + ...: return np.sum(x) + 5 # Ran the first time, compilation time will affect performance In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True) 1.23 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each) diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index d4b693b7dd988..284d566a14949 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -1,3 +1,4 @@ +import types from typing import Callable, Dict, Optional, Tuple import numpy as np @@ -54,7 +55,9 @@ def make_rolling_apply(func): @numba.generated_jit(nopython=nopython, nogil=nogil, parallel=parallel) def numba_func(window, *_args): - if getattr(np, func.__name__, False) is func: + if getattr(np, func.__name__, False) is func or isinstance( + func, types.BuiltinFunctionType + ): def impl(window, *_args): return func(window, *_args) diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index 8f40278d95d12..21b57d35d04a7 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -62,3 +62,8 @@ def nogil(request): @pytest.fixture(params=[True, False]) def nopython(request): return request.param + + +@pytest.fixture(params=["numba", "cython"]) +def engine(request): + return request.param diff --git a/pandas/tests/window/test_apply.py b/pandas/tests/window/test_apply.py new file mode 100644 index 0000000000000..965d8e3616f4c --- /dev/null +++ b/pandas/tests/window/test_apply.py @@ -0,0 +1,114 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Series, Timestamp, date_range +import pandas.util.testing as tm + + +@pytest.mark.parametrize("bad_raw", [None, 1, 0]) +def test_rolling_apply_invalid_raw(bad_raw): + with pytest.raises(ValueError, match="raw parameter must be `True` or `False`"): + Series(range(3)).rolling(1).apply(len, raw=bad_raw) 
+ + +def test_rolling_apply_out_of_bounds(engine, raw): + # gh-1850 + if engine == "numba": + raw = True + + vals = Series([1, 2, 3, 4]) + + result = vals.rolling(10).apply(np.sum, engine=engine, raw=raw) + assert result.isna().all() + + result = vals.rolling(10, min_periods=1).apply(np.sum, engine=engine, raw=raw) + expected = Series([1, 3, 6, 10], dtype=float) + tm.assert_almost_equal(result, expected) + + +@pytest.mark.parametrize("window", [2, "2s"]) +def test_rolling_apply_with_pandas_objects(window): + # 5071 + df = DataFrame( + {"A": np.random.randn(5), "B": np.random.randint(0, 10, size=5)}, + index=date_range("20130101", periods=5, freq="s"), + ) + + # we have an equal spaced timeseries index + # so simulate removing the first period + def f(x): + if x.index[0] == df.index[0]: + return np.nan + return x.iloc[-1] + + result = df.rolling(window).apply(f, raw=False) + expected = df.iloc[2:].reindex_like(df) + tm.assert_frame_equal(result, expected) + + with pytest.raises(AttributeError): + df.rolling(window).apply(f, raw=True) + + +def test_rolling_apply(engine, raw): + if engine == "numba": + raw = True + expected = Series([], dtype="float64") + result = expected.rolling(10).apply(lambda x: x.mean(), engine=engine, raw=raw) + tm.assert_series_equal(result, expected) + + # gh-8080 + s = Series([None, None, None]) + result = s.rolling(2, min_periods=0).apply(lambda x: len(x), engine=engine, raw=raw) + expected = Series([1.0, 2.0, 2.0]) + tm.assert_series_equal(result, expected) + + result = s.rolling(2, min_periods=0).apply(len, engine=engine, raw=raw) + tm.assert_series_equal(result, expected) + + +def test_all_apply(engine, raw): + if engine == "numba": + raw = True + + df = ( + DataFrame( + {"A": date_range("20130101", periods=5, freq="s"), "B": range(5)} + ).set_index("A") + * 2 + ) + er = df.rolling(window=1) + r = df.rolling(window="1s") + + result = r.apply(lambda x: 1, engine=engine, raw=raw) + expected = er.apply(lambda x: 1, engine=engine, raw=raw) + 
tm.assert_frame_equal(result, expected) + + +def test_ragged_apply(engine, raw): + if engine == "numba": + raw = True + + df = DataFrame({"B": range(5)}) + df.index = [ + Timestamp("20130101 09:00:00"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:05"), + Timestamp("20130101 09:00:06"), + ] + + f = lambda x: 1 + result = df.rolling(window="1s", min_periods=1).apply(f, engine=engine, raw=raw) + expected = df.copy() + expected["B"] = 1.0 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).apply(f, engine=engine, raw=raw) + expected = df.copy() + expected["B"] = 1.0 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).apply(f, engine=engine, raw=raw) + expected = df.copy() + expected["B"] = 1.0 + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_moments.py b/pandas/tests/window/test_moments.py index 2c65c9e2ac82c..756f1a37d9a9b 100644 --- a/pandas/tests/window/test_moments.py +++ b/pandas/tests/window/test_moments.py @@ -674,57 +674,6 @@ def f(x): self._check_moment_func(np.mean, name="apply", func=f, raw=raw) - expected = Series([], dtype="float64") - result = expected.rolling(10).apply(lambda x: x.mean(), raw=raw) - tm.assert_series_equal(result, expected) - - # gh-8080 - s = Series([None, None, None]) - result = s.rolling(2, min_periods=0).apply(lambda x: len(x), raw=raw) - expected = Series([1.0, 2.0, 2.0]) - tm.assert_series_equal(result, expected) - - result = s.rolling(2, min_periods=0).apply(len, raw=raw) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("bad_raw", [None, 1, 0]) - def test_rolling_apply_invalid_raw(self, bad_raw): - with pytest.raises(ValueError, match="raw parameter must be `True` or `False`"): - Series(range(3)).rolling(1).apply(len, raw=bad_raw) - - def test_rolling_apply_out_of_bounds(self, raw): - # gh-1850 - vals = pd.Series([1, 2, 3, 4]) - - result = 
vals.rolling(10).apply(np.sum, raw=raw) - assert result.isna().all() - - result = vals.rolling(10, min_periods=1).apply(np.sum, raw=raw) - expected = pd.Series([1, 3, 6, 10], dtype=float) - tm.assert_almost_equal(result, expected) - - @pytest.mark.parametrize("window", [2, "2s"]) - def test_rolling_apply_with_pandas_objects(self, window): - # 5071 - df = pd.DataFrame( - {"A": np.random.randn(5), "B": np.random.randint(0, 10, size=5)}, - index=pd.date_range("20130101", periods=5, freq="s"), - ) - - # we have an equal spaced timeseries index - # so simulate removing the first period - def f(x): - if x.index[0] == df.index[0]: - return np.nan - return x.iloc[-1] - - result = df.rolling(window).apply(f, raw=False) - expected = df.iloc[2:].reindex_like(df) - tm.assert_frame_equal(result, expected) - - with pytest.raises(AttributeError): - df.rolling(window).apply(f, raw=True) - def test_rolling_std(self, raw): self._check_moment_func(lambda x: np.std(x, ddof=1), name="std", raw=raw) self._check_moment_func( diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 46582b4b50c84..c0d47fc2ca624 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -566,26 +566,6 @@ def test_freqs_ops(self, freq, op, result_data): tm.assert_series_equal(result, expected) - def test_ragged_apply(self, raw): - - df = self.ragged - - f = lambda x: 1 - result = df.rolling(window="1s", min_periods=1).apply(f, raw=raw) - expected = df.copy() - expected["B"] = 1.0 - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="2s", min_periods=1).apply(f, raw=raw) - expected = df.copy() - expected["B"] = 1.0 - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="5s", min_periods=1).apply(f, raw=raw) - expected = df.copy() - expected["B"] = 1.0 - tm.assert_frame_equal(result, expected) - def test_all(self): # simple comparison of integer vs time-based 
windowing @@ -614,16 +594,6 @@ def test_all(self): expected = er.quantile(0.5) tm.assert_frame_equal(result, expected) - def test_all_apply(self, raw): - - df = self.regular * 2 - er = df.rolling(window=1) - r = df.rolling(window="1s") - - result = r.apply(lambda x: 1, raw=raw) - expected = er.apply(lambda x: 1, raw=raw) - tm.assert_frame_equal(result, expected) - def test_all2(self): # more sophisticated comparison of integer vs. From 6bac000ba2961544e01ff2f42d98c47a7048d5e9 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 22 Dec 2019 10:28:53 -0800 Subject: [PATCH 28/44] Add whatsnew note --- doc/source/whatsnew/v1.0.0.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index a15d5b319fc82..e829b559bb8b5 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -169,6 +169,16 @@ You can use the alias ``"boolean"`` as well. s = pd.Series([True, False, None], dtype="boolean") s +.. _whatsnew_1000.numba_rolling_apply: + +Using Numba in ``rolling.apply`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We've added an ``engine`` keyword to :meth:`~Rolling.apply` that allows the user to execute the +routine using `Numba `__ instead of Cython. Using the Numba engine +can yield significant performance gains if the apply function can operate on numpy arrays and +the data set is larger. For more details, see :ref:`rolling apply documentation ` + .. 
_whatsnew_1000.custom_window: Defining custom windows for rolling operations @@ -428,6 +438,8 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | matplotlib | 2.2.2 | | +-----------------+-----------------+---------+ +| numba | 0.46.0 | | ++-----------------+-----------------+---------+ | openpyxl | 2.5.7 | X | +-----------------+-----------------+---------+ | pyarrow | 0.12.0 | X | From 6f1c73f955285d027355283017238237b11f013f Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 22 Dec 2019 10:41:38 -0800 Subject: [PATCH 29/44] Skip apply tests for numba not installed, lint --- doc/source/user_guide/computation.rst | 2 +- pandas/tests/window/conftest.py | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 259c274cfc369..043e4d761d477 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -369,7 +369,7 @@ and the apply for loop. Currently only ``nogil``, ``nopython``, and ``parallel`` In [3]: def f(x): ...: return np.sum(x) + 5 # Ran the first time, compilation time will affect performance - In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True) + In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True) # noqa: E225 1.23 s ± 0 ns per loop (mean ± std. dev. 
of 1 run, 1 loop each) # Function is cached and performance will improve In [5]: %timeit roll.apply(f, engine='numba', raw=True) diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index 21b57d35d04a7..77846b0829fd3 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -1,3 +1,10 @@ +try: + import numba # noqa + + _HAVE_NUMBA = True +except ImportError: + _HAVE_NUMBA = False + import pytest @@ -64,6 +71,14 @@ def nopython(request): return request.param -@pytest.fixture(params=["numba", "cython"]) +@pytest.fixture( + params=[ + pytest.param( + "numba", + marks=pytest.mark.skipif(not _HAVE_NUMBA, reason="numba is not installed"), + ), + "cython", + ] +) def engine(request): return request.param From a8903379cd56a60ae927c9897918964bd46cb759 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 22 Dec 2019 11:59:28 -0800 Subject: [PATCH 30/44] Add typing --- pandas/core/window/rolling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 62d9605d30bb1..14137f8cd4ce9 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -93,7 +93,7 @@ def __init__( self.win_freq = None self.axis = obj._get_axis_number(axis) if axis is not None else None self.validate() - self._numba_func_cache = dict() + self._numba_func_cache: Dict = dict() @property def _constructor(self): From 0a9071cd72d9a958aa6d02eedd65bc0b9329d5ae Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 22 Dec 2019 12:58:04 -0800 Subject: [PATCH 31/44] Add more typing --- pandas/core/window/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 0e7a877cbc69b..80f6de1aecd74 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -70,7 +70,7 @@ def _apply( floor: int = 1, is_weighted: bool = False, name: Optional[str] = None, - 
use_numba_cache: Optional = False, + use_numba_cache: Optional[bool] = False, **kwargs, ): """ From 9d8d40b86b2b8ad2ff89a664f4463da4eedb0e81 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 22 Dec 2019 18:18:03 -0800 Subject: [PATCH 32/44] Formatting cleanups --- doc/source/user_guide/computation.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 043e4d761d477..84c0e960a14f8 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -356,13 +356,13 @@ and the apply for loop. Currently only ``nogil``, ``nopython``, and ``parallel`` .. note:: In terms of performance, **the first time a function is run using the Numba engine will be slow** - as Numba will have some function compilation overhead. However, `rolling` objects will cache + as Numba will have some function compilation overhead. However, ``rolling`` objects will cache the function and subsequent calls will be fast. In general, the Numba engine is performant with a larger amount of data points (e.g. 1+ million). .. 
code-block:: ipython - In [1]: data = pd.Series(range(1000000)) + In [1]: data = pd.Series(range(1_000_000)) In [2]: roll = data.rolling(10) From a429206771582227958c914a3b2756a85fb96260 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 23 Dec 2019 17:30:53 -0800 Subject: [PATCH 33/44] Address Jeff's comments --- doc/source/user_guide/computation.rst | 13 +++++---- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/window/numba_.py | 21 +++++++++++++-- pandas/core/window/rolling.py | 13 +++++++-- pandas/tests/window/conftest.py | 13 ++++++--- pandas/tests/window/test_api.py | 2 +- pandas/tests/window/test_numba.py | 39 +++++++++++++++++++++++++++ 7 files changed, 88 insertions(+), 14 deletions(-) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 84c0e960a14f8..31ab87e831d30 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -339,19 +339,22 @@ compute the mean absolute deviation on a rolling basis: @savefig rolling_apply_ex.png s.rolling(window=60).apply(mad, raw=True).plot(style='k') +.. versionadded:: 1.0 + Additionally, :meth:`~Rolling.apply` can leverage `Numba `__ -if installed as an optional dependency as the execution engine of the apply aggregation using the +if installed as an optional dependency. The apply aggregation can be executed using Numba by specifying ``engine='numba'`` and ``engine_kwargs`` arguments (``raw`` must also be set to ``True``). Numba will be applied in potentially two routines: -1. If ``func`` is a standard Python function, the engine will JIT the passed function. ``func`` -can also be a pre-JIT function in which case the engine will not JIT the function again. +1. If ``func`` is a standard Python function, the engine will `JIT `__ +the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again. 2. The engine will JIT the for loop where the apply function is applied to each window. 
The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the `numba.jit decorator `__. These keyword arguments will be applied to *both* the passed function (if a standard Python function) -and the apply for loop. Currently only ``nogil``, ``nopython``, and ``parallel`` are supported. +and the apply for loop. Currently only ``nogil``, ``nopython``, and ``parallel`` are supported, +and their default values are set to ``False``, ``True`` and ``False`` respectively. .. note:: @@ -368,7 +371,7 @@ and the apply for loop. Currently only ``nogil``, ``nopython``, and ``parallel`` In [3]: def f(x): ...: return np.sum(x) + 5 - # Ran the first time, compilation time will affect performance + # Run the first time, compilation time will affect performance In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True) # noqa: E225 1.23 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each) # Function is cached and performance will improve diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index f8d8c0a3b593a..0b18983ba10f1 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -178,6 +178,7 @@ We've added an ``engine`` keyword to :meth:`~Rolling.apply` that allows the user routine using `Numba `__ instead of Cython. Using the Numba engine can yield significant performance gains if the apply function can operate on numpy arrays and the data set is larger. For more details, see :ref:`rolling apply documentation ` +(:issue:`28987`) .. 
_whatsnew_1000.custom_window: diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index 284d566a14949..1fa4483e2be5e 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -6,7 +6,7 @@ from pandas.compat._optional import import_optional_dependency -def _generate_numba_apply_func( +def generate_numba_apply_func( args: Tuple, kwargs: Dict, func: Callable, @@ -21,11 +21,28 @@ def _generate_numba_apply_func( Configurations specified in engine_kwargs apply to both the user's function _AND_ the rolling apply function. + + Parameters + ---------- + args : tuple + *args to be passed into the function + kwargs : dict + **kwargs to be passed into the function + func : function + function to be applied to each window and will be JITed + engine_kwargs : dict + dictionary of arguments to be passed into numba.jit + function_cache : dict + dictionary of cached apply function to avoid re-compiling the apply loop + + Returns + ------- + Numba function """ numba = import_optional_dependency("numba") if engine_kwargs is None: - engine_kwargs = {"nopython": True, "nogil": False, "parallel": False} + engine_kwargs = {} nopython = engine_kwargs.get("nopython", True) nogil = engine_kwargs.get("nogil", False) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 98be912f8106e..43cf46d97c3e5 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -54,7 +54,7 @@ FixedWindowIndexer, VariableWindowIndexer, ) -from pandas.core.window.numba_ import _generate_numba_apply_func +from pandas.core.window.numba_ import generate_numba_apply_func class _Window(PandasObject, ShallowMixin, SelectionMixin): @@ -1240,6 +1240,9 @@ def count(self): Must produce a single value from an ndarray input if ``raw=True`` or a single value from a Series if ``raw=False``. Can also accept a Numba JIT function with ``engine='numba'`` specified. + + .. 
versionchanged:: 1.0.0 + raw : bool, default None * ``False`` : passes each row or column as a Series to the function. @@ -1251,6 +1254,9 @@ def count(self): * ``'cython'`` : Runs rolling apply through C-extensions from cython. * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. Only available when ``raw`` is set to ``True``. + + .. versionadded:: 1.0.0 + engine_kwargs : dict, default None * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` @@ -1258,6 +1264,9 @@ def count(self): ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be applied to both the ``func`` and the ``apply`` rolling aggregation. + + .. versionadded:: 1.0.0 + args : tuple, default None Positional arguments to be passed into func. kwargs : dict, default None @@ -1309,7 +1318,7 @@ def apply( elif engine == "numba": if raw is False: raise ValueError("raw must be `True` when using the numba engine") - apply_func = _generate_numba_apply_func( + apply_func = generate_numba_apply_func( args, kwargs, func, engine_kwargs, self._numba_func_cache ) else: diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index 77846b0829fd3..4baaa27f57658 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -1,12 +1,13 @@ -try: - import numba # noqa +import pytest + +from pandas.compat._optional import import_optional_dependency +try: + import_optional_dependency('numba') # noqa _HAVE_NUMBA = True except ImportError: _HAVE_NUMBA = False -import pytest - @pytest.fixture(params=[True, False]) def raw(request): @@ -58,16 +59,19 @@ def min_periods(request): @pytest.fixture(params=[True, False]) def parallel(request): + """parallel keyword argument for numba.jit""" return request.param @pytest.fixture(params=[True, False]) def nogil(request): + """nogil keyword argument for 
numba.jit""" return request.param @pytest.fixture(params=[True, False]) def nopython(request): + """nopython keyword argument for numba.jit""" return request.param @@ -81,4 +85,5 @@ def nopython(request): ] ) def engine(request): + """engine keyword argument for rolling.apply""" return request.param diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 518da688d72bf..b0e175acfb922 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -344,7 +344,6 @@ def test_multiple_agg_funcs(self, func, window_size, expected_vals): tm.assert_frame_equal(result, expected) -@td.skip_if_no("numba", "0.46.0") class TestEngine: def test_invalid_engine(self): with pytest.raises( @@ -366,6 +365,7 @@ def test_invalid_raw_numba(self): ): Series(range(1)).rolling(1).apply(lambda x: x, raw=False, engine="numba") + @td.skip_if_no("numba", "0.46.0") def test_invalid_kwargs_nopython(self): with pytest.raises(ValueError, match="numba does not support kwargs with"): Series(range(1)).rolling(1).apply( diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 3a85cff9a73eb..66e4d4e2e7145 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -31,3 +31,42 @@ def f(x, *args): ) expected = s.rolling(2).apply(f, engine="cython", args=args, raw=True) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("jit", [True, False]) + def test_cache(self, jit, nogil, parallel, nopython): + # Test that the functions are cached correctly if we switch functions + def func_1(x): + return np.mean(x) + 4 + + def func_2(x): + return np.std(x) * 5 + + if jit: + import numba + + func_1 = numba.jit(func_1) + func_2 = numba.jit(func_2) + + engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} + + roll = Series(range(10)).rolling(2) + result = roll.apply( + func_1, engine="numba", engine_kwargs=engine_kwargs, raw=True + ) + expected = roll.apply(func_1, 
engine="cython", raw=True) + tm.assert_series_equal(result, expected) + + # func_1 should be in the cache now + assert func_1 in roll._numba_func_cache + + result = roll.apply( + func_2, engine="numba", engine_kwargs=engine_kwargs, raw=True + ) + expected = roll.apply(func_2, engine="cython", raw=True) + tm.assert_series_equal(result, expected) + # This run should use the cached func_1 + result = roll.apply( + func_1, engine="numba", engine_kwargs=engine_kwargs, raw=True + ) + expected = roll.apply(func_1, engine="cython", raw=True) + tm.assert_series_equal(result, expected) From 5826ad9d235d316f3f84d64aa917328ace557932 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 23 Dec 2019 17:31:14 -0800 Subject: [PATCH 34/44] Black --- pandas/tests/window/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index 4baaa27f57658..a8188d484aae6 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -3,7 +3,7 @@ from pandas.compat._optional import import_optional_dependency try: - import_optional_dependency('numba') # noqa + import_optional_dependency("numba") # noqa _HAVE_NUMBA = True except ImportError: _HAVE_NUMBA = False From cf7571b8f83dd95a931e9de2646d892fab6db376 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 23 Dec 2019 19:48:41 -0800 Subject: [PATCH 35/44] Add clarification --- doc/source/user_guide/computation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 31ab87e831d30..a2150c207c0b0 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -353,7 +353,7 @@ the passed function. ``func`` can also be a JITed function in which case the eng The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the `numba.jit decorator `__. 
These keyword arguments will be applied to *both* the passed function (if a standard Python function) -and the apply for loop. Currently only ``nogil``, ``nopython``, and ``parallel`` are supported, +and the apply for loop over each window. Currently only ``nogil``, ``nopython``, and ``parallel`` are supported, and their default values are set to ``False``, ``True`` and ``False`` respectively. .. note:: From 18eed60802e1eac77ed38bc1770b93f50d0d2d72 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 24 Dec 2019 10:51:54 -0800 Subject: [PATCH 36/44] Move function to module level --- pandas/core/window/numba_.py | 104 +++++++++++++++++----------------- pandas/core/window/rolling.py | 2 + 2 files changed, 53 insertions(+), 53 deletions(-) diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index 1fa4483e2be5e..1608536eecdf0 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -6,6 +6,56 @@ from pandas.compat._optional import import_optional_dependency +def make_rolling_apply(func, args, nogil, parallel, nopython): + numba = import_optional_dependency("numba") + + if parallel: + loop_range = numba.prange + else: + loop_range = range + + if isinstance(func, numba.targets.registry.CPUDispatcher): + # Don't jit a user passed jitted function + numba_func = func + else: + + @numba.generated_jit(nopython=nopython, nogil=nogil, parallel=parallel) + def numba_func(window, *_args): + if getattr(np, func.__name__, False) is func or isinstance( + func, types.BuiltinFunctionType + ): + + def impl(window, *_args): + return func(window, *_args) + + return impl + else: + jf = numba.jit(func, nopython=nopython) + + def impl(window, *_args): + return jf(window, *_args) + + return impl + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def roll_apply( + values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int, + ): + result = np.empty(len(begin)) + for i in loop_range(len(result)): + start = 
begin[i] + stop = end[i] + window = values[start:stop] + count_nan = np.sum(np.isnan(window)) + if len(window) - count_nan >= minimum_periods: + result[i] = numba_func(window, *args) + else: + result[i] = np.nan + return result + + return roll_apply + + def generate_numba_apply_func( args: Tuple, kwargs: Dict, @@ -39,7 +89,6 @@ def generate_numba_apply_func( ------- Numba function """ - numba = import_optional_dependency("numba") if engine_kwargs is None: engine_kwargs = {} @@ -54,59 +103,8 @@ def generate_numba_apply_func( "https://github.com/numba/numba/issues/2916" ) - if parallel: - loop_range = numba.prange - else: - loop_range = range - # Return an already compiled version of roll_apply if available if func in function_cache: return function_cache[func] - def make_rolling_apply(func): - - if isinstance(func, numba.targets.registry.CPUDispatcher): - # Don't jit a user passed jitted function - numba_func = func - else: - - @numba.generated_jit(nopython=nopython, nogil=nogil, parallel=parallel) - def numba_func(window, *_args): - if getattr(np, func.__name__, False) is func or isinstance( - func, types.BuiltinFunctionType - ): - - def impl(window, *_args): - return func(window, *_args) - - return impl - else: - jf = numba.jit(func, nopython=nopython) - - def impl(window, *_args): - return jf(window, *_args) - - return impl - - @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) - def roll_apply( - values: np.ndarray, - begin: np.ndarray, - end: np.ndarray, - minimum_periods: int, - ): - result = np.empty(len(begin)) - for i in loop_range(len(result)): - start = begin[i] - stop = end[i] - window = values[start:stop] - count_nan = np.sum(np.isnan(window)) - if len(window) - count_nan >= minimum_periods: - result[i] = numba_func(window, *args) - else: - result[i] = np.nan - return result - - return roll_apply - - return make_rolling_apply(func) + return make_rolling_apply(func, args, nogil, parallel, nopython) diff --git a/pandas/core/window/rolling.py 
b/pandas/core/window/rolling.py index 43cf46d97c3e5..5208cd5d11c94 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -462,6 +462,8 @@ def _apply( name : str, compatibility with groupby.rolling use_numba_cache : bool + whether to cache a numba compiled function. Only available for numba + enabled methods (so far only apply) **kwargs additional arguments for rolling function and window function From f715b55043c7ae5a7e6981bf1694611951159272 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 24 Dec 2019 11:30:01 -0800 Subject: [PATCH 37/44] move cache check higher up --- pandas/core/window/numba_.py | 7 ------- pandas/core/window/rolling.py | 10 +++++++--- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index 1608536eecdf0..302afda6c402c 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -61,7 +61,6 @@ def generate_numba_apply_func( kwargs: Dict, func: Callable, engine_kwargs: Optional[Dict], - function_cache: Dict, ): """ Generate a numba jitted apply function specified by values from engine_kwargs. 
@@ -82,8 +81,6 @@ def generate_numba_apply_func( function to be applied to each window and will be JITed engine_kwargs : dict dictionary of arguments to be passed into numba.jit - function_cache : dict - dictionary of cached apply function to avoid re-compiling the apply loop Returns ------- @@ -103,8 +100,4 @@ def generate_numba_apply_func( "https://github.com/numba/numba/issues/2916" ) - # Return an already compiled version of roll_apply if available - if func in function_cache: - return function_cache[func] - return make_rolling_apply(func, args, nogil, parallel, nopython) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 5208cd5d11c94..4d657b68b2ae0 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1320,9 +1320,13 @@ def apply( elif engine == "numba": if raw is False: raise ValueError("raw must be `True` when using the numba engine") - apply_func = generate_numba_apply_func( - args, kwargs, func, engine_kwargs, self._numba_func_cache - ) + if func in self._numba_func_cache: + # Return an already compiled version of roll_apply if available + apply_func = self._numba_func_cache[func] + else: + apply_func = generate_numba_apply_func( + args, kwargs, func, engine_kwargs + ) else: raise ValueError("engine must be either 'numba' or 'cython'") From 6a765bf1e734d29ceb899b3609f7a7aea0f139fa Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 24 Dec 2019 12:08:59 -0800 Subject: [PATCH 38/44] Address Will's comments --- pandas/core/window/numba_.py | 20 ++++++++------------ pandas/tests/window/conftest.py | 16 ++-------------- 2 files changed, 10 insertions(+), 26 deletions(-) diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index 302afda6c402c..27efec35089ec 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -1,5 +1,5 @@ import types -from typing import Callable, Dict, Optional, Tuple +from typing import Any, Callable, Dict, Optional, Tuple import 
numpy as np @@ -24,23 +24,19 @@ def numba_func(window, *_args): if getattr(np, func.__name__, False) is func or isinstance( func, types.BuiltinFunctionType ): - - def impl(window, *_args): - return func(window, *_args) - - return impl + jf = func else: jf = numba.jit(func, nopython=nopython) - def impl(window, *_args): - return jf(window, *_args) + def impl(window, *_args): + return jf(window, *_args) - return impl + return impl @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) def roll_apply( values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int, - ): + ) -> np.ndarray: result = np.empty(len(begin)) for i in loop_range(len(result)): start = begin[i] @@ -58,9 +54,9 @@ def roll_apply( def generate_numba_apply_func( args: Tuple, - kwargs: Dict, + kwargs: Dict[str, Any], func: Callable, - engine_kwargs: Optional[Dict], + engine_kwargs: Optional[Dict[str, bool]], ): """ Generate a numba jitted apply function specified by values from engine_kwargs. diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index a8188d484aae6..2a2ff95f599ba 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -1,12 +1,6 @@ import pytest -from pandas.compat._optional import import_optional_dependency - -try: - import_optional_dependency("numba") # noqa - _HAVE_NUMBA = True -except ImportError: - _HAVE_NUMBA = False +import pandas.util._test_decorators as td @pytest.fixture(params=[True, False]) @@ -76,13 +70,7 @@ def nopython(request): @pytest.fixture( - params=[ - pytest.param( - "numba", - marks=pytest.mark.skipif(not _HAVE_NUMBA, reason="numba is not installed"), - ), - "cython", - ] + params=[pytest.param("numba", marks=td.skip_if_no("numba", "0.46.0")), "cython"] ) def engine(request): """engine keyword argument for rolling.apply""" From af3fe500e2f0155fdff884578a326475c3f9b6e7 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 24 Dec 2019 12:12:05 -0800 Subject: [PATCH 39/44] Type Callable 
in generate_numba_apply_func --- pandas/core/window/numba_.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index 27efec35089ec..024f7c50f9b45 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -3,6 +3,7 @@ import numpy as np +from pandas._typing import Scalar from pandas.compat._optional import import_optional_dependency @@ -55,7 +56,7 @@ def roll_apply( def generate_numba_apply_func( args: Tuple, kwargs: Dict[str, Any], - func: Callable, + func: Callable[[np.ndarray, ...], Scalar], engine_kwargs: Optional[Dict[str, bool]], ): """ From f7dfcf4e9f7c83b0e27c4179a89d4b35035fb45a Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 24 Dec 2019 14:28:13 -0800 Subject: [PATCH 40/44] use ellipsis, cannot specify np.ndarray as well --- pandas/core/window/numba_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index 024f7c50f9b45..af6491e183e80 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -56,7 +56,7 @@ def roll_apply( def generate_numba_apply_func( args: Tuple, kwargs: Dict[str, Any], - func: Callable[[np.ndarray, ...], Scalar], + func: Callable[..., Scalar], engine_kwargs: Optional[Dict[str, bool]], ): """ From a42a9603a498dd54fd6266fe13419209e1eb2e12 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 24 Dec 2019 14:31:45 -0800 Subject: [PATCH 41/44] Remove trailing whitespace in apply docstring --- pandas/core/window/rolling.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 4d657b68b2ae0..033b6777e42ca 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1242,9 +1242,9 @@ def count(self): Must produce a single value from an ndarray input if ``raw=True`` or a single value from a Series if ``raw=False``. 
Can also accept a Numba JIT function with ``engine='numba'`` specified. - + .. versionchanged:: 1.0.0 - + raw : bool, default None * ``False`` : passes each row or column as a Series to the function. @@ -1256,7 +1256,7 @@ def count(self): * ``'cython'`` : Runs rolling apply through C-extensions from cython. * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. Only available when ``raw`` is set to ``True``. - + .. versionadded:: 1.0.0 engine_kwargs : dict, default None @@ -1266,9 +1266,9 @@ def count(self): ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be applied to both the ``func`` and the ``apply`` rolling aggregation. - + .. versionadded:: 1.0.0 - + args : tuple, default None Positional arguments to be passed into func. kwargs : dict, default None From d01983029a936482513a6322946a1232f1610f0e Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 24 Dec 2019 20:03:52 -0800 Subject: [PATCH 42/44] Address Will's and Brock's comments --- doc/source/whatsnew/v1.0.0.rst | 6 ++-- pandas/core/window/numba_.py | 8 ++++- pandas/core/window/rolling.py | 4 +-- pandas/tests/window/conftest.py | 12 ++++++++ pandas/tests/window/test_api.py | 29 ------------------ pandas/tests/window/test_apply.py | 50 +++++++++++++++++++++++-------- 6 files changed, 62 insertions(+), 47 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 0ab633b5f7c5e..dd0b332be9c64 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -177,8 +177,8 @@ Using Numba in ``rolling.apply`` We've added an ``engine`` keyword to :meth:`~Rolling.apply` that allows the user to execute the routine using `Numba `__ instead of Cython. Using the Numba engine can yield significant performance gains if the apply function can operate on numpy arrays and -the data set is larger. 
For more details, see :ref:`rolling apply documentation ` -(:issue:`28987`) +the data set is larger (1 million rows or greater). For more details, see +:ref:`rolling apply documentation ` (:issue:`28987`) .. _whatsnew_1000.custom_window: @@ -439,7 +439,7 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | matplotlib | 2.2.2 | | +-----------------+-----------------+---------+ -| numba | 0.46.0 | | +| numba | 0.46.0 | X | +-----------------+-----------------+---------+ | openpyxl | 2.5.7 | X | +-----------------+-----------------+---------+ diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index af6491e183e80..fdc5c32c453b1 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -7,7 +7,13 @@ from pandas.compat._optional import import_optional_dependency -def make_rolling_apply(func, args, nogil, parallel, nopython): +def make_rolling_apply( + func: Callable[..., Scalar], + args: Tuple, + nogil: bool, + parallel: bool, + nopython: bool, +): numba = import_optional_dependency("numba") if parallel: diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 033b6777e42ca..d2618debf3c32 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -93,7 +93,7 @@ def __init__( self.win_freq = None self.axis = obj._get_axis_number(axis) if axis is not None else None self.validate() - self._numba_func_cache: Dict = dict() + self._numba_func_cache: Dict[Callable, Callable] = dict() @property def _constructor(self): @@ -444,7 +444,7 @@ def _apply( floor: int = 1, is_weighted: bool = False, name: Optional[str] = None, - use_numba_cache: Optional[bool] = False, + use_numba_cache: bool = False, **kwargs, ): """ diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index 2a2ff95f599ba..fb46ca51ace58 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ 
-75,3 +75,15 @@ def nopython(request): def engine(request): """engine keyword argument for rolling.apply""" return request.param + + +@pytest.fixture( + params=[ + pytest.param(("numba", True), marks=td.skip_if_no("numba", "0.46.0")), + ("cython", True), + ("cython", False), + ] +) +def engine_and_raw(request): + """engine and raw keyword arguments for rolling.apply""" + return request.param diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index b0e175acfb922..5085576cc96f0 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -342,32 +342,3 @@ def test_multiple_agg_funcs(self, func, window_size, expected_vals): ) tm.assert_frame_equal(result, expected) - - -class TestEngine: - def test_invalid_engine(self): - with pytest.raises( - ValueError, match="engine must be either 'numba' or 'cython'" - ): - Series(range(1)).rolling(1).apply(lambda x: x, engine="foo") - - def test_invalid_engine_kwargs_cython(self): - with pytest.raises( - ValueError, match="cython engine does not accept engine_kwargs" - ): - Series(range(1)).rolling(1).apply( - lambda x: x, engine="cython", engine_kwargs={"nopython": False} - ) - - def test_invalid_raw_numba(self): - with pytest.raises( - ValueError, match="raw must be `True` when using the numba engine" - ): - Series(range(1)).rolling(1).apply(lambda x: x, raw=False, engine="numba") - - @td.skip_if_no("numba", "0.46.0") - def test_invalid_kwargs_nopython(self): - with pytest.raises(ValueError, match="numba does not support kwargs with"): - Series(range(1)).rolling(1).apply( - lambda x: x, kwargs={"a": 1}, engine="numba", raw=True - ) diff --git a/pandas/tests/window/test_apply.py b/pandas/tests/window/test_apply.py index 965d8e3616f4c..4b56cbd48c388 100644 --- a/pandas/tests/window/test_apply.py +++ b/pandas/tests/window/test_apply.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import DataFrame, Series, Timestamp, 
date_range import pandas.util.testing as tm @@ -11,10 +13,9 @@ def test_rolling_apply_invalid_raw(bad_raw): Series(range(3)).rolling(1).apply(len, raw=bad_raw) -def test_rolling_apply_out_of_bounds(engine, raw): +def test_rolling_apply_out_of_bounds(engine_and_raw): # gh-1850 - if engine == "numba": - raw = True + engine, raw = engine_and_raw vals = Series([1, 2, 3, 4]) @@ -49,9 +50,9 @@ def f(x): df.rolling(window).apply(f, raw=True) -def test_rolling_apply(engine, raw): - if engine == "numba": - raw = True +def test_rolling_apply(engine_and_raw): + engine, raw = engine_and_raw + expected = Series([], dtype="float64") result = expected.rolling(10).apply(lambda x: x.mean(), engine=engine, raw=raw) tm.assert_series_equal(result, expected) @@ -66,9 +67,8 @@ def test_rolling_apply(engine, raw): tm.assert_series_equal(result, expected) -def test_all_apply(engine, raw): - if engine == "numba": - raw = True +def test_all_apply(engine_and_raw): + engine, raw = engine_and_raw df = ( DataFrame( @@ -84,9 +84,8 @@ def test_all_apply(engine, raw): tm.assert_frame_equal(result, expected) -def test_ragged_apply(engine, raw): - if engine == "numba": - raw = True +def test_ragged_apply(engine_and_raw): + engine, raw = engine_and_raw df = DataFrame({"B": range(5)}) df.index = [ @@ -112,3 +111,30 @@ def test_ragged_apply(engine, raw): expected = df.copy() expected["B"] = 1.0 tm.assert_frame_equal(result, expected) + + +def test_invalid_engine(): + with pytest.raises(ValueError, match="engine must be either 'numba' or 'cython'"): + Series(range(1)).rolling(1).apply(lambda x: x, engine="foo") + + +def test_invalid_engine_kwargs_cython(): + with pytest.raises(ValueError, match="cython engine does not accept engine_kwargs"): + Series(range(1)).rolling(1).apply( + lambda x: x, engine="cython", engine_kwargs={"nopython": False} + ) + + +def test_invalid_raw_numba(): + with pytest.raises( + ValueError, match="raw must be `True` when using the numba engine" + ): + 
Series(range(1)).rolling(1).apply(lambda x: x, raw=False, engine="numba") + + +@td.skip_if_no("numba") +def test_invalid_kwargs_nopython(): + with pytest.raises(ValueError, match="numba does not support kwargs with"): + Series(range(1)).rolling(1).apply( + lambda x: x, kwargs={"a": 1}, engine="numba", raw=True + ) From 29d145fd74cdb07126b63e9599e32a99d0b975b4 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 24 Dec 2019 21:02:16 -0800 Subject: [PATCH 43/44] Fix typing --- pandas/core/window/rolling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index d2618debf3c32..e1cf5a6b09a15 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -93,7 +93,7 @@ def __init__( self.win_freq = None self.axis = obj._get_axis_number(axis) if axis is not None else None self.validate() - self._numba_func_cache: Dict[Callable, Callable] = dict() + self._numba_func_cache: Dict[Optional[str], Callable] = dict() @property def _constructor(self): From a3da51eb3a301b8fdcf6991c5ea091d206ab21b8 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 26 Dec 2019 10:15:31 -0800 Subject: [PATCH 44/44] Address followup comments --- pandas/core/window/common.py | 2 +- pandas/core/window/numba_.py | 23 ++++++++++++++++++++++- pandas/core/window/rolling.py | 2 +- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 80f6de1aecd74..5b467b03c1fc2 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -70,7 +70,7 @@ def _apply( floor: int = 1, is_weighted: bool = False, name: Optional[str] = None, - use_numba_cache: Optional[bool] = False, + use_numba_cache: bool = False, **kwargs, ): """ diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index fdc5c32c453b1..127957943d2ff 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -14,6 +14,27 @@ 
def make_rolling_apply( parallel: bool, nopython: bool, ): + """ + Creates a JITted rolling apply function with a JITted version of + the user's function. + + Parameters + ---------- + func : function + function to be applied to each window and will be JITed + args : tuple + *args to be passed into the function + nogil : bool + nogil parameter from engine_kwargs for numba.jit + parallel : bool + parallel parameter from engine_kwargs for numba.jit + nopython : bool + nopython parameter from engine_kwargs for numba.jit + + Returns + ------- + Numba function + """ numba = import_optional_dependency("numba") if parallel: @@ -33,7 +54,7 @@ def numba_func(window, *_args): ): jf = func else: - jf = numba.jit(func, nopython=nopython) + jf = numba.jit(func, nopython=nopython, nogil=nogil) def impl(window, *_args): return jf(window, *_args) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index e1cf5a6b09a15..5b0fbbb3518d2 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1286,7 +1286,7 @@ def count(self): Notes ----- - See :ref:`stats.rolling_window` for extended documentation and performance + See :ref:`stats.rolling_apply` for extended documentation and performance considerations for the Numba engine. """ )