diff --git a/ci/deps/azure-36-minimum_versions.yaml b/ci/deps/azure-36-minimum_versions.yaml index 8bf4f70d18aec..de7e011d9c7ca 100644 --- a/ci/deps/azure-36-minimum_versions.yaml +++ b/ci/deps/azure-36-minimum_versions.yaml @@ -17,6 +17,7 @@ dependencies: - beautifulsoup4=4.6.0 - bottleneck=1.2.1 - jinja2=2.8 + - numba=0.46.0 - numexpr=2.6.2 - numpy=1.13.3 - openpyxl=2.5.7 diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index 2bd11c9030325..7fa9dee7445a6 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -17,6 +17,7 @@ dependencies: - bottleneck - fastparquet>=0.3.2 - matplotlib=3.0.2 + - numba - numexpr - numpy=1.15.* - openpyxl diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 62a39fb5176f9..77ceed484f9c3 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -255,6 +255,7 @@ gcsfs 0.2.2 Google Cloud Storage access html5lib HTML parser for read_html (see :ref:`note `) lxml 3.8.0 HTML parser for read_html (see :ref:`note `) matplotlib 2.2.2 Visualization +numba 0.46.0 Alternative execution engine for rolling operations openpyxl 2.5.7 Reading / writing for xlsx files pandas-gbq 0.8.0 Google Big Query access psycopg2 PostgreSQL engine for sqlalchemy diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 627a83b7359bb..a2150c207c0b0 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -321,6 +321,11 @@ We provide a number of common statistical functions: :meth:`~Rolling.cov`, Unbiased covariance (binary) :meth:`~Rolling.corr`, Correlation (binary) +.. _stats.rolling_apply: + +Rolling Apply +~~~~~~~~~~~~~ + The :meth:`~Rolling.apply` function takes an extra ``func`` argument and performs generic rolling computations. The ``func`` argument should be a single function that produces a single value from an ndarray input. 
Suppose we wanted to @@ -334,6 +339,48 @@ compute the mean absolute deviation on a rolling basis: @savefig rolling_apply_ex.png s.rolling(window=60).apply(mad, raw=True).plot(style='k') +.. versionadded:: 1.0 + +Additionally, :meth:`~Rolling.apply` can leverage `Numba <https://numba.pydata.org/>`__ +if installed as an optional dependency. The apply aggregation can be executed using Numba by specifying +``engine='numba'`` and ``engine_kwargs`` arguments (``raw`` must also be set to ``True``). +Numba will be applied in potentially two routines: + +1. If ``func`` is a standard Python function, the engine will `JIT <https://numba.pydata.org/numba-doc/latest/user/overview.html>`__ +the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again. +2. The engine will JIT the for loop where the apply function is applied to each window. + +The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the +`numba.jit decorator <https://numba.pydata.org/numba-doc/latest/reference/jit-compilation.html#numba.jit>`__. +These keyword arguments will be applied to *both* the passed function (if a standard Python function) +and the apply for loop over each window. Currently only ``nogil``, ``nopython``, and ``parallel`` are supported, +and their default values are set to ``False``, ``True`` and ``False`` respectively. + +.. note:: + + In terms of performance, **the first time a function is run using the Numba engine will be slow** + as Numba will have some function compilation overhead. However, ``rolling`` objects will cache + the function and subsequent calls will be fast. In general, the Numba engine is performant with + a larger amount of data points (e.g. 1+ million). + +.. code-block:: ipython + + In [1]: data = pd.Series(range(1_000_000)) + + In [2]: roll = data.rolling(10) + + In [3]: def f(x): + ...: return np.sum(x) + 5 + # Run the first time, compilation time will affect performance + In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True) # noqa: E225 + 1.23 s ± 0 ns per loop (mean ± std. dev. 
of 1 run, 1 loop each) + # Function is cached and performance will improve + In [5]: %timeit roll.apply(f, engine='numba', raw=True) + 188 ms ± 1.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) + + In [6]: %timeit roll.apply(f, engine='cython', raw=True) + 3.92 s ± 59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + .. _stats.rolling_window: Rolling windows diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 852694a51e79d..f0c2d4121d1c9 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -169,6 +169,17 @@ You can use the alias ``"boolean"`` as well. s = pd.Series([True, False, None], dtype="boolean") s +.. _whatsnew_1000.numba_rolling_apply: + +Using Numba in ``rolling.apply`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We've added an ``engine`` keyword to :meth:`~Rolling.apply` that allows the user to execute the +routine using `Numba <https://numba.pydata.org/numba-doc/latest/index.html>`__ instead of Cython. Using the Numba engine +can yield significant performance gains if the apply function can operate on numpy arrays and +the data set is larger (1 million rows or greater). For more details, see +:ref:`rolling apply documentation <stats.rolling_apply>` (:issue:`28987`) + .. 
_whatsnew_1000.custom_window: Defining custom windows for rolling operations @@ -428,6 +439,8 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | matplotlib | 2.2.2 | | +-----------------+-----------------+---------+ +| numba | 0.46.0 | X | ++-----------------+-----------------+---------+ | openpyxl | 2.5.7 | X | +-----------------+-----------------+---------+ | pyarrow | 0.12.0 | X | diff --git a/environment.yml b/environment.yml index f930458d0a855..73dda3cdc51e8 100644 --- a/environment.yml +++ b/environment.yml @@ -75,6 +75,7 @@ dependencies: - matplotlib>=2.2.2 # pandas.plotting, Series.plot, DataFrame.plot - numexpr>=2.6.8 - scipy>=1.1 + - numba>=0.46.0 # optional for io # --------------- diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 412293f029fa5..e498b01a562ec 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -27,6 +27,7 @@ "xlrd": "1.1.0", "xlwt": "1.2.0", "xlsxwriter": "0.9.8", + "numba": "0.46.0", } diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index c7d856e9a1e88..5b467b03c1fc2 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -70,6 +70,7 @@ def _apply( floor: int = 1, is_weighted: bool = False, name: Optional[str] = None, + use_numba_cache: bool = False, **kwargs, ): """ diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py new file mode 100644 index 0000000000000..127957943d2ff --- /dev/null +++ b/pandas/core/window/numba_.py @@ -0,0 +1,127 @@ +import types +from typing import Any, Callable, Dict, Optional, Tuple + +import numpy as np + +from pandas._typing import Scalar +from pandas.compat._optional import import_optional_dependency + + +def make_rolling_apply( + func: Callable[..., Scalar], + args: Tuple, + nogil: bool, + parallel: bool, + nopython: bool, +): + """ + Creates a JITted rolling apply function with a JITted version of + the 
user's function. + + Parameters + ---------- + func : function + function to be applied to each window and will be JITed + args : tuple + *args to be passed into the function + nogil : bool + nogil parameter from engine_kwargs for numba.jit + parallel : bool + parallel parameter from engine_kwargs for numba.jit + nopython : bool + nopython parameter from engine_kwargs for numba.jit + + Returns + ------- + Numba function + """ + numba = import_optional_dependency("numba") + + if parallel: + loop_range = numba.prange + else: + loop_range = range + + if isinstance(func, numba.targets.registry.CPUDispatcher): + # Don't jit a user passed jitted function + numba_func = func + else: + + @numba.generated_jit(nopython=nopython, nogil=nogil, parallel=parallel) + def numba_func(window, *_args): + if getattr(np, func.__name__, False) is func or isinstance( + func, types.BuiltinFunctionType + ): + jf = func + else: + jf = numba.jit(func, nopython=nopython, nogil=nogil) + + def impl(window, *_args): + return jf(window, *_args) + + return impl + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def roll_apply( + values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int, + ) -> np.ndarray: + result = np.empty(len(begin)) + for i in loop_range(len(result)): + start = begin[i] + stop = end[i] + window = values[start:stop] + count_nan = np.sum(np.isnan(window)) + if len(window) - count_nan >= minimum_periods: + result[i] = numba_func(window, *args) + else: + result[i] = np.nan + return result + + return roll_apply + + +def generate_numba_apply_func( + args: Tuple, + kwargs: Dict[str, Any], + func: Callable[..., Scalar], + engine_kwargs: Optional[Dict[str, bool]], +): + """ + Generate a numba jitted apply function specified by values from engine_kwargs. + + 1. jit the user's function + 2. 
Return a rolling apply function with the jitted function inline + + Configurations specified in engine_kwargs apply to both the user's + function _AND_ the rolling apply function. + + Parameters + ---------- + args : tuple + *args to be passed into the function + kwargs : dict + **kwargs to be passed into the function + func : function + function to be applied to each window and will be JITed + engine_kwargs : dict + dictionary of arguments to be passed into numba.jit + + Returns + ------- + Numba function + """ + + if engine_kwargs is None: + engine_kwargs = {} + + nopython = engine_kwargs.get("nopython", True) + nogil = engine_kwargs.get("nogil", False) + parallel = engine_kwargs.get("parallel", False) + + if kwargs and nopython: + raise ValueError( + "numba does not support kwargs with nopython=True: " + "https://github.com/numba/numba/issues/2916" + ) + + return make_rolling_apply(func, args, nogil, parallel, nopython) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 07b484321a665..5b0fbbb3518d2 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -54,6 +54,7 @@ FixedWindowIndexer, VariableWindowIndexer, ) +from pandas.core.window.numba_ import generate_numba_apply_func class _Window(PandasObject, ShallowMixin, SelectionMixin): @@ -92,6 +93,7 @@ def __init__( self.win_freq = None self.axis = obj._get_axis_number(axis) if axis is not None else None self.validate() + self._numba_func_cache: Dict[Optional[str], Callable] = dict() @property def _constructor(self): @@ -442,6 +444,7 @@ def _apply( floor: int = 1, is_weighted: bool = False, name: Optional[str] = None, + use_numba_cache: bool = False, **kwargs, ): """ @@ -454,10 +457,13 @@ def _apply( func : callable function to apply center : bool require_min_periods : int - floor: int - is_weighted - name: str, + floor : int + is_weighted : bool + name : str, compatibility with groupby.rolling + use_numba_cache : bool + whether to cache a numba compiled 
function. Only available for numba + enabled methods (so far only apply) **kwargs additional arguments for rolling function and window function @@ -532,6 +538,9 @@ def calc(x): result = calc(values) result = np.asarray(result) + if use_numba_cache: + self._numba_func_cache[name] = func + if center: result = self._center_window(result, window) @@ -1231,7 +1240,11 @@ def count(self): ---------- func : function Must produce a single value from an ndarray input if ``raw=True`` - or a single value from a Series if ``raw=False``. + or a single value from a Series if ``raw=False``. Can also accept a + Numba JIT function with ``engine='numba'`` specified. + + .. versionchanged:: 1.0.0 + raw : bool, default None * ``False`` : passes each row or column as a Series to the function. @@ -1239,9 +1252,27 @@ def count(self): objects instead. If you are just applying a NumPy reduction function this will achieve much better performance. - - *args, **kwargs - Arguments and keyword arguments to be passed into func. + engine : str, default 'cython' + * ``'cython'`` : Runs rolling apply through C-extensions from cython. + * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. + Only available when ``raw`` is set to ``True``. + + .. versionadded:: 1.0.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be + applied to both the ``func`` and the ``apply`` rolling aggregation. + + .. versionadded:: 1.0.0 + + args : tuple, default None + Positional arguments to be passed into func. + kwargs : dict, default None + Keyword arguments to be passed into func. 
Returns ------- @@ -1252,12 +1283,27 @@ def count(self): -------- Series.%(name)s : Series %(name)s. DataFrame.%(name)s : DataFrame %(name)s. + + Notes + ----- + See :ref:`stats.rolling_apply` for extended documentation and performance + considerations for the Numba engine. """ ) - def apply(self, func, raw=False, args=(), kwargs={}): - from pandas import Series - + def apply( + self, + func, + raw: bool = False, + engine: str = "cython", + engine_kwargs: Optional[Dict] = None, + args: Optional[Tuple] = None, + kwargs: Optional[Dict] = None, + ): + if args is None: + args = () + if kwargs is None: + kwargs = {} kwargs.pop("_level", None) kwargs.pop("floor", None) window = self._get_window() @@ -1265,6 +1311,38 @@ def apply(self, func, raw=False, args=(), kwargs={}): if not is_bool(raw): raise ValueError("raw parameter must be `True` or `False`") + if engine == "cython": + if engine_kwargs is not None: + raise ValueError("cython engine does not accept engine_kwargs") + apply_func = self._generate_cython_apply_func( + args, kwargs, raw, offset, func + ) + elif engine == "numba": + if raw is False: + raise ValueError("raw must be `True` when using the numba engine") + if func in self._numba_func_cache: + # Return an already compiled version of roll_apply if available + apply_func = self._numba_func_cache[func] + else: + apply_func = generate_numba_apply_func( + args, kwargs, func, engine_kwargs + ) + else: + raise ValueError("engine must be either 'numba' or 'cython'") + + # TODO: Why do we always pass center=False? 
+ # name=func for WindowGroupByMixin._apply + return self._apply( + apply_func, + center=False, + floor=0, + name=func, + use_numba_cache=engine == "numba", + ) + + def _generate_cython_apply_func(self, args, kwargs, raw, offset, func): + from pandas import Series + window_func = partial( self._get_cython_func_type("roll_generic"), args=args, @@ -1279,9 +1357,7 @@ def apply_func(values, begin, end, min_periods, raw=raw): values = Series(values, index=self.obj.index) return window_func(values, begin, end, min_periods) - # TODO: Why do we always pass center=False? - # name=func for WindowGroupByMixin._apply - return self._apply(apply_func, center=False, floor=0, name=func) + return apply_func def sum(self, *args, **kwargs): nv.validate_window_func("sum", args, kwargs) @@ -1927,8 +2003,23 @@ def count(self): @Substitution(name="rolling") @Appender(_shared_docs["apply"]) - def apply(self, func, raw=False, args=(), kwargs={}): - return super().apply(func, raw=raw, args=args, kwargs=kwargs) + def apply( + self, + func, + raw=False, + engine="cython", + engine_kwargs=None, + args=None, + kwargs=None, + ): + return super().apply( + func, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + args=args, + kwargs=kwargs, + ) @Substitution(name="rolling") @Appender(_shared_docs["sum"]) diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index 7ea4be25ca2a6..fb46ca51ace58 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -1,5 +1,7 @@ import pytest +import pandas.util._test_decorators as td + @pytest.fixture(params=[True, False]) def raw(request): @@ -47,3 +49,41 @@ def center(request): @pytest.fixture(params=[None, 1]) def min_periods(request): return request.param + + +@pytest.fixture(params=[True, False]) +def parallel(request): + """parallel keyword argument for numba.jit""" + return request.param + + +@pytest.fixture(params=[True, False]) +def nogil(request): + """nogil keyword argument for numba.jit""" + 
return request.param + + +@pytest.fixture(params=[True, False]) +def nopython(request): + """nopython keyword argument for numba.jit""" + return request.param + + +@pytest.fixture( + params=[pytest.param("numba", marks=td.skip_if_no("numba", "0.46.0")), "cython"] +) +def engine(request): + """engine keyword argument for rolling.apply""" + return request.param + + +@pytest.fixture( + params=[ + pytest.param(("numba", True), marks=td.skip_if_no("numba", "0.46.0")), + ("cython", True), + ("cython", False), + ] +) +def engine_and_raw(request): + """engine and raw keyword arguments for rolling.apply""" + return request.param diff --git a/pandas/tests/window/test_apply.py b/pandas/tests/window/test_apply.py new file mode 100644 index 0000000000000..4b56cbd48c388 --- /dev/null +++ b/pandas/tests/window/test_apply.py @@ -0,0 +1,140 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import DataFrame, Series, Timestamp, date_range +import pandas.util.testing as tm + + +@pytest.mark.parametrize("bad_raw", [None, 1, 0]) +def test_rolling_apply_invalid_raw(bad_raw): + with pytest.raises(ValueError, match="raw parameter must be `True` or `False`"): + Series(range(3)).rolling(1).apply(len, raw=bad_raw) + + +def test_rolling_apply_out_of_bounds(engine_and_raw): + # gh-1850 + engine, raw = engine_and_raw + + vals = Series([1, 2, 3, 4]) + + result = vals.rolling(10).apply(np.sum, engine=engine, raw=raw) + assert result.isna().all() + + result = vals.rolling(10, min_periods=1).apply(np.sum, engine=engine, raw=raw) + expected = Series([1, 3, 6, 10], dtype=float) + tm.assert_almost_equal(result, expected) + + +@pytest.mark.parametrize("window", [2, "2s"]) +def test_rolling_apply_with_pandas_objects(window): + # 5071 + df = DataFrame( + {"A": np.random.randn(5), "B": np.random.randint(0, 10, size=5)}, + index=date_range("20130101", periods=5, freq="s"), + ) + + # we have an equal spaced timeseries index + # so simulate removing the first 
period + def f(x): + if x.index[0] == df.index[0]: + return np.nan + return x.iloc[-1] + + result = df.rolling(window).apply(f, raw=False) + expected = df.iloc[2:].reindex_like(df) + tm.assert_frame_equal(result, expected) + + with pytest.raises(AttributeError): + df.rolling(window).apply(f, raw=True) + + +def test_rolling_apply(engine_and_raw): + engine, raw = engine_and_raw + + expected = Series([], dtype="float64") + result = expected.rolling(10).apply(lambda x: x.mean(), engine=engine, raw=raw) + tm.assert_series_equal(result, expected) + + # gh-8080 + s = Series([None, None, None]) + result = s.rolling(2, min_periods=0).apply(lambda x: len(x), engine=engine, raw=raw) + expected = Series([1.0, 2.0, 2.0]) + tm.assert_series_equal(result, expected) + + result = s.rolling(2, min_periods=0).apply(len, engine=engine, raw=raw) + tm.assert_series_equal(result, expected) + + +def test_all_apply(engine_and_raw): + engine, raw = engine_and_raw + + df = ( + DataFrame( + {"A": date_range("20130101", periods=5, freq="s"), "B": range(5)} + ).set_index("A") + * 2 + ) + er = df.rolling(window=1) + r = df.rolling(window="1s") + + result = r.apply(lambda x: 1, engine=engine, raw=raw) + expected = er.apply(lambda x: 1, engine=engine, raw=raw) + tm.assert_frame_equal(result, expected) + + +def test_ragged_apply(engine_and_raw): + engine, raw = engine_and_raw + + df = DataFrame({"B": range(5)}) + df.index = [ + Timestamp("20130101 09:00:00"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:05"), + Timestamp("20130101 09:00:06"), + ] + + f = lambda x: 1 + result = df.rolling(window="1s", min_periods=1).apply(f, engine=engine, raw=raw) + expected = df.copy() + expected["B"] = 1.0 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).apply(f, engine=engine, raw=raw) + expected = df.copy() + expected["B"] = 1.0 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", 
min_periods=1).apply(f, engine=engine, raw=raw) + expected = df.copy() + expected["B"] = 1.0 + tm.assert_frame_equal(result, expected) + + +def test_invalid_engine(): + with pytest.raises(ValueError, match="engine must be either 'numba' or 'cython'"): + Series(range(1)).rolling(1).apply(lambda x: x, engine="foo") + + +def test_invalid_engine_kwargs_cython(): + with pytest.raises(ValueError, match="cython engine does not accept engine_kwargs"): + Series(range(1)).rolling(1).apply( + lambda x: x, engine="cython", engine_kwargs={"nopython": False} + ) + + +def test_invalid_raw_numba(): + with pytest.raises( + ValueError, match="raw must be `True` when using the numba engine" + ): + Series(range(1)).rolling(1).apply(lambda x: x, raw=False, engine="numba") + + +@td.skip_if_no("numba") +def test_invalid_kwargs_nopython(): + with pytest.raises(ValueError, match="numba does not support kwargs with"): + Series(range(1)).rolling(1).apply( + lambda x: x, kwargs={"a": 1}, engine="numba", raw=True + ) diff --git a/pandas/tests/window/test_moments.py b/pandas/tests/window/test_moments.py index 7a6c64d9f9036..b1c5fc429cc03 100644 --- a/pandas/tests/window/test_moments.py +++ b/pandas/tests/window/test_moments.py @@ -674,57 +674,6 @@ def f(x): self._check_moment_func(np.mean, name="apply", func=f, raw=raw) - expected = Series([], dtype="float64") - result = expected.rolling(10).apply(lambda x: x.mean(), raw=raw) - tm.assert_series_equal(result, expected) - - # gh-8080 - s = Series([None, None, None]) - result = s.rolling(2, min_periods=0).apply(lambda x: len(x), raw=raw) - expected = Series([1.0, 2.0, 2.0]) - tm.assert_series_equal(result, expected) - - result = s.rolling(2, min_periods=0).apply(len, raw=raw) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("bad_raw", [None, 1, 0]) - def test_rolling_apply_invalid_raw(self, bad_raw): - with pytest.raises(ValueError, match="raw parameter must be `True` or `False`"): - Series(range(3)).rolling(1).apply(len, 
raw=bad_raw) - - def test_rolling_apply_out_of_bounds(self, raw): - # gh-1850 - vals = pd.Series([1, 2, 3, 4]) - - result = vals.rolling(10).apply(np.sum, raw=raw) - assert result.isna().all() - - result = vals.rolling(10, min_periods=1).apply(np.sum, raw=raw) - expected = pd.Series([1, 3, 6, 10], dtype=float) - tm.assert_almost_equal(result, expected) - - @pytest.mark.parametrize("window", [2, "2s"]) - def test_rolling_apply_with_pandas_objects(self, window): - # 5071 - df = pd.DataFrame( - {"A": np.random.randn(5), "B": np.random.randint(0, 10, size=5)}, - index=pd.date_range("20130101", periods=5, freq="s"), - ) - - # we have an equal spaced timeseries index - # so simulate removing the first period - def f(x): - if x.index[0] == df.index[0]: - return np.nan - return x.iloc[-1] - - result = df.rolling(window).apply(f, raw=False) - expected = df.iloc[2:].reindex_like(df) - tm.assert_frame_equal(result, expected) - - with pytest.raises(AttributeError): - df.rolling(window).apply(f, raw=True) - def test_rolling_std(self, raw): self._check_moment_func(lambda x: np.std(x, ddof=1), name="std", raw=raw) self._check_moment_func( diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py new file mode 100644 index 0000000000000..66e4d4e2e7145 --- /dev/null +++ b/pandas/tests/window/test_numba.py @@ -0,0 +1,72 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import Series +import pandas.util.testing as tm + + +@td.skip_if_no("numba", "0.46.0") +class TestApply: + @pytest.mark.parametrize("jit", [True, False]) + def test_numba_vs_cython(self, jit, nogil, parallel, nopython): + def f(x, *args): + arg_sum = 0 + for arg in args: + arg_sum += arg + return np.mean(x) + arg_sum + + if jit: + import numba + + f = numba.jit(f) + + engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} + args = (2,) + + s = Series(range(10)) + result = s.rolling(2).apply( + f, args=args, engine="numba", 
engine_kwargs=engine_kwargs, raw=True + ) + expected = s.rolling(2).apply(f, engine="cython", args=args, raw=True) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("jit", [True, False]) + def test_cache(self, jit, nogil, parallel, nopython): + # Test that the functions are cached correctly if we switch functions + def func_1(x): + return np.mean(x) + 4 + + def func_2(x): + return np.std(x) * 5 + + if jit: + import numba + + func_1 = numba.jit(func_1) + func_2 = numba.jit(func_2) + + engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} + + roll = Series(range(10)).rolling(2) + result = roll.apply( + func_1, engine="numba", engine_kwargs=engine_kwargs, raw=True + ) + expected = roll.apply(func_1, engine="cython", raw=True) + tm.assert_series_equal(result, expected) + + # func_1 should be in the cache now + assert func_1 in roll._numba_func_cache + + result = roll.apply( + func_2, engine="numba", engine_kwargs=engine_kwargs, raw=True + ) + expected = roll.apply(func_2, engine="cython", raw=True) + tm.assert_series_equal(result, expected) + # This run should use the cached func_1 + result = roll.apply( + func_1, engine="numba", engine_kwargs=engine_kwargs, raw=True + ) + expected = roll.apply(func_1, engine="cython", raw=True) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 46582b4b50c84..c0d47fc2ca624 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -566,26 +566,6 @@ def test_freqs_ops(self, freq, op, result_data): tm.assert_series_equal(result, expected) - def test_ragged_apply(self, raw): - - df = self.ragged - - f = lambda x: 1 - result = df.rolling(window="1s", min_periods=1).apply(f, raw=raw) - expected = df.copy() - expected["B"] = 1.0 - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="2s", min_periods=1).apply(f, raw=raw) 
- expected = df.copy() - expected["B"] = 1.0 - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="5s", min_periods=1).apply(f, raw=raw) - expected = df.copy() - expected["B"] = 1.0 - tm.assert_frame_equal(result, expected) - def test_all(self): # simple comparison of integer vs time-based windowing @@ -614,16 +594,6 @@ def test_all(self): expected = er.quantile(0.5) tm.assert_frame_equal(result, expected) - def test_all_apply(self, raw): - - df = self.regular * 2 - er = df.rolling(window=1) - r = df.rolling(window="1s") - - result = r.apply(lambda x: 1, raw=raw) - expected = er.apply(lambda x: 1, raw=raw) - tm.assert_frame_equal(result, expected) - def test_all2(self): # more sophisticated comparison of integer vs. diff --git a/requirements-dev.txt b/requirements-dev.txt index 827bb809d46e4..d2d34e504bd7f 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -50,6 +50,7 @@ jinja2 matplotlib>=2.2.2 numexpr>=2.6.8 scipy>=1.1 +numba>=0.46.0 beautifulsoup4>=4.6.0 html5lib lxml