Merge pull request #1441 from beckernick/feature/series-python-cumulative-ops

[REVIEW] Add Series level cumulative ops (sum, min, max, prod) in python layer
rapidsai · Apr 17, 2019 · d2362a9 · d2362a9
2 parents c1ac4d5 + c8076b4
commit d2362a9
Show file tree

Hide file tree

Showing 5 changed files with 235 additions and 112 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,7 @@
 - PR #1396 Add DataFrame.drop method
 - PR #1413 Add DataFrame.melt method
 - PR #1412 Add DataFrame.pop()
+- PR #1441 Add Series level cumulative ops (cumsum, cummin, cummax, cumprod)
 - PR #1440 Add DatetimeColumn.min(), DatetimeColumn.max()
 
 ## Improvements

diff --git a/python/cudf/dataframe/numerical.py b/python/cudf/dataframe/numerical.py
@@ -101,6 +101,11 @@ def unordered_compare(self, cmpop, rhs):
     def ordered_compare(self, cmpop, rhs):
         return numeric_column_compare(self, rhs, op=cmpop)
 
+    def _apply_scan_op(self, op):
+        """Run the scan ``op`` over this column and return the result.
+
+        ``op`` is forwarded unchanged to ``cpp_reduce.apply_scan``, which
+        is invoked with ``inclusive=True``.  The output column is
+        allocated with this column's dtype.
+        """
+        # NOTE(review): column_empty_like_same_mask presumably reuses this
+        # column's null mask for the output -- confirm in columnops.
+        result = columnops.column_empty_like_same_mask(self,
+                                                       dtype=self.dtype)
+        cpp_reduce.apply_scan(self, result, op, inclusive=True)
+        return result
+
     def normalize_binop_value(self, other):
         other_dtype = np.min_scalar_type(other)
         if other_dtype.kind in 'biuf':

diff --git a/python/cudf/dataframe/series.py b/python/cudf/dataframe/series.py
@@ -1113,6 +1113,43 @@ def product(self, axis=None, skipna=True, dtype=None):
         assert axis in (None, 0) and skipna is True
         return self._column.product(dtype=dtype)
 
+    def cummin(self, axis=0, skipna=True):
+        """Compute the cumulative minimum of the series
+
+        Only ``axis`` in (None, 0) together with ``skipna=True`` is
+        accepted; anything else fails the assertion.  The returned Series
+        carries this series' name and index.
+        """
+        assert axis in (None, 0) and skipna is True
+        scanned = self._column._apply_scan_op('min')
+        return Series(scanned, name=self.name, index=self.index)
+
+    def cummax(self, axis=0, skipna=True):
+        """Compute the cumulative maximum of the series
+
+        Only ``axis`` in (None, 0) together with ``skipna=True`` is
+        accepted; anything else fails the assertion.  The returned Series
+        carries this series' name and index.
+        """
+        assert axis in (None, 0) and skipna is True
+        scanned = self._column._apply_scan_op('max')
+        return Series(scanned, name=self.name, index=self.index)
+
+    def cumsum(self, axis=0, skipna=True):
+        """Compute the cumulative sum of the series
+
+        Only ``axis`` in (None, 0) together with ``skipna=True`` is
+        accepted; anything else fails the assertion.  Integer input is
+        cast to int64 before the scan; other dtypes are scanned as-is.
+        The returned Series carries this series' name and index.
+        """
+        assert axis in (None, 0) and skipna is True
+
+        # pandas always returns int64 dtype if original dtype is int, so
+        # widen integer input before scanning
+        source = self
+        if np.issubdtype(self.dtype, np.integer):
+            source = self.astype(np.int64)
+
+        scanned = source._column._apply_scan_op('sum')
+        return Series(scanned, name=self.name, index=self.index)
+
+    def cumprod(self, axis=0, skipna=True):
+        """Compute the cumulative product of the series
+
+        Only ``axis`` in (None, 0) together with ``skipna=True`` is
+        accepted; anything else fails the assertion.  Integer input is
+        cast to int64 before the scan; other dtypes are scanned as-is.
+        The returned Series carries this series' name and index.
+        """
+        assert axis in (None, 0) and skipna is True
+
+        # pandas always returns int64 dtype if original dtype is int, so
+        # widen integer input before scanning
+        source = self
+        if np.issubdtype(self.dtype, np.integer):
+            source = self.astype(np.int64)
+
+        scanned = source._column._apply_scan_op('product')
+        return Series(scanned, name=self.name, index=self.index)
+
     def mean(self, axis=None, skipna=True, dtype=None):
         """Compute the mean of the series
         """

diff --git a/python/cudf/tests/test_prefixsum.py b/python/cudf/tests/test_prefixsum.py
diff --git a/python/cudf/tests/test_scan.py b/python/cudf/tests/test_scan.py
@@ -0,0 +1,192 @@
+from itertools import product
+
+import pytest
+import numpy as np
+import pandas as pd
+
+from cudf.dataframe.dataframe import Series, DataFrame
+from cudf.tests.utils import gen_rand, assert_eq
+
+
+# candidate dtypes that _gen_params combines with params_sizes
+params_dtype = [np.int8, np.int16, np.int32, np.int64,
+                np.float32, np.float64]
+
+# candidate element counts that _gen_params combines with params_dtype
+params_sizes = [1, 2, 13, 64, 100, 1000]
+
+
+def _gen_params():
+    """Yield the (dtype, nelem) pairs used to parametrize the scan tests.
+
+    Walks the product of ``params_dtype`` and ``params_sizes`` in order,
+    dropping int8/int16 pairs with more than 20 elements.
+    """
+    for dtype, size in product(params_dtype, params_sizes):
+        narrow_int = dtype in (np.int8, np.int16)
+        # large inputs are skipped for narrow ints to keep data in range
+        if not (narrow_int and size > 20):
+            yield dtype, size
+
+
+@pytest.mark.parametrize('dtype,nelem', list(_gen_params()))
+def test_cumsum(dtype, nelem):
+    """Series.cumsum matches pandas for each generated dtype/size pair."""
+    # int8 input is drawn from [-2, 2) to keep data in range
+    rand_kwargs = {'low': -2, 'high': 2} if dtype == np.int8 else {}
+    data = gen_rand(dtype, nelem, **rand_kwargs)
+
+    # float32 results are compared to fewer decimal places
+    places = 4 if dtype == np.float32 else 6
+
+    # series
+    got = Series(data).cumsum()
+    want = pd.Series(data).cumsum()
+    np.testing.assert_array_almost_equal(got, want, decimal=places)
+
+    # dataframe series (named series)
+    gdf = DataFrame()
+    gdf['a'] = Series(data)
+    pdf = pd.DataFrame()
+    pdf['a'] = pd.Series(data)
+    np.testing.assert_array_almost_equal(gdf.a.cumsum(), pdf.a.cumsum(),
+                                         decimal=places)
+
+
+def test_cumsum_masked():
+    """cumsum on data containing a None, for float and integer dtypes."""
+    data = [1, 2, None, 4, 5]
+
+    # floats: compare directly against the pandas result
+    for float_type in ('float32', 'float64'):
+        got = Series(data).astype(float_type).cumsum()
+        want = pd.Series(data).astype(float_type).cumsum()
+        assert_eq(got, want)
+
+    # ints: the result is expected as int64 with -1 in the None slot
+    # NOTE(review): -1 presumably reflects how nulls in integer columns
+    # are materialised for comparison -- confirm in cudf.tests.utils.
+    for int_type in ('int8', 'int16', 'int32', 'int64'):
+        want = pd.Series([1, 3, -1, 7, 12]).astype('int64')
+        got = Series(data).astype(int_type).cumsum()
+        assert_eq(got, want)
+
+
+@pytest.mark.parametrize('dtype,nelem', list(_gen_params()))
+def test_cummin(dtype, nelem):
+    """Series.cummin matches pandas for each generated dtype/size pair."""
+    # int8 input is drawn from [-2, 2) to keep data in range
+    rand_kwargs = {'low': -2, 'high': 2} if dtype == np.int8 else {}
+    data = gen_rand(dtype, nelem, **rand_kwargs)
+
+    # float32 results are compared to fewer decimal places
+    places = 4 if dtype == np.float32 else 6
+
+    # series
+    got = Series(data).cummin()
+    want = pd.Series(data).cummin()
+    np.testing.assert_array_almost_equal(got, want, decimal=places)
+
+    # dataframe series (named series)
+    gdf = DataFrame()
+    gdf['a'] = Series(data)
+    pdf = pd.DataFrame()
+    pdf['a'] = pd.Series(data)
+    np.testing.assert_array_almost_equal(gdf.a.cummin(), pdf.a.cummin(),
+                                         decimal=places)
+
+
+def test_cummin_masked():
+    """cummin on data containing a None, for float and integer dtypes."""
+    data = [1, 2, None, 4, 5]
+
+    # floats: compare directly against the pandas result
+    for float_type in ('float32', 'float64'):
+        got = Series(data).astype(float_type).cummin()
+        want = pd.Series(data).astype(float_type).cummin()
+        assert_eq(got, want)
+
+    # ints: the result keeps the input dtype, with -1 in the None slot
+    # NOTE(review): -1 presumably reflects how nulls in integer columns
+    # are materialised for comparison -- confirm in cudf.tests.utils.
+    for int_type in ('int8', 'int16', 'int32', 'int64'):
+        want = pd.Series([1, 1, -1, 1, 1]).astype(int_type)
+        got = Series(data).astype(int_type).cummin()
+        assert_eq(got, want)
+
+
+@pytest.mark.parametrize('dtype,nelem', list(_gen_params()))
+def test_cummax(dtype, nelem):
+    """Series.cummax matches pandas for each generated dtype/size pair."""
+    # int8 input is drawn from [-2, 2) to keep data in range
+    rand_kwargs = {'low': -2, 'high': 2} if dtype == np.int8 else {}
+    data = gen_rand(dtype, nelem, **rand_kwargs)
+
+    # float32 results are compared to fewer decimal places
+    places = 4 if dtype == np.float32 else 6
+
+    # series
+    got = Series(data).cummax()
+    want = pd.Series(data).cummax()
+    np.testing.assert_array_almost_equal(got, want, decimal=places)
+
+    # dataframe series (named series)
+    gdf = DataFrame()
+    gdf['a'] = Series(data)
+    pdf = pd.DataFrame()
+    pdf['a'] = pd.Series(data)
+    np.testing.assert_array_almost_equal(gdf.a.cummax(), pdf.a.cummax(),
+                                         decimal=places)
+
+
+def test_cummax_masked():
+    """cummax on data containing a None, for float and integer dtypes."""
+    data = [1, 2, None, 4, 5]
+
+    # floats: compare directly against the pandas result
+    for float_type in ('float32', 'float64'):
+        got = Series(data).astype(float_type).cummax()
+        want = pd.Series(data).astype(float_type).cummax()
+        assert_eq(got, want)
+
+    # ints: the result keeps the input dtype, with -1 in the None slot
+    # NOTE(review): -1 presumably reflects how nulls in integer columns
+    # are materialised for comparison -- confirm in cudf.tests.utils.
+    for int_type in ('int8', 'int16', 'int32', 'int64'):
+        want = pd.Series([1, 2, -1, 4, 5]).astype(int_type)
+        got = Series(data).astype(int_type).cummax()
+        assert_eq(got, want)
+
+
+@pytest.mark.parametrize('dtype,nelem', list(_gen_params()))
+def test_cumprod(dtype, nelem):
+    """Series.cumprod matches pandas for each generated dtype/size pair."""
+    # int8 input is drawn from [-2, 2) to keep data in range
+    rand_kwargs = {'low': -2, 'high': 2} if dtype == np.int8 else {}
+    data = gen_rand(dtype, nelem, **rand_kwargs)
+
+    # float32 results are compared to fewer decimal places
+    places = 4 if dtype == np.float32 else 6
+
+    # series
+    got = Series(data).cumprod()
+    want = pd.Series(data).cumprod()
+    np.testing.assert_array_almost_equal(got, want, decimal=places)
+
+    # dataframe series (named series)
+    gdf = DataFrame()
+    gdf['a'] = Series(data)
+    pdf = pd.DataFrame()
+    pdf['a'] = pd.Series(data)
+    np.testing.assert_array_almost_equal(gdf.a.cumprod(), pdf.a.cumprod(),
+                                         decimal=places)
+
+
+def test_cumprod_masked():
+    """cumprod on data containing a None, for float and integer dtypes."""
+    data = [1, 2, None, 4, 5]
+
+    # floats: compare directly against the pandas result
+    for float_type in ('float32', 'float64'):
+        got = Series(data).astype(float_type).cumprod()
+        want = pd.Series(data).astype(float_type).cumprod()
+        assert_eq(got, want)
+
+    # ints: the result is expected as int64 with -1 in the None slot
+    # NOTE(review): -1 presumably reflects how nulls in integer columns
+    # are materialised for comparison -- confirm in cudf.tests.utils.
+    for int_type in ('int8', 'int16', 'int32', 'int64'):
+        want = pd.Series([1, 2, -1, 8, 40]).astype('int64')
+        got = Series(data).astype(int_type).cumprod()
+        assert_eq(got, want)