Skip to content

Commit

Permalink
Merge pull request #1441 from beckernick/feature/series-python-cumulative-ops
Browse files Browse the repository at this point in the history

[REVIEW] Add Series level cumulative ops (sum, min, max, prod) in python layer
  • Loading branch information
kkraus14 authored Apr 17, 2019
2 parents c1ac4d5 + c8076b4 commit d2362a9
Show file tree
Hide file tree
Showing 5 changed files with 235 additions and 112 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
- PR #1396 Add DataFrame.drop method
- PR #1413 Add DataFrame.melt method
- PR #1412 Add DataFrame.pop()
- PR #1441 Add Series level cumulative ops (cumsum, cummin, cummax, cumprod)
- PR #1440 Add DatetimeColumn.min(), DatetimeColumn.max()

## Improvements
Expand Down
5 changes: 5 additions & 0 deletions python/cudf/dataframe/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,11 @@ def unordered_compare(self, cmpop, rhs):
def ordered_compare(self, cmpop, rhs):
    """Delegate an ordered comparison against ``rhs``.

    The call is forwarded to ``numeric_column_compare`` with ``cmpop``
    passed as its ``op`` argument; the helper's result is returned as is.
    """
    compared = numeric_column_compare(self, rhs, op=cmpop)
    return compared

def _apply_scan_op(self, op):
    """Run an inclusive scan over this column and return the result column.

    Parameters
    ----------
    op
        Scan operation identifier, forwarded unchanged to
        ``cpp_reduce.apply_scan``.
        NOTE(review): presumably one of 'sum', 'min', 'max', 'product' --
        confirm against cpp_reduce.

    Returns
    -------
    The freshly allocated output column that ``apply_scan`` filled in.
    """
    # The output buffer keeps this column's dtype; judging by the helper's
    # name it also shares this column's mask -- confirm in columnops.
    scanned = columnops.column_empty_like_same_mask(self, dtype=self.dtype)
    cpp_reduce.apply_scan(self, scanned, op, inclusive=True)
    return scanned

def normalize_binop_value(self, other):
other_dtype = np.min_scalar_type(other)
if other_dtype.kind in 'biuf':
Expand Down
37 changes: 37 additions & 0 deletions python/cudf/dataframe/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1113,6 +1113,43 @@ def product(self, axis=None, skipna=True, dtype=None):
assert axis in (None, 0) and skipna is True
return self._column.product(dtype=dtype)

def cummin(self, axis=0, skipna=True):
    """Compute the cumulative minimum of the series.

    Only ``axis`` in ``(None, 0)`` together with ``skipna=True`` is
    accepted; anything else trips the assertion below.

    Returns a new Series carrying this series' name and index.
    """
    assert axis in (None, 0) and skipna is True
    scanned = self._column._apply_scan_op('min')
    return Series(scanned, name=self.name, index=self.index)

def cummax(self, axis=0, skipna=True):
    """Compute the cumulative maximum of the series.

    Only ``axis`` in ``(None, 0)`` together with ``skipna=True`` is
    accepted; anything else trips the assertion below.

    Returns a new Series carrying this series' name and index.
    """
    assert axis in (None, 0) and skipna is True
    scanned = self._column._apply_scan_op('max')
    return Series(scanned, name=self.name, index=self.index)

def cumsum(self, axis=0, skipna=True):
    """Compute the cumulative sum of the series.

    Only ``axis`` in ``(None, 0)`` together with ``skipna=True`` is
    accepted; anything else trips the assertion below.

    Integer input is cast to int64 before scanning; every other dtype is
    scanned as is. Returns a new Series carrying this series' name and
    index.
    """
    assert axis in (None, 0) and skipna is True

    source = self
    if np.issubdtype(self.dtype, np.integer):
        # pandas always returns int64 dtype if original dtype is int
        source = self.astype(np.int64)

    scanned = source._column._apply_scan_op('sum')
    return Series(scanned, name=self.name, index=self.index)

def cumprod(self, axis=0, skipna=True):
    """Compute the cumulative product of the series.

    Only ``axis`` in ``(None, 0)`` together with ``skipna=True`` is
    accepted; anything else trips the assertion below.

    Integer input is cast to int64 before scanning; every other dtype is
    scanned as is. Returns a new Series carrying this series' name and
    index.
    """
    assert axis in (None, 0) and skipna is True

    source = self
    if np.issubdtype(self.dtype, np.integer):
        # pandas always returns int64 dtype if original dtype is int
        source = self.astype(np.int64)

    scanned = source._column._apply_scan_op('product')
    return Series(scanned, name=self.name, index=self.index)

def mean(self, axis=None, skipna=True, dtype=None):
"""Compute the mean of the series
"""
Expand Down
112 changes: 0 additions & 112 deletions python/cudf/tests/test_prefixsum.py

This file was deleted.

192 changes: 192 additions & 0 deletions python/cudf/tests/test_scan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
from itertools import product

import pytest
import numpy as np
import pandas as pd

from cudf.dataframe.dataframe import Series, DataFrame
from cudf.tests.utils import gen_rand, assert_eq


# Element dtypes fed to the parametrized scan tests.
params_dtype = [
    np.int8, np.int16, np.int32, np.int64,
    np.float32, np.float64,
]

# Series lengths fed to the parametrized scan tests.
params_sizes = [1, 2, 13, 64, 100, 1000]


def _gen_params():
    """Yield every (dtype, nelem) pair used by the parametrized tests.

    Pairs come from the product of ``params_dtype`` and ``params_sizes``;
    int8 and int16 are only paired with sizes of at most 20 elements.
    """
    narrow_ints = (np.int8, np.int16)
    for dtype, nelem in product(params_dtype, params_sizes):
        # to keep data in range
        if nelem <= 20 or dtype not in narrow_ints:
            yield dtype, nelem


@pytest.mark.parametrize('dtype,nelem', list(_gen_params()))
def test_cumsum(dtype, nelem):
    """Compare Series.cumsum with pandas on random data."""
    # to keep data in range, int8 input is drawn from [-2, 2)-style bounds
    rand_kwargs = {'low': -2, 'high': 2} if dtype == np.int8 else {}
    data = gen_rand(dtype, nelem, **rand_kwargs)

    # float32 is compared with a looser tolerance than the other dtypes
    if dtype == np.float32:
        places = 4
    else:
        places = 6

    # series
    got = Series(data).cumsum()
    expect = pd.Series(data).cumsum()
    np.testing.assert_array_almost_equal(got, expect, decimal=places)

    # dataframe series (named series)
    gdf = DataFrame()
    gdf['a'] = Series(data)
    pdf = pd.DataFrame()
    pdf['a'] = pd.Series(data)
    got = gdf.a.cumsum()
    expect = pdf.a.cumsum()
    np.testing.assert_array_almost_equal(got, expect, decimal=places)


def test_cumsum_masked():
    """Check cumsum on data holding a null, for float and int dtypes."""
    data = [1, 2, None, 4, 5]

    # Float dtypes are compared directly against pandas.
    for float_dtype in ('float32', 'float64'):
        got = Series(data).astype(float_dtype).cumsum()
        expect = pd.Series(data).astype(float_dtype).cumsum()
        assert_eq(got, expect)

    # Integer dtypes are compared against a hand-written int64 result that
    # carries -1 in the null slot.
    # NOTE(review): presumably -1 is how assert_eq sees integer nulls --
    # confirm against cudf.tests.utils.
    for int_dtype in ('int8', 'int16', 'int32', 'int64'):
        expect = pd.Series([1, 3, -1, 7, 12]).astype('int64')
        got = Series(data).astype(int_dtype).cumsum()
        assert_eq(got, expect)


@pytest.mark.parametrize('dtype,nelem', list(_gen_params()))
def test_cummin(dtype, nelem):
    """Compare Series.cummin with pandas on random data."""
    # to keep data in range, int8 input is drawn with explicit bounds
    rand_kwargs = {'low': -2, 'high': 2} if dtype == np.int8 else {}
    data = gen_rand(dtype, nelem, **rand_kwargs)

    # float32 is compared with a looser tolerance than the other dtypes
    if dtype == np.float32:
        places = 4
    else:
        places = 6

    # series
    got = Series(data).cummin()
    expect = pd.Series(data).cummin()
    np.testing.assert_array_almost_equal(got, expect, decimal=places)

    # dataframe series (named series)
    gdf = DataFrame()
    gdf['a'] = Series(data)
    pdf = pd.DataFrame()
    pdf['a'] = pd.Series(data)
    got = gdf.a.cummin()
    expect = pdf.a.cummin()
    np.testing.assert_array_almost_equal(got, expect, decimal=places)


def test_cummin_masked():
    """Check cummin on data holding a null, for float and int dtypes."""
    data = [1, 2, None, 4, 5]

    # Float dtypes are compared directly against pandas.
    for float_dtype in ('float32', 'float64'):
        got = Series(data).astype(float_dtype).cummin()
        expect = pd.Series(data).astype(float_dtype).cummin()
        assert_eq(got, expect)

    # Integer dtypes are compared against a hand-written result in the same
    # dtype, carrying -1 in the null slot.
    # NOTE(review): presumably -1 is how assert_eq sees integer nulls --
    # confirm against cudf.tests.utils.
    for int_dtype in ('int8', 'int16', 'int32', 'int64'):
        expect = pd.Series([1, 1, -1, 1, 1]).astype(int_dtype)
        got = Series(data).astype(int_dtype).cummin()
        assert_eq(got, expect)


@pytest.mark.parametrize('dtype,nelem', list(_gen_params()))
def test_cummax(dtype, nelem):
    """Compare Series.cummax with pandas on random data."""
    # to keep data in range, int8 input is drawn with explicit bounds
    rand_kwargs = {'low': -2, 'high': 2} if dtype == np.int8 else {}
    data = gen_rand(dtype, nelem, **rand_kwargs)

    # float32 is compared with a looser tolerance than the other dtypes
    if dtype == np.float32:
        places = 4
    else:
        places = 6

    # series
    got = Series(data).cummax()
    expect = pd.Series(data).cummax()
    np.testing.assert_array_almost_equal(got, expect, decimal=places)

    # dataframe series (named series)
    gdf = DataFrame()
    gdf['a'] = Series(data)
    pdf = pd.DataFrame()
    pdf['a'] = pd.Series(data)
    got = gdf.a.cummax()
    expect = pdf.a.cummax()
    np.testing.assert_array_almost_equal(got, expect, decimal=places)


def test_cummax_masked():
    """Check cummax on data holding a null, for float and int dtypes."""
    data = [1, 2, None, 4, 5]

    # Float dtypes are compared directly against pandas.
    for float_dtype in ('float32', 'float64'):
        got = Series(data).astype(float_dtype).cummax()
        expect = pd.Series(data).astype(float_dtype).cummax()
        assert_eq(got, expect)

    # Integer dtypes are compared against a hand-written result in the same
    # dtype, carrying -1 in the null slot.
    # NOTE(review): presumably -1 is how assert_eq sees integer nulls --
    # confirm against cudf.tests.utils.
    for int_dtype in ('int8', 'int16', 'int32', 'int64'):
        expect = pd.Series([1, 2, -1, 4, 5]).astype(int_dtype)
        got = Series(data).astype(int_dtype).cummax()
        assert_eq(got, expect)


@pytest.mark.parametrize('dtype,nelem', list(_gen_params()))
def test_cumprod(dtype, nelem):
    """Compare Series.cumprod with pandas on random data."""
    # to keep data in range, int8 input is drawn with explicit bounds
    rand_kwargs = {'low': -2, 'high': 2} if dtype == np.int8 else {}
    data = gen_rand(dtype, nelem, **rand_kwargs)

    # float32 is compared with a looser tolerance than the other dtypes
    if dtype == np.float32:
        places = 4
    else:
        places = 6

    # series
    got = Series(data).cumprod()
    expect = pd.Series(data).cumprod()
    np.testing.assert_array_almost_equal(got, expect, decimal=places)

    # dataframe series (named series)
    gdf = DataFrame()
    gdf['a'] = Series(data)
    pdf = pd.DataFrame()
    pdf['a'] = pd.Series(data)
    got = gdf.a.cumprod()
    expect = pdf.a.cumprod()
    np.testing.assert_array_almost_equal(got, expect, decimal=places)


def test_cumprod_masked():
    """Check cumprod on data holding a null, for float and int dtypes."""
    data = [1, 2, None, 4, 5]

    # Float dtypes are compared directly against pandas.
    for float_dtype in ('float32', 'float64'):
        got = Series(data).astype(float_dtype).cumprod()
        expect = pd.Series(data).astype(float_dtype).cumprod()
        assert_eq(got, expect)

    # Integer dtypes are compared against a hand-written int64 result that
    # carries -1 in the null slot.
    # NOTE(review): presumably -1 is how assert_eq sees integer nulls --
    # confirm against cudf.tests.utils.
    for int_dtype in ('int8', 'int16', 'int32', 'int64'):
        expect = pd.Series([1, 2, -1, 8, 40]).astype('int64')
        got = Series(data).astype(int_dtype).cumprod()
        assert_eq(got, expect)

0 comments on commit d2362a9

Please sign in to comment.