Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Add Series level cumulative ops (sum, min, max, prod) in python layer #1441

Merged
merged 15 commits into from
Apr 17, 2019
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
- PR #1396 Add DataFrame.drop method
- PR #1413 Add DataFrame.melt method
- PR #1412 Add DataFrame.pop()
- PR #1441 Add Series level cumulative ops (cumsum, cummin, cummax, cumprod)
- PR #1440 Add DatetimeColumn.min(), DatetimeColumn.max()

## Improvements
Expand Down
5 changes: 5 additions & 0 deletions python/cudf/dataframe/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,11 @@ def unordered_compare(self, cmpop, rhs):
def ordered_compare(self, cmpop, rhs):
    """Run the ordered comparison ``cmpop`` between this column and ``rhs``.

    The work is delegated to ``numeric_column_compare``; ``cmpop`` is
    forwarded as its ``op`` keyword and the helper's result is returned
    unchanged.
    """
    compared = numeric_column_compare(self, rhs, op=cmpop)
    return compared

def _apply_scan_op(self, op):
    """Run the inclusive scan ``op`` over this column and return the result.

    Parameters
    ----------
    op : str
        Name of the scan handed to ``cpp_reduce.apply_scan``; callers in
        this change pass 'min', 'max', 'sum' and 'product'.

    Returns
    -------
    The freshly allocated output column, which has this column's dtype.
    """
    # Destination buffer for the scan, allocated with this column's dtype.
    # NOTE(review): going by the helper's name it reuses this column's
    # null mask -- confirm against columnops.
    scanned = columnops.column_empty_like_same_mask(self, dtype=self.dtype)
    # The scan writes into ``scanned``; inclusive=True is always requested.
    cpp_reduce.apply_scan(self, scanned, op, inclusive=True)
    return scanned

def normalize_binop_value(self, other):
other_dtype = np.min_scalar_type(other)
if other_dtype.kind in 'biuf':
Expand Down
37 changes: 37 additions & 0 deletions python/cudf/dataframe/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1113,6 +1113,43 @@ def product(self, axis=None, skipna=True, dtype=None):
assert axis in (None, 0) and skipna is True
return self._column.product(dtype=dtype)

def cummin(self, axis=0, skipna=True):
    """Return a new Series holding the running minimum of this series.

    Only ``axis`` in (None, 0) and ``skipna=True`` are accepted; anything
    else trips the assertion. The result keeps this series' name and index.
    """
    assert axis in (None, 0) and skipna is True
    scanned = self._column._apply_scan_op('min')
    return Series(scanned, name=self.name, index=self.index)
kkraus14 marked this conversation as resolved.
Show resolved Hide resolved

def cummax(self, axis=0, skipna=True):
    """Return a new Series holding the running maximum of this series.

    Only ``axis`` in (None, 0) and ``skipna=True`` are accepted; anything
    else trips the assertion. The result keeps this series' name and index.
    """
    assert axis in (None, 0) and skipna is True
    scanned = self._column._apply_scan_op('max')
    return Series(scanned, name=self.name, index=self.index)
kkraus14 marked this conversation as resolved.
Show resolved Hide resolved

def cumsum(self, axis=0, skipna=True):
    """Return a new Series holding the running sum of this series.

    Integer input is cast to int64 before scanning; every other dtype is
    scanned as-is. Only ``axis`` in (None, 0) and ``skipna=True`` are
    accepted. The result keeps this series' name and index.
    """
    assert axis in (None, 0) and skipna is True

    # Integer columns are widened first: per the original author's note,
    # pandas always returns int64 when the input dtype is an integer.
    if np.issubdtype(self.dtype, np.integer):
        source = self.astype(np.int64)._column
    else:
        source = self._column

    scanned = source._apply_scan_op('sum')
    return Series(scanned, name=self.name, index=self.index)

def cumprod(self, axis=0, skipna=True):
    """Compute the cumulative product of the series

    Integer input is cast to int64 before scanning; every other dtype is
    scanned as-is. Only ``axis`` in (None, 0) and ``skipna=True`` are
    accepted. The result keeps this series' name and index.
    """
    assert axis in (None, 0) and skipna is True

    # pandas always returns int64 dtype if original dtype is int
    if np.issubdtype(self.dtype, np.integer):
        column = self.astype(np.int64)._column
    else:
        column = self._column

    return Series(column._apply_scan_op('product'), name=self.name,
                  index=self.index)

def mean(self, axis=None, skipna=True, dtype=None):
"""Compute the mean of the series
"""
Expand Down
112 changes: 0 additions & 112 deletions python/cudf/tests/test_prefixsum.py

This file was deleted.

192 changes: 192 additions & 0 deletions python/cudf/tests/test_scan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
from itertools import product

import pytest
import numpy as np
import pandas as pd

from cudf.dataframe.dataframe import Series, DataFrame
from cudf.tests.utils import gen_rand, assert_eq


# Element dtypes fed to _gen_params: the signed integer widths first,
# followed by the two floating-point widths.
params_dtype = [
    np.int8, np.int16, np.int32, np.int64,
    np.float32, np.float64,
]

# Series lengths fed to _gen_params.
params_sizes = [1, 2, 13, 64, 100, 1000]


def _gen_params():
    """Yield the ``(dtype, nelem)`` pairs used to parametrize scan tests.

    Walks the cartesian product of ``params_dtype`` and ``params_sizes``
    in order, dropping int8/int16 combinations with more than 20 elements.
    """
    for dtype, nelem in product(params_dtype, params_sizes):
        is_narrow_int = dtype == np.int8 or dtype == np.int16
        # to keep data in range, narrow integers only get short inputs
        if not (is_narrow_int and nelem > 20):
            yield dtype, nelem


@pytest.mark.parametrize('dtype,nelem', list(_gen_params()))
def test_cumsum(dtype, nelem):
    """Check that ``cumsum`` on random data matches pandas.

    The comparison is made once for a bare Series and once for a Series
    stored as column 'a' of a DataFrame.
    """
    # to keep data in range, int8 input is drawn from a narrow interval
    rand_kwargs = {'low': -2, 'high': 2} if dtype == np.int8 else {}
    data = gen_rand(dtype, nelem, **rand_kwargs)

    # float32 is compared with a looser tolerance than the other dtypes
    if dtype == np.float32:
        decimal = 4
    else:
        decimal = 6

    # series
    got = Series(data).cumsum()
    expect = pd.Series(data).cumsum()
    np.testing.assert_array_almost_equal(got, expect, decimal=decimal)

    # dataframe series (named series)
    gdf = DataFrame()
    gdf['a'] = Series(data)
    pdf = pd.DataFrame()
    pdf['a'] = pd.Series(data)
    got = gdf.a.cumsum()
    expect = pdf.a.cumsum()
    np.testing.assert_array_almost_equal(got, expect, decimal=decimal)


def test_cumsum_masked():
    """Check ``cumsum`` on a series containing one missing value.

    Float dtypes are compared against pandas directly. Integer dtypes are
    compared against a hand-written int64 expectation.
    """
    data = [1, 2, None, 4, 5]

    for type_ in ('float32', 'float64'):
        got = Series(data).astype(type_).cumsum()
        expect = pd.Series(data).astype(type_).cumsum()
        assert_eq(got, expect)

    for type_ in ('int8', 'int16', 'int32', 'int64'):
        # The missing slot is expected to show up as -1.
        # NOTE(review): presumably that is how a null in an integer column
        # is rendered for comparison -- confirm against assert_eq.
        expected = pd.Series([1, 3, -1, 7, 12]).astype('int64')
        got = Series(data).astype(type_).cumsum()
        assert_eq(got, expected)


@pytest.mark.parametrize('dtype,nelem', list(_gen_params()))
def test_cummin(dtype, nelem):
    """Check that ``cummin`` on random data matches pandas.

    The comparison is made once for a bare Series and once for a Series
    stored as column 'a' of a DataFrame.
    """
    # to keep data in range, int8 input is drawn from a narrow interval
    rand_kwargs = {'low': -2, 'high': 2} if dtype == np.int8 else {}
    data = gen_rand(dtype, nelem, **rand_kwargs)

    # float32 is compared with a looser tolerance than the other dtypes
    if dtype == np.float32:
        decimal = 4
    else:
        decimal = 6

    # series
    got = Series(data).cummin()
    expect = pd.Series(data).cummin()
    np.testing.assert_array_almost_equal(got, expect, decimal=decimal)

    # dataframe series (named series)
    gdf = DataFrame()
    gdf['a'] = Series(data)
    pdf = pd.DataFrame()
    pdf['a'] = pd.Series(data)
    got = gdf.a.cummin()
    expect = pdf.a.cummin()
    np.testing.assert_array_almost_equal(got, expect, decimal=decimal)


def test_cummin_masked():
    """Check ``cummin`` on a series containing one missing value.

    Float dtypes are compared against pandas directly. Integer dtypes are
    compared against a hand-written expectation cast to the same dtype.
    """
    data = [1, 2, None, 4, 5]

    for type_ in ('float32', 'float64'):
        got = Series(data).astype(type_).cummin()
        expect = pd.Series(data).astype(type_).cummin()
        assert_eq(got, expect)

    for type_ in ('int8', 'int16', 'int32', 'int64'):
        # The missing slot is expected to show up as -1.
        # NOTE(review): presumably that is how a null in an integer column
        # is rendered for comparison -- confirm against assert_eq.
        expected = pd.Series([1, 1, -1, 1, 1]).astype(type_)
        got = Series(data).astype(type_).cummin()
        assert_eq(got, expected)


@pytest.mark.parametrize('dtype,nelem', list(_gen_params()))
def test_cummax(dtype, nelem):
    """Check that ``cummax`` on random data matches pandas.

    The comparison is made once for a bare Series and once for a Series
    stored as column 'a' of a DataFrame.
    """
    # to keep data in range, int8 input is drawn from a narrow interval
    rand_kwargs = {'low': -2, 'high': 2} if dtype == np.int8 else {}
    data = gen_rand(dtype, nelem, **rand_kwargs)

    # float32 is compared with a looser tolerance than the other dtypes
    if dtype == np.float32:
        decimal = 4
    else:
        decimal = 6

    # series
    got = Series(data).cummax()
    expect = pd.Series(data).cummax()
    np.testing.assert_array_almost_equal(got, expect, decimal=decimal)

    # dataframe series (named series)
    gdf = DataFrame()
    gdf['a'] = Series(data)
    pdf = pd.DataFrame()
    pdf['a'] = pd.Series(data)
    got = gdf.a.cummax()
    expect = pdf.a.cummax()
    np.testing.assert_array_almost_equal(got, expect, decimal=decimal)


def test_cummax_masked():
    """Check ``cummax`` on a series containing one missing value.

    Float dtypes are compared against pandas directly. Integer dtypes are
    compared against a hand-written expectation cast to the same dtype.
    """
    data = [1, 2, None, 4, 5]

    for type_ in ('float32', 'float64'):
        got = Series(data).astype(type_).cummax()
        expect = pd.Series(data).astype(type_).cummax()
        assert_eq(got, expect)

    for type_ in ('int8', 'int16', 'int32', 'int64'):
        # The missing slot is expected to show up as -1.
        # NOTE(review): presumably that is how a null in an integer column
        # is rendered for comparison -- confirm against assert_eq.
        expected = pd.Series([1, 2, -1, 4, 5]).astype(type_)
        got = Series(data).astype(type_).cummax()
        assert_eq(got, expected)


@pytest.mark.parametrize('dtype,nelem', list(_gen_params()))
def test_cumprod(dtype, nelem):
    """Check that ``cumprod`` on random data matches pandas.

    The comparison is made once for a bare Series and once for a Series
    stored as column 'a' of a DataFrame.
    """
    # to keep data in range, int8 input is drawn from a narrow interval
    rand_kwargs = {'low': -2, 'high': 2} if dtype == np.int8 else {}
    data = gen_rand(dtype, nelem, **rand_kwargs)

    # float32 is compared with a looser tolerance than the other dtypes
    if dtype == np.float32:
        decimal = 4
    else:
        decimal = 6

    # series
    got = Series(data).cumprod()
    expect = pd.Series(data).cumprod()
    np.testing.assert_array_almost_equal(got, expect, decimal=decimal)

    # dataframe series (named series)
    gdf = DataFrame()
    gdf['a'] = Series(data)
    pdf = pd.DataFrame()
    pdf['a'] = pd.Series(data)
    got = gdf.a.cumprod()
    expect = pdf.a.cumprod()
    np.testing.assert_array_almost_equal(got, expect, decimal=decimal)


def test_cumprod_masked():
    """Check ``cumprod`` on a series containing one missing value.

    Float dtypes are compared against pandas directly. Integer dtypes are
    compared against a hand-written int64 expectation.
    """
    data = [1, 2, None, 4, 5]

    for type_ in ('float32', 'float64'):
        got = Series(data).astype(type_).cumprod()
        expect = pd.Series(data).astype(type_).cumprod()
        assert_eq(got, expect)

    for type_ in ('int8', 'int16', 'int32', 'int64'):
        # The missing slot is expected to show up as -1.
        # NOTE(review): presumably that is how a null in an integer column
        # is rendered for comparison -- confirm against assert_eq.
        expected = pd.Series([1, 2, -1, 8, 40]).astype('int64')
        got = Series(data).astype(type_).cumprod()
        assert_eq(got, expected)