Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Add Series level cumulative ops (sum, min, max, prod) in python layer #1441

Merged
merged 15 commits into from
Apr 17, 2019
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions python/cudf/dataframe/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,11 @@ def unordered_compare(self, cmpop, rhs):
def ordered_compare(self, cmpop, rhs):
    """Apply the ordered comparison ``cmpop`` between this column and ``rhs``.

    The work is delegated to ``numeric_column_compare``; whatever it
    produces is returned unchanged.
    """
    compared = numeric_column_compare(self, rhs, op=cmpop)
    return compared

def _apply_scan_op(self, op):
    """Run the scan operation ``op`` over this column and return the result.

    An output column is obtained from
    ``columnops.column_empty_like_same_mask`` using this column's dtype,
    ``cpp_reduce.apply_scan`` is invoked on it with ``inclusive=True``,
    and that output column is returned.
    """
    own_dtype = self.dtype
    scanned = columnops.column_empty_like_same_mask(self, dtype=own_dtype)
    cpp_reduce.apply_scan(self, scanned, op, inclusive=True)
    return scanned

def normalize_binop_value(self, other):
other_dtype = np.min_scalar_type(other)
if other_dtype.kind in 'biuf':
Expand Down
34 changes: 34 additions & 0 deletions python/cudf/dataframe/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1113,6 +1113,40 @@ def product(self, axis=None, skipna=True, dtype=None):
assert axis in (None, 0) and skipna is True
return self._column.product(dtype=dtype)

def cummin(self, axis=0, skipna=True):
    """Return a new Series holding the running minimum of this series.

    Only ``axis`` of ``None`` or ``0`` together with ``skipna=True`` is
    accepted; any other combination fails the assertion below. The
    result carries the same name as this series.
    """
    assert axis in (None, 0) and skipna is True
    scanned = self._column._apply_scan_op('min')
    return Series(scanned, name=self.name)
beckernick marked this conversation as resolved.
Show resolved Hide resolved

def cummax(self, axis=0, skipna=True):
    """Return a new Series holding the running maximum of this series.

    Only ``axis`` of ``None`` or ``0`` together with ``skipna=True`` is
    accepted; any other combination fails the assertion below. The
    result carries the same name as this series.
    """
    assert axis in (None, 0) and skipna is True
    scanned = self._column._apply_scan_op('max')
    return Series(scanned, name=self.name)

def cumsum(self, axis=0, skipna=True):
    """Return a new Series holding the running sum of this series.

    Only ``axis`` of ``None`` or ``0`` together with ``skipna=True`` is
    accepted; any other combination fails the assertion below. The
    result carries the same name as this series.
    """
    assert axis in (None, 0) and skipna is True

    # Integer input is cast to int64 before scanning, because pandas
    # always returns int64 when the original dtype is an integer type.
    source = self
    if np.issubdtype(self.dtype, np.integer):
        source = self.astype(np.int64)

    scanned = source._column._apply_scan_op('sum')
    return Series(scanned, name=self.name)

def cumprod(self, axis=0, skipna=True):
    """Compute the cumulative product of the series.

    Only ``axis`` of ``None`` or ``0`` together with ``skipna=True`` is
    accepted; any other combination fails the assertion below. The
    result is a new Series carrying the same name as this series.
    """
    assert axis in (None, 0) and skipna is True

    # pandas always returns int64 dtype if original dtype is int, so
    # integer input is cast to int64 before the scan is applied.
    if np.issubdtype(self.dtype, np.integer):
        source = self.astype(np.int64)
    else:
        source = self

    scanned = source._column._apply_scan_op('product')
    return Series(scanned, name=self.name)

def mean(self, axis=None, skipna=True, dtype=None):
"""Compute the mean of the series
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@

import pytest
import numpy as np
import pandas as pd
import cudf.bindings.reduce as cpp_reduce

from itertools import product
from cudf.dataframe.buffer import Buffer
from cudf.dataframe.numerical import NumericalColumn
from cudf.dataframe.dataframe import Series, DataFrame
from cudf.tests import utils
from cudf.tests.utils import gen_rand

Expand Down Expand Up @@ -110,3 +112,56 @@ def test_prefixsum_masked(dtype, nelem):

decimal = 4 if dtype == np.float32 else 6
np.testing.assert_array_almost_equal(expect, got, decimal=decimal)


@pytest.mark.parametrize('dtype,nelem', list(_gen_params()))
def test_cumsum(dtype, nelem):
    """Check ``Series.cumsum`` against pandas for plain and named series."""
    # int8 input is drawn from a narrow range to keep data in range.
    rand_kwargs = {'low': -2, 'high': 2} if dtype == np.int8 else {}
    values = gen_rand(dtype, nelem, **rand_kwargs)

    if dtype == np.float32:
        places = 4
    else:
        places = 6

    # series
    gpu_series = Series(values)
    cpu_series = pd.Series(values)
    np.testing.assert_array_almost_equal(
        gpu_series.cumsum(), cpu_series.cumsum(), decimal=places)

    # dataframe series (named series)
    gpu_frame = DataFrame()
    gpu_frame['a'] = Series(values)
    cpu_frame = pd.DataFrame()
    cpu_frame['a'] = pd.Series(values)
    np.testing.assert_array_almost_equal(
        gpu_frame.a.cumsum(), cpu_frame.a.cumsum(), decimal=places)


def test_cumsum_masked():
    """Placeholder: the masked ``cumsum`` test has not been written yet."""
    # Skip explicitly so the gap shows up in test reports instead of
    # being counted as a passing test.
    pytest.skip('test_cumsum_masked is not implemented yet')


def test_cummin():
    """Placeholder: the ``cummin`` test has not been written yet."""
    # Skip explicitly so the gap shows up in test reports instead of
    # being counted as a passing test.
    pytest.skip('test_cummin is not implemented yet')


def test_cummin_masked():
    """Placeholder: the masked ``cummin`` test has not been written yet."""
    # Skip explicitly so the gap shows up in test reports instead of
    # being counted as a passing test.
    pytest.skip('test_cummin_masked is not implemented yet')


def test_cummax():
    """Placeholder: the ``cummax`` test has not been written yet."""
    # Skip explicitly so the gap shows up in test reports instead of
    # being counted as a passing test.
    pytest.skip('test_cummax is not implemented yet')


def test_cummax_masked():
    """Placeholder: the masked ``cummax`` test has not been written yet."""
    # Skip explicitly so the gap shows up in test reports instead of
    # being counted as a passing test.
    pytest.skip('test_cummax_masked is not implemented yet')


def test_cumprod():
    """Placeholder: the ``cumprod`` test has not been written yet."""
    # Skip explicitly so the gap shows up in test reports instead of
    # being counted as a passing test.
    pytest.skip('test_cumprod is not implemented yet')


def test_cumprod_masked():
    """Placeholder: the masked ``cumprod`` test has not been written yet."""
    # Skip explicitly so the gap shows up in test reports instead of
    # being counted as a passing test.
    pytest.skip('test_cumprod_masked is not implemented yet')