Skip to content

Commit

Permalink
Merge pull request #1456 from beckernick/feature/numba-numeric-diff
Browse files Browse the repository at this point in the history
[REVIEW] Add Series.diff() via Numba kernel
  • Loading branch information
Keith Kraus authored May 3, 2019
2 parents c04ed61 + b57b396 commit 7f3d5fe
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
- PR #1466 Add GPU-accelerated ORC Reader
- PR #1565 Add build script for nightly doc builds
- PR #1508 Add Series isna, isnull, and notna
- PR #1456 Add Series.diff() via Numba kernel
- PR #1588 Add Index `astype` typecasting
- PR #1301 MultiIndex support
- PR #1599 Level keyword supported in groupby
Expand Down
22 changes: 22 additions & 0 deletions python/cudf/dataframe/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1546,6 +1546,28 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
periods)
return Series(output_dary, name=self.name, index=self.index)

def diff(self, periods=1):
    """Calculate the difference between values at positions i and i - N in
    an array and store the output in a new array.

    Parameters
    ----------
    periods : int, default 1
        Offset ``N`` between the two positions being subtracted. May be
        negative or zero. Integer-valued floats (e.g. ``2.0``) are accepted
        and converted to ``int``.

    Returns
    -------
    Series
        A new Series carrying this Series' name and index. Positions that
        have no element ``periods`` steps away are filled with ``-1`` by
        the ``cudautils.gpu_diff`` kernel.

    Raises
    ------
    AssertionError
        If the Series contains null values.
    NotImplementedError
        If the Series dtype is not numeric.
    ValueError
        If ``periods`` is not an integer.

    Notes
    -----
    Diff currently only supports float and integer dtype columns with
    no null values.
    """
    # Function-scope import keeps this block self-contained.
    import numbers

    if self.null_count != 0:
        raise AssertionError("Diff currently requires columns with no "
                             "null values")

    if not np.issubdtype(self.dtype, np.number):
        raise NotImplementedError("Diff currently only supports "
                                  "numeric dtypes")

    # The kernel uses ``periods`` as an index offset, so reject anything
    # that is not a whole number before any device work is done.
    if isinstance(periods, numbers.Integral):
        periods = int(periods)
    elif isinstance(periods, numbers.Real) and float(periods).is_integer():
        periods = int(periods)
    else:
        raise ValueError("periods must be an integer")

    input_dary = self.data.to_gpu_array()
    output_dary = rmm.device_array_like(input_dary)
    # One kernel thread per output element.
    cudautils.gpu_diff.forall(output_dary.size)(input_dary, output_dary,
                                                periods)
    return Series(output_dary, name=self.name, index=self.index)

def groupby(self, group_series=None, level=None, sort=False):
    """Build a ``SeriesGroupBy`` object for this Series.

    ``group_series``, ``level`` and ``sort`` are forwarded unchanged, in
    that order, to ``SeriesGroupBy`` after ``self``; see that class for
    their meaning.
    """
    # Imported at call time rather than at module load.
    # NOTE(review): presumably this avoids a circular import - confirm.
    from cudf.groupby.groupby import SeriesGroupBy

    grouped = SeriesGroupBy(self, group_series, level, sort)
    return grouped
Expand Down
18 changes: 18 additions & 0 deletions python/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2270,6 +2270,24 @@ def test_shift(dtype, period):
assert_eq(shifted_outcome, expected_outcome)


@pytest.mark.parametrize('dtype', ['int8', 'int16', 'int32', 'int64',
                                   'float32', 'float64'])
@pytest.mark.parametrize('period', [-1, -5, -10, -20, 0, 1, 5, 10, 20])
def test_diff(dtype, period):
    """Check ``Series.diff`` against pandas for each dtype and period.

    pandas yields NaN where no partner element exists, while the cudf
    result holds ``-1`` there, so the pandas result is filled with ``-1``
    and cast back to ``dtype`` before comparing.
    """
    # ``dtype`` is parametrized as a string. Comparing it to the ``np.int8``
    # class is always False, which left the narrow-range branch unreachable,
    # so compare against the string name instead.
    if dtype == 'int8':
        # to keep data in range
        data = gen_rand(dtype, 100000, low=-2, high=2)
    else:
        data = gen_rand(dtype, 100000)

    gdf = DataFrame({'a': data})
    pdf = pd.DataFrame({'a': data})

    diffed_outcome = gdf.a.diff(period)
    expected_outcome = pdf.a.diff(period).fillna(-1).astype(dtype)
    assert_eq(diffed_outcome, expected_outcome)


def test_isnull_isna():
# float some missing
ps = pd.DataFrame({'a': [0, 1, 2, np.nan, 4, None, 6]})
Expand Down
19 changes: 19 additions & 0 deletions python/cudf/utils/cudautils.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,6 +645,25 @@ def gpu_shift(in_col, out_col, N):
out_col[i] = -1


@cuda.jit
def gpu_diff(in_col, out_col, N):
    """Calculate the difference between values at positions i and i - N in an
    array and store the output in a new array.

    Each thread handles one output position ``i``. When ``i - N`` is a valid
    index of ``in_col`` the result is ``in_col[i] - in_col[i - N]``;
    otherwise there is no partner element and ``out_col[i]`` is set to the
    sentinel value ``-1``.

    Parameters
    ----------
    in_col : device array
        Input values.
    out_col : device array
        Output buffer, written only at indices below ``in_col.size``.
    N : int
        Offset between the two positions being subtracted. May be positive,
        negative or zero.
    """
    i = cuda.grid(1)
    size = in_col.size

    # A launch may contain more threads than elements, so threads beyond
    # the end of the column must not read or write anything.
    if i < size:
        j = i - N
        # Check the partner index explicitly so that it is never read out
        # of bounds, regardless of the sign or magnitude of N.
        if j >= 0 and j < size:
            out_col[i] = in_col[i] - in_col[j]
        else:
            out_col[i] = -1


# NOTE(review): presumably an upper bound on ``k`` for a fast unique-values
# path; its use is not visible in this chunk - confirm against the callers.
MAX_FAST_UNIQUE_K = 2 * 1024


Expand Down

0 comments on commit 7f3d5fe

Please sign in to comment.