Merge pull request #1456 from beckernick/feature/numba-numeric-diff

[REVIEW] Add Series.diff() via Numba kernel
rapidsai · May 3, 2019 · 7f3d5fe · 7f3d5fe
2 parents c04ed61 + b57b396
commit 7f3d5fe
Show file tree

Hide file tree

Showing 4 changed files with 60 additions and 0 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -25,6 +25,7 @@
 - PR #1466 Add GPU-accelerated ORC Reader
 - PR #1565 Add build script for nightly doc builds
 - PR #1508 Add Series isna, isnull, and notna
+- PR #1456 Add Series.diff() via Numba kernel
 - PR #1588 Add Index `astype` typecasting
 - PR #1301 MultiIndex support
 - PR #1599 Level keyword supported in groupby

diff --git a/python/cudf/dataframe/series.py b/python/cudf/dataframe/series.py
@@ -1546,6 +1546,28 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
                                                      periods)
         return Series(output_dary, name=self.name, index=self.index)
 
+    def diff(self, periods=1):
+        """Compute ``self[i] - self[i - periods]`` for every position ``i``
+        and return the result as a new Series.
+
+        Parameters
+        ----------
+        periods : int, default 1
+            Offset N between the two positions being subtracted. It is
+            forwarded unchanged to the ``cudautils.gpu_diff`` kernel.
+
+        Returns
+        -------
+        Series
+            A new Series built from the kernel output, carrying the same
+            ``name`` and ``index`` as this Series.
+
+        Raises
+        ------
+        AssertionError
+            If the column contains any null values.
+        NotImplementedError
+            If the column dtype is not a numeric dtype.
+
+        Notes
+        -----
+        Diff currently only supports float and integer dtype columns with
+        no null values.
+        """
+        # Nulls are rejected up front: the kernel works on the raw data
+        # buffer only.
+        has_nulls = self.null_count != 0
+        if has_nulls:
+            null_msg = ("Diff currently requires columns with no "
+                        "null values")
+            raise AssertionError(null_msg)
+
+        is_numeric = np.issubdtype(self.dtype, np.number)
+        if not is_numeric:
+            dtype_msg = ("Diff currently only supports "
+                         "numeric dtypes")
+            raise NotImplementedError(dtype_msg)
+
+        # Allocate an output buffer shaped like the input and launch one
+        # kernel task per output element.
+        source = self.data.to_gpu_array()
+        result = rmm.device_array_like(source)
+        launch = cudautils.gpu_diff.forall(result.size)
+        launch(source, result, periods)
+        return Series(result, name=self.name, index=self.index)
+
     def groupby(self, group_series=None, level=None, sort=False):
         from cudf.groupby.groupby import SeriesGroupBy
         return SeriesGroupBy(self, group_series, level, sort)

diff --git a/python/cudf/tests/test_dataframe.py b/python/cudf/tests/test_dataframe.py
@@ -2270,6 +2270,24 @@ def test_shift(dtype, period):
     assert_eq(shifted_outcome, expected_outcome)
 
 
+@pytest.mark.parametrize('dtype', ['int8', 'int16', 'int32', 'int64',
+                                   'float32', 'float64'])
+@pytest.mark.parametrize('period', [-1, -5, -10, -20, 0, 1, 5, 10, 20])
+def test_diff(dtype, period):
+    """Check Series.diff against pandas for each dtype/period pair.
+
+    pandas yields NaN where no element lies ``period`` positions away; those
+    are replaced by -1 and cast back to ``dtype`` before comparing.
+    """
+    # ``dtype`` is parametrized as a string, so compare against the string.
+    # Comparing with the ``np.int8`` type object is always False, which left
+    # the narrow-range branch unreachable.
+    if dtype == 'int8':
+        # to keep data in range
+        data = gen_rand(dtype, 100000, low=-2, high=2)
+    else:
+        data = gen_rand(dtype, 100000)
+
+    gdf = DataFrame({'a': data})
+    pdf = pd.DataFrame({'a': data})
+
+    diffed_outcome = gdf.a.diff(period)
+    expected_outcome = pdf.a.diff(period).fillna(-1).astype(dtype)
+    assert_eq(diffed_outcome, expected_outcome)
+
+
 def test_isnull_isna():
     # float some missing
     ps = pd.DataFrame({'a': [0, 1, 2, np.nan, 4, None, 6]})

diff --git a/python/cudf/utils/cudautils.py b/python/cudf/utils/cudautils.py
@@ -645,6 +645,25 @@ def gpu_shift(in_col, out_col, N):
             out_col[i] = -1
 
 
+@cuda.jit
+def gpu_diff(in_col, out_col, N):
+    """Calculate the difference between values at positions i and i - N in an
+    array and store the output in a new array.
+
+    Each thread handles one index ``i``. When ``i - N`` is a valid position
+    of ``in_col`` the thread writes ``in_col[i] - in_col[i - N]`` to
+    ``out_col[i]``; otherwise it writes the sentinel value ``-1``. Threads
+    whose index lies outside the array do nothing.
+
+    Parameters
+    ----------
+    in_col : device array
+        Input values.
+    out_col : device array
+        Output buffer; indexed with the same ``i`` as ``in_col``.
+    N : int
+        Offset between the two positions; may be positive, negative or zero.
+    """
+    i = cuda.grid(1)
+    size = in_col.size
+
+    # Guard every access with the array bounds. The previous branching let
+    # threads touch memory past the end of the arrays when N == 0 or when
+    # abs(N) exceeded the array size.
+    if i < size:
+        j = i - N
+        if j >= 0 and j < size:
+            out_col[i] = in_col[i] - in_col[j]
+        else:
+            # No element N positions away from i: store the sentinel.
+            out_col[i] = -1
+
+
 MAX_FAST_UNIQUE_K = 2 * 1024