Merge branch 'branch-0.19' into copy-decimal-type-metadata

rapidsai · Apr 1, 2021 · 389d14b · 389d14b
2 parents f439c20 + 6cab04a
commit 389d14b
Show file tree

Hide file tree

Showing 15 changed files with 783 additions and 122 deletions.
diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh
@@ -142,7 +142,11 @@ void gather_helper(InputItr source_itr,
 // Error case when no other overload or specialization is available
 template <typename Element, typename Enable = void>
 struct column_gatherer_impl {
-  std::unique_ptr<column> operator()(...) { CUDF_FAIL("Unsupported type in gather."); }
+  template <typename... Args>
+  std::unique_ptr<column> operator()(Args&&...)
+  {
+    CUDF_FAIL("Unsupported type in gather.");
+  }
 };
 
 /**

diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh
@@ -81,7 +81,11 @@ auto scatter_to_gather(MapIterator scatter_map_begin,
 
 template <typename Element, typename Enable = void>
 struct column_scatterer_impl {
-  std::unique_ptr<column> operator()(...) const { CUDF_FAIL("Unsupported type for scatter."); }
+  template <typename... Args>
+  std::unique_ptr<column> operator()(Args&&...) const
+  {
+    CUDF_FAIL("Unsupported type for scatter.");
+  }
 };
 
 template <typename Element>

diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu
@@ -31,7 +31,11 @@ namespace {
 
 template <typename T, typename Enable = void>
 struct copy_if_else_functor_impl {
-  std::unique_ptr<column> operator()(...) { CUDF_FAIL("Unsupported type for copy_if_else."); }
+  template <typename... Args>
+  std::unique_ptr<column> operator()(Args&&...)
+  {
+    CUDF_FAIL("Unsupported type for copy_if_else.");
+  }
 };
 
 template <typename T>

diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu
@@ -424,7 +424,7 @@ std::unique_ptr<cudf::column> replace_nulls(cudf::column_view const& input,
   CUDF_EXPECTS(replacement.size() == input.size(), "Column size mismatch");
 
   if (input.is_empty()) { return cudf::empty_like(input); }
-  if (!input.has_nulls()) { return std::make_unique<cudf::column>(input); }
+  if (!input.has_nulls()) { return std::make_unique<cudf::column>(input, stream, mr); }
 
   return cudf::type_dispatcher<dispatch_storage_type>(
     input.type(), replace_nulls_column_kernel_forwarder{}, input, replacement, stream, mr);

diff --git a/docs/cudf/source/basics.rst b/docs/cudf/source/basics.rst
@@ -34,6 +34,8 @@ The following table lists all of cudf types. For methods requiring dtype argumen
 +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+
 | Boolean                |                  | np.bool_                                                                            | ``'bool'``                                  |
 +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+
+| Decimal                | Decimal64Dtype   | (none)                                                                              | (none)                                      |
++------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+
 
 **Note: All dtypes above are Nullable**
 

diff --git a/docs/cudf/source/groupby.md b/docs/cudf/source/groupby.md
@@ -120,24 +120,24 @@ a
 
 The following table summarizes the available aggregations and the types that support them:
 
-| Aggregations\dtypes | Numeric  | Datetime | String   | Categorical | List | Struct |
-| ------------------- | -------- | -------  | -------- | ----------- | ---- | ------ |
-| count               | ✅       | ✅       | ✅       | ✅          |      |        |
-| size                | ✅       | ✅       | ✅       | ✅          |      |        |
-| sum                 | ✅       | ✅       |          |             |      |        |
-| idxmin              | ✅       | ✅       |          |             |      |        |
-| idxmax              | ✅       | ✅       |          |             |      |        |
-| min                 | ✅       | ✅       | ✅       |             |      |        |
-| max                 | ✅       | ✅       | ✅       |             |      |        |
-| mean                | ✅       | ✅       |          |             |      |        |
-| var                 | ✅       | ✅       |          |             |      |        |
-| std                 | ✅       | ✅       |          |             |      |        |
-| quantile            | ✅       | ✅       |          |             |      |        |
-| median              | ✅       | ✅       |          |             |      |        |
-| nunique             | ✅       | ✅       | ✅       | ✅          |      |        |
-| nth                 | ✅       | ✅       | ✅       |             |      |        |
-| collect             | ✅       | ✅       | ✅       |             | ✅   |        |
-| unique              | ✅       | ✅       | ✅       | ✅          |      |        |
+| Aggregations\dtypes | Numeric  | Datetime | String   | Categorical | List | Struct | Interval | Decimal |
+| ------------------- | -------- | -------  | -------- | ----------- | ---- | ------ | -------- | ------- |
+| count               | ✅       | ✅       | ✅       | ✅          |      |        |          | ✅      |
+| size                | ✅       | ✅       | ✅       | ✅          |      |        |          | ✅      |
+| sum                 | ✅       | ✅       |          |             |      |        |          | ✅      |
+| idxmin              | ✅       | ✅       |          |             |      |        |          | ✅      |
+| idxmax              | ✅       | ✅       |          |             |      |        |          | ✅      |
+| min                 | ✅       | ✅       | ✅       |             |      |        |          | ✅      |
+| max                 | ✅       | ✅       | ✅       |             |      |        |          | ✅      |
+| mean                | ✅       | ✅       |          |             |      |        |          |         |
+| var                 | ✅       | ✅       |          |             |      |        |          |         |
+| std                 | ✅       | ✅       |          |             |      |        |          |         |
+| quantile            | ✅       | ✅       |          |             |      |        |          |         |
+| median              | ✅       | ✅       |          |             |      |        |          |         |
+| nunique             | ✅       | ✅       | ✅       | ✅          |      |        |          | ✅      |
+| nth                 | ✅       | ✅       | ✅       |             |      |        |          | ✅      |
+| collect             | ✅       | ✅       | ✅       |             | ✅   |        |          | ✅      |
+| unique              | ✅       | ✅       | ✅       | ✅          |      |        |          |         |
 
 ## GroupBy apply
 

diff --git a/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd b/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd
@@ -2,6 +2,7 @@
 from libc.stdint cimport int64_t, int32_t
 
 cdef extern from "cudf/fixed_point/fixed_point.hpp" namespace "numeric" nogil:
+    # cython type stub to help resolve to numeric::decimal64
     ctypedef int64_t decimal64
 
     cdef cppclass scale_type:

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
@@ -3,6 +3,7 @@
 from collections import defaultdict
 
 import numpy as np
+import rmm
 
 from libcpp.pair cimport pair
 from libcpp.memory cimport unique_ptr
@@ -20,25 +21,9 @@ cimport cudf._lib.cpp.groupby as libcudf_groupby
 cimport cudf._lib.cpp.aggregation as libcudf_aggregation
 
 
-_GROUPBY_AGGS = {
-    "count",
-    "size",
-    "sum",
-    "idxmin",
-    "idxmax",
-    "min",
-    "max",
-    "mean",
-    "var",
-    "std",
-    "quantile",
-    "median",
-    "nunique",
-    "nth",
-    "collect",
-    "unique",
-}
-
+# The sets below define the possible aggregations that can be performed on
+# different dtypes. The uppercased versions of these strings correspond to
+# elements of the AggregationKind enum.
 _CATEGORICAL_AGGS = {
     "count",
     "size",
@@ -61,6 +46,24 @@ _LIST_AGGS = {
     "collect",
 }
 
+_STRUCT_AGGS = {
+}
+
+_INTERVAL_AGGS = {
+}
+
+_DECIMAL_AGGS = {
+    "count",
+    "sum",
+    "argmin",
+    "argmax",
+    "min",
+    "max",
+    "nunique",
+    "nth",
+    "collect"
+}
+
 
 cdef class GroupBy:
     cdef unique_ptr[libcudf_groupby.groupby] c_obj
@@ -197,7 +200,10 @@ def _drop_unsupported_aggs(Table values, aggs):
     from cudf.utils.dtypes import (
         is_categorical_dtype,
         is_string_dtype,
-        is_list_dtype
+        is_list_dtype,
+        is_interval_dtype,
+        is_struct_dtype,
+        is_decimal_dtype,
     )
     result = aggs.copy()
 
@@ -220,6 +226,29 @@ def _drop_unsupported_aggs(Table values, aggs):
             for i, agg_name in enumerate(aggs[col_name]):
                 if Aggregation(agg_name).kind not in _CATEGORICAL_AGGS:
                     del result[col_name][i]
+        elif (
+                is_struct_dtype(values._data[col_name].dtype)
+        ):
+            for i, agg_name in enumerate(aggs[col_name]):
+                if Aggregation(agg_name).kind not in _STRUCT_AGGS:
+                    del result[col_name][i]
+        elif (
+                is_interval_dtype(values._data[col_name].dtype)
+        ):
+            for i, agg_name in enumerate(aggs[col_name]):
+                if Aggregation(agg_name).kind not in _INTERVAL_AGGS:
+                    del result[col_name][i]
+        elif (
+                is_decimal_dtype(values._data[col_name].dtype)
+        ):
+            if rmm._cuda.gpu.runtimeGetVersion() < 11000:
+                raise RuntimeError(
+                    "Decimal aggregations are only supported on CUDA >= 11 "
+                    "due to an nvcc compiler bug."
+                )
+            for i, agg_name in enumerate(aggs[col_name]):
+                if Aggregation(agg_name).kind not in _DECIMAL_AGGS:
+                    del result[col_name][i]
 
     if all(len(v) == 0 for v in result.values()):
         raise DataError("No numeric types to aggregate")

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -426,6 +426,8 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
             array.type, pd.core.arrays._arrow_utils.ArrowIntervalType
         ):
             return cudf.core.column.IntervalColumn.from_arrow(array)
+        elif isinstance(array.type, pa.Decimal128Type):
+            return cudf.core.column.DecimalColumn.from_arrow(array)
 
         return libcudf.interop.from_arrow(data, data.column_names)._data[
             "None"
@@ -1853,10 +1855,14 @@ def as_column(
                 cupy.asarray(arbitrary), nan_as_null=nan_as_null, dtype=dtype
             )
         else:
-            data = as_column(
-                pa.array(arbitrary, from_pandas=nan_as_null),
-                dtype=arbitrary.dtype,
-            )
+            pyarrow_array = pa.array(arbitrary, from_pandas=nan_as_null)
+            if isinstance(pyarrow_array.type, pa.Decimal128Type):
+                pyarrow_type = cudf.Decimal64Dtype.from_arrow(
+                    pyarrow_array.type
+                )
+            else:
+                pyarrow_type = arbitrary.dtype
+            data = as_column(pyarrow_array, dtype=pyarrow_type)
         if dtype is not None:
             data = data.astype(dtype)
 

diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
@@ -6,6 +6,7 @@
 import cupy as cp
 import numpy as np
 import pyarrow as pa
+from pandas.api.types import is_integer_dtype
 
 import cudf
 from cudf import _lib as libcudf
@@ -66,17 +67,47 @@ def to_arrow(self):
     def binary_operator(self, op, other, reflect=False):
         if reflect:
             self, other = other, self
-        scale = _binop_scale(self.dtype, other.dtype, op)
-        output_type = Decimal64Dtype(
-            scale=scale, precision=Decimal64Dtype.MAX_PRECISION
-        )  # precision will be ignored, libcudf has no notion of precision
-        result = libcudf.binaryop.binaryop(self, other, op, output_type)
-        result.dtype.precision = _binop_precision(self.dtype, other.dtype, op)
+
+        # Binary Arithmatics between decimal columns. `Scale` and `precision`
+        # are computed outside of libcudf
+        if op in ("add", "sub", "mul"):
+            scale = _binop_scale(self.dtype, other.dtype, op)
+            output_type = Decimal64Dtype(
+                scale=scale, precision=Decimal64Dtype.MAX_PRECISION
+            )  # precision will be ignored, libcudf has no notion of precision
+            result = libcudf.binaryop.binaryop(self, other, op, output_type)
+            result.dtype.precision = _binop_precision(
+                self.dtype, other.dtype, op
+            )
+        elif op in ("eq", "lt", "gt", "le", "ge"):
+            if not isinstance(
+                other,
+                (DecimalColumn, cudf.core.column.NumericalColumn, cudf.Scalar),
+            ):
+                raise TypeError(
+                    f"Operator {op} not supported between"
+                    f"{str(type(self))} and {str(type(other))}"
+                )
+            if isinstance(
+                other, cudf.core.column.NumericalColumn
+            ) and not is_integer_dtype(other.dtype):
+                raise TypeError(
+                    f"Only decimal and integer column is supported for {op}."
+                )
+            if isinstance(other, cudf.core.column.NumericalColumn):
+                other = other.as_decimal_column(
+                    Decimal64Dtype(Decimal64Dtype.MAX_PRECISION, 0)
+                )
+            result = libcudf.binaryop.binaryop(self, other, op, bool)
         return result
 
     def normalize_binop_value(self, other):
         if is_scalar(other) and isinstance(other, (int, np.int, Decimal)):
             return cudf.Scalar(Decimal(other))
+        elif isinstance(other, cudf.Scalar) and isinstance(
+            other.dtype, cudf.Decimal64Dtype
+        ):
+            return other
         else:
             raise TypeError(f"cannot normalize {type(other)}")
 

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
@@ -22,6 +22,7 @@
     column,
     string,
 )
+from cudf.core.dtypes import Decimal64Dtype
 from cudf.utils import cudautils, utils
 from cudf.utils.dtypes import (
     min_column_type,
@@ -103,11 +104,23 @@ def binary_operator(
             out_dtype = self.dtype
         else:
             if not (
-                isinstance(rhs, (NumericalColumn, cudf.Scalar,),)
+                isinstance(
+                    rhs,
+                    (
+                        NumericalColumn,
+                        cudf.Scalar,
+                        cudf.core.column.DecimalColumn,
+                    ),
+                )
                 or np.isscalar(rhs)
             ):
                 msg = "{!r} operator not supported between {} and {}"
                 raise TypeError(msg.format(binop, type(self), type(rhs)))
+            if isinstance(rhs, cudf.core.column.DecimalColumn):
+                lhs = self.as_decimal_column(
+                    Decimal64Dtype(Decimal64Dtype.MAX_PRECISION, 0)
+                )
+                return lhs.binary_operator(binop, rhs)
             out_dtype = np.result_type(self.dtype, rhs.dtype)
             if binop in ["mod", "floordiv"]:
                 tmp = self if reflect else rhs
-Original file line number
+Diff line change
@@ Expand Up @@
     +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+
     | Boolean                |                  | np.bool_                                                                            | ``'bool'``                                  |
     +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+
+    | Decimal                | Decimal64Dtype   | (none)                                                                              | (none)                                      |
+    +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+
     **Note: All dtypes above are Nullable**
@@ Expand Down @@