Skip to content

Commit

Permalink
Merge branch 'branch-0.19' into copy-decimal-type-metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
rgsl888prabhu authored Apr 1, 2021
2 parents f439c20 + 6cab04a commit 389d14b
Show file tree
Hide file tree
Showing 15 changed files with 783 additions and 122 deletions.
6 changes: 5 additions & 1 deletion cpp/include/cudf/detail/gather.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,11 @@ void gather_helper(InputItr source_itr,
// Error case when no other overload or specialization is available
// Fallback gather functor for element types without a dedicated overload or
// specialization: whatever arguments it is invoked with, it reports the
// failure through CUDF_FAIL("Unsupported type in gather.").
// NOTE(review): this span is a rendered diff, so it shows both the C-style
// variadic `operator()(...)` and the variadic-template `operator()(Args&&...)`
// forms of the same catch-all call operator.
template <typename Element, typename Enable = void>
struct column_gatherer_impl {
std::unique_ptr<column> operator()(...) { CUDF_FAIL("Unsupported type in gather."); }
// Accepts any argument list; the arguments are intentionally unnamed and unused.
template <typename... Args>
std::unique_ptr<column> operator()(Args&&...)
{
CUDF_FAIL("Unsupported type in gather.");
}
};

/**
Expand Down
6 changes: 5 additions & 1 deletion cpp/include/cudf/detail/scatter.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,11 @@ auto scatter_to_gather(MapIterator scatter_map_begin,

// Primary template of the scatter functor: every call, regardless of the
// arguments passed, reports failure through
// CUDF_FAIL("Unsupported type for scatter.").
// NOTE(review): presumably supported element types are handled by
// specializations declared after this struct -- confirm against the full file.
// NOTE(review): this span is a rendered diff, so it shows both the C-style
// variadic `operator()(...)` and the variadic-template `operator()(Args&&...)`
// forms of the same catch-all call operator.
template <typename Element, typename Enable = void>
struct column_scatterer_impl {
std::unique_ptr<column> operator()(...) const { CUDF_FAIL("Unsupported type for scatter."); }
// Accepts any argument list; the arguments are intentionally unnamed and unused.
template <typename... Args>
std::unique_ptr<column> operator()(Args&&...) const
{
CUDF_FAIL("Unsupported type for scatter.");
}
};

template <typename Element>
Expand Down
6 changes: 5 additions & 1 deletion cpp/src/copying/copy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,11 @@ namespace {

// Primary template of the copy_if_else functor: every call, regardless of the
// arguments passed, reports failure through
// CUDF_FAIL("Unsupported type for copy_if_else.").
// NOTE(review): presumably supported types are handled by specializations
// declared after this struct -- confirm against the full file.
// NOTE(review): this span is a rendered diff, so it shows both the C-style
// variadic `operator()(...)` and the variadic-template `operator()(Args&&...)`
// forms of the same catch-all call operator.
template <typename T, typename Enable = void>
struct copy_if_else_functor_impl {
std::unique_ptr<column> operator()(...) { CUDF_FAIL("Unsupported type for copy_if_else."); }
// Accepts any argument list; the arguments are intentionally unnamed and unused.
template <typename... Args>
std::unique_ptr<column> operator()(Args&&...)
{
CUDF_FAIL("Unsupported type for copy_if_else.");
}
};

template <typename T>
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/replace/nulls.cu
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ std::unique_ptr<cudf::column> replace_nulls(cudf::column_view const& input,
CUDF_EXPECTS(replacement.size() == input.size(), "Column size mismatch");

if (input.is_empty()) { return cudf::empty_like(input); }
if (!input.has_nulls()) { return std::make_unique<cudf::column>(input); }
if (!input.has_nulls()) { return std::make_unique<cudf::column>(input, stream, mr); }

return cudf::type_dispatcher<dispatch_storage_type>(
input.type(), replace_nulls_column_kernel_forwarder{}, input, replacement, stream, mr);
Expand Down
2 changes: 2 additions & 0 deletions docs/cudf/source/basics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ The following table lists all of cudf types. For methods requiring dtype argumen
+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+
| Boolean | | np.bool_ | ``'bool'`` |
+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+
| Decimal | Decimal64Dtype | (none) | (none) |
+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+

**Note: All dtypes above are Nullable**

Expand Down
36 changes: 18 additions & 18 deletions docs/cudf/source/groupby.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,24 +120,24 @@ a

The following table summarizes the available aggregations and the types that support them:

| Aggregations\dtypes | Numeric | Datetime | String | Categorical | List | Struct |
| ------------------- | -------- | ------- | -------- | ----------- | ---- | ------ |
| count | ✅ | ✅ | ✅ | ✅ | | |
| size | ✅ | ✅ | ✅ | ✅ | | |
| sum | ✅ | ✅ | | | | |
| idxmin | ✅ | ✅ | | | | |
| idxmax | ✅ | ✅ | | | | |
| min | ✅ | ✅ | ✅ | | | |
| max | ✅ | ✅ | ✅ | | | |
| mean | ✅ | ✅ | | | | |
| var | ✅ | ✅ | | | | |
| std | ✅ | ✅ | | | | |
| quantile | ✅ | ✅ | | | | |
| median | ✅ | ✅ | | | | |
| nunique | ✅ | ✅ | ✅ | ✅ | | |
| nth | ✅ | ✅ | ✅ | | | |
| collect | ✅ | ✅ | ✅ | | ✅ | |
| unique | ✅ | ✅ | ✅ | ✅ | | |
| Aggregations\dtypes | Numeric | Datetime | String | Categorical | List | Struct | Interval | Decimal |
| ------------------- | -------- | ------- | -------- | ----------- | ---- | ------ | -------- | ------- |
| count | ✅ | ✅ | ✅ | ✅ | | | | ✅ |
| size | ✅ | ✅ | ✅ | ✅ | | | | ✅ |
| sum | ✅ | ✅ | | | | | | ✅ |
| idxmin | ✅ | ✅ | | | | | | ✅ |
| idxmax | ✅ | ✅ | | | | | | ✅ |
| min | ✅ | ✅ | ✅ | | | | | ✅ |
| max | ✅ | ✅ | ✅ | | | | | ✅ |
| mean | ✅ | ✅ | | | | | | |
| var | ✅ | ✅ | | | | | | |
| std | ✅ | ✅ | | | | | | |
| quantile | ✅ | ✅ | | | | | | |
| median | ✅ | ✅ | | | | | | |
| nunique | ✅ | ✅ | ✅ | ✅ | | | | ✅ |
| nth | ✅ | ✅ | ✅ | | | | | ✅ |
| collect | ✅ | ✅ | ✅ | | ✅ | | | ✅ |
| unique | ✅ | ✅ | ✅ | ✅ | | | | |

## GroupBy apply

Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from libc.stdint cimport int64_t, int32_t

cdef extern from "cudf/fixed_point/fixed_point.hpp" namespace "numeric" nogil:
# cython type stub to help resolve to numeric::decimal64
ctypedef int64_t decimal64

cdef cppclass scale_type:
Expand Down
69 changes: 49 additions & 20 deletions python/cudf/cudf/_lib/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from collections import defaultdict

import numpy as np
import rmm

from libcpp.pair cimport pair
from libcpp.memory cimport unique_ptr
Expand All @@ -20,25 +21,9 @@ cimport cudf._lib.cpp.groupby as libcudf_groupby
cimport cudf._lib.cpp.aggregation as libcudf_aggregation


_GROUPBY_AGGS = {
"count",
"size",
"sum",
"idxmin",
"idxmax",
"min",
"max",
"mean",
"var",
"std",
"quantile",
"median",
"nunique",
"nth",
"collect",
"unique",
}

# The sets below define the possible aggregations that can be performed on
# different dtypes. The uppercased versions of these strings correspond to
# elements of the AggregationKind enum.
_CATEGORICAL_AGGS = {
"count",
"size",
Expand All @@ -61,6 +46,24 @@ _LIST_AGGS = {
"collect",
}

_STRUCT_AGGS = {
}

_INTERVAL_AGGS = {
}

_DECIMAL_AGGS = {
"count",
"sum",
"argmin",
"argmax",
"min",
"max",
"nunique",
"nth",
"collect"
}


cdef class GroupBy:
cdef unique_ptr[libcudf_groupby.groupby] c_obj
Expand Down Expand Up @@ -197,7 +200,10 @@ def _drop_unsupported_aggs(Table values, aggs):
from cudf.utils.dtypes import (
is_categorical_dtype,
is_string_dtype,
is_list_dtype
is_list_dtype,
is_interval_dtype,
is_struct_dtype,
is_decimal_dtype,
)
result = aggs.copy()

Expand All @@ -220,6 +226,29 @@ def _drop_unsupported_aggs(Table values, aggs):
for i, agg_name in enumerate(aggs[col_name]):
if Aggregation(agg_name).kind not in _CATEGORICAL_AGGS:
del result[col_name][i]
elif (
is_struct_dtype(values._data[col_name].dtype)
):
for i, agg_name in enumerate(aggs[col_name]):
if Aggregation(agg_name).kind not in _STRUCT_AGGS:
del result[col_name][i]
elif (
is_interval_dtype(values._data[col_name].dtype)
):
for i, agg_name in enumerate(aggs[col_name]):
if Aggregation(agg_name).kind not in _INTERVAL_AGGS:
del result[col_name][i]
elif (
is_decimal_dtype(values._data[col_name].dtype)
):
if rmm._cuda.gpu.runtimeGetVersion() < 11000:
raise RuntimeError(
"Decimal aggregations are only supported on CUDA >= 11 "
"due to an nvcc compiler bug."
)
for i, agg_name in enumerate(aggs[col_name]):
if Aggregation(agg_name).kind not in _DECIMAL_AGGS:
del result[col_name][i]

if all(len(v) == 0 for v in result.values()):
raise DataError("No numeric types to aggregate")
Expand Down
14 changes: 10 additions & 4 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,8 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
array.type, pd.core.arrays._arrow_utils.ArrowIntervalType
):
return cudf.core.column.IntervalColumn.from_arrow(array)
elif isinstance(array.type, pa.Decimal128Type):
return cudf.core.column.DecimalColumn.from_arrow(array)

return libcudf.interop.from_arrow(data, data.column_names)._data[
"None"
Expand Down Expand Up @@ -1853,10 +1855,14 @@ def as_column(
cupy.asarray(arbitrary), nan_as_null=nan_as_null, dtype=dtype
)
else:
data = as_column(
pa.array(arbitrary, from_pandas=nan_as_null),
dtype=arbitrary.dtype,
)
pyarrow_array = pa.array(arbitrary, from_pandas=nan_as_null)
if isinstance(pyarrow_array.type, pa.Decimal128Type):
pyarrow_type = cudf.Decimal64Dtype.from_arrow(
pyarrow_array.type
)
else:
pyarrow_type = arbitrary.dtype
data = as_column(pyarrow_array, dtype=pyarrow_type)
if dtype is not None:
data = data.astype(dtype)

Expand Down
43 changes: 37 additions & 6 deletions python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import cupy as cp
import numpy as np
import pyarrow as pa
from pandas.api.types import is_integer_dtype

import cudf
from cudf import _lib as libcudf
Expand Down Expand Up @@ -66,17 +67,47 @@ def to_arrow(self):
def binary_operator(self, op, other, reflect=False):
    """Apply the binary operation ``op`` between this column and ``other``.

    Parameters
    ----------
    op : str
        Name of the operation. ``"add"``, ``"sub"`` and ``"mul"`` are
        treated as arithmetic; ``"eq"``, ``"lt"``, ``"gt"``, ``"le"`` and
        ``"ge"`` are treated as comparisons.
    other : DecimalColumn, NumericalColumn or cudf.Scalar
        The other operand. For comparisons, a ``NumericalColumn`` must
        have an integer dtype; it is converted to a decimal column with
        scale 0 before the comparison.
    reflect : bool, default False
        When True the two operands are swapped before ``op`` is applied.

    Returns
    -------
    The column returned by ``libcudf.binaryop.binaryop``. For arithmetic
    its dtype precision is overwritten with the value computed by
    ``_binop_precision``.

    Raises
    ------
    TypeError
        If ``op`` is not one of the operations listed above, if ``other``
        has an unsupported type for a comparison, or if ``other`` is a
        non-integer ``NumericalColumn`` in a comparison.
    """
    if reflect:
        self, other = other, self

    # Binary arithmetic between decimal columns. `scale` and `precision`
    # are computed outside of libcudf.
    if op in ("add", "sub", "mul"):
        scale = _binop_scale(self.dtype, other.dtype, op)
        output_type = Decimal64Dtype(
            scale=scale, precision=Decimal64Dtype.MAX_PRECISION
        )  # precision will be ignored, libcudf has no notion of precision
        result = libcudf.binaryop.binaryop(self, other, op, output_type)
        result.dtype.precision = _binop_precision(
            self.dtype, other.dtype, op
        )
    elif op in ("eq", "lt", "gt", "le", "ge"):
        if not isinstance(
            other,
            (DecimalColumn, cudf.core.column.NumericalColumn, cudf.Scalar),
        ):
            # The trailing space keeps "between" from running into the
            # type name when the adjacent f-strings are concatenated.
            raise TypeError(
                f"Operator {op} not supported between "
                f"{str(type(self))} and {str(type(other))}"
            )
        if isinstance(
            other, cudf.core.column.NumericalColumn
        ) and not is_integer_dtype(other.dtype):
            raise TypeError(
                f"Only decimal and integer column is supported for {op}."
            )
        if isinstance(other, cudf.core.column.NumericalColumn):
            # Integers are compared as decimals with scale 0.
            other = other.as_decimal_column(
                Decimal64Dtype(Decimal64Dtype.MAX_PRECISION, 0)
            )
        result = libcudf.binaryop.binaryop(self, other, op, bool)
    else:
        # Without this branch `result` would be unbound and the `return`
        # below would fail with an unrelated UnboundLocalError.
        raise TypeError(
            f"Operator {op} is not supported for {str(type(self))}"
        )
    return result

def normalize_binop_value(self, other):
    """Convert ``other`` into an operand usable in a decimal binary operation.

    Parameters
    ----------
    other : int, decimal.Decimal or cudf.Scalar
        The value to normalize.

    Returns
    -------
    cudf.Scalar
        A new scalar wrapping ``Decimal(other)`` when ``other`` is a scalar
        ``int`` or ``Decimal``; ``other`` itself when it is already a
        ``cudf.Scalar`` whose dtype is a ``cudf.Decimal64Dtype``.

    Raises
    ------
    TypeError
        If ``other`` is neither of the above.
    """
    # `np.int` was only an alias of the builtin `int` and has been removed
    # from recent NumPy releases, so checking `int` alone is equivalent and
    # does not raise AttributeError on those releases.
    if is_scalar(other) and isinstance(other, (int, Decimal)):
        return cudf.Scalar(Decimal(other))
    elif isinstance(other, cudf.Scalar) and isinstance(
        other.dtype, cudf.Decimal64Dtype
    ):
        return other
    else:
        raise TypeError(f"cannot normalize {type(other)}")

Expand Down
15 changes: 14 additions & 1 deletion python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
column,
string,
)
from cudf.core.dtypes import Decimal64Dtype
from cudf.utils import cudautils, utils
from cudf.utils.dtypes import (
min_column_type,
Expand Down Expand Up @@ -103,11 +104,23 @@ def binary_operator(
out_dtype = self.dtype
else:
if not (
isinstance(rhs, (NumericalColumn, cudf.Scalar,),)
isinstance(
rhs,
(
NumericalColumn,
cudf.Scalar,
cudf.core.column.DecimalColumn,
),
)
or np.isscalar(rhs)
):
msg = "{!r} operator not supported between {} and {}"
raise TypeError(msg.format(binop, type(self), type(rhs)))
if isinstance(rhs, cudf.core.column.DecimalColumn):
lhs = self.as_decimal_column(
Decimal64Dtype(Decimal64Dtype.MAX_PRECISION, 0)
)
return lhs.binary_operator(binop, rhs)
out_dtype = np.result_type(self.dtype, rhs.dtype)
if binop in ["mod", "floordiv"]:
tmp = self if reflect else rhs
Expand Down
Loading

0 comments on commit 389d14b

Please sign in to comment.