Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fixed_point + cudf::binary_operation API Changes #7435

Merged
merged 35 commits into from
Mar 12, 2021
Merged
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
d07c35f
Initial changes for binary_v_v
codereport Feb 24, 2021
0f528de
Fix
codereport Feb 24, 2021
223fe40
Revert "Fix"
codereport Feb 25, 2021
f8ec02b
Revert "Initial changes for binary_v_v"
codereport Feb 25, 2021
e0bf2c6
Require TRUE_DIV scale to be == to lhs - rhs scales
codereport Feb 25, 2021
2476eab
Use output_type
codereport Feb 26, 2021
ba38c80
Remove TRUE_DIV
codereport Feb 26, 2021
f1a62b3
Don't hardcode BOOL
codereport Feb 26, 2021
c0769f1
Add unit tests + CUDF_EXPECTS
codereport Mar 1, 2021
65d071f
Remove dead code
codereport Mar 1, 2021
0c8b7e0
Merge branch 'branch-0.19' into binaryop-error
codereport Mar 2, 2021
83070e8
Fix tests
codereport Mar 2, 2021
4860f22
Fix rescale
codereport Mar 2, 2021
aa2b5fe
Python changes (incomplete)
codereport Mar 4, 2021
e76979b
Python fix
codereport Mar 4, 2021
0f9a043
Black python formatting
codereport Mar 4, 2021
9388748
Flake8 python
codereport Mar 4, 2021
4f85dd4
flake8
codereport Mar 4, 2021
fada90b
Python cleanup/refactoring
codereport Mar 4, 2021
7517f73
flake8 fix
codereport Mar 4, 2021
64c3f40
flake8 fix
codereport Mar 4, 2021
3d325f8
flake8 fix
codereport Mar 4, 2021
5063738
flake8 fix
codereport Mar 4, 2021
055bf1a
Adding cudf::cast for APIs
codereport Mar 4, 2021
ff1926f
Add except *
codereport Mar 4, 2021
d90b69b
Merge branch 'branch-0.19' into binaryop-error
codereport Mar 6, 2021
f86f720
Fix + unit test for same scale comparison op
codereport Mar 8, 2021
ff540c3
Use MAX_PRECISION
codereport Mar 8, 2021
98f4411
Use more declarative ternary operator
codereport Mar 8, 2021
8178320
Use absolute path and remove local import
codereport Mar 9, 2021
c638707
black / flake8 fix
codereport Mar 9, 2021
efe92a3
Use MAX_PRECISION
codereport Mar 9, 2021
a1e3887
Unit tests
codereport Mar 9, 2021
440bf52
Merge branch 'branch-0.19' into binaryop-error
codereport Mar 9, 2021
43c2def
Merge branch 'branch-0.19' into binaryop-error
codereport Mar 9, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions cpp/include/cudf/binaryop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,5 +178,29 @@ std::unique_ptr<column> binary_operation(
data_type output_type,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Computes the `scale` for a `fixed_point` number based on given binary operator `op`
*
* @param op The binary_operator used for two `fixed_point` numbers
* @param left_scale Scale of left `fixed_point` number
* @param right_scale Scale of right `fixed_point` number
* @return The resulting `scale` of the computed `fixed_point` number
*/
int32_t binary_operation_fixed_point_scale(binary_operator op,
int32_t left_scale,
int32_t right_scale);

/**
* @brief Computes the `data_type` for a `fixed_point` number based on given binary operator `op`
*
* @param op The binary_operator used for two `fixed_point` numbers
* @param lhs `cudf::data_type` of left `fixed_point` number
* @param rhs `cudf::data_type` of right `fixed_point` number
* @return The resulting `cudf::data_type` of the computed `fixed_point` number
*/
cudf::data_type binary_operation_fixed_point_output_type(binary_operator op,
cudf::data_type const& lhs,
cudf::data_type const& rhs);

/** @} */ // end of group
} // namespace cudf
260 changes: 87 additions & 173 deletions cpp/src/binaryop/binaryop.cpp

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions cpp/src/unary/cast_ops.cu
Original file line number Diff line number Diff line change
Expand Up @@ -178,11 +178,13 @@ std::unique_ptr<column> rescale(column_view input,

if (input.type().scale() > scale) {
auto const scalar = make_fixed_point_scalar<T>(0, scale_type{scale});
return detail::binary_operation(input, *scalar, binary_operator::ADD, {}, stream, mr);
auto const type = cudf::data_type{cudf::type_to_id<T>(), scale};
return detail::binary_operation(input, *scalar, binary_operator::ADD, type, stream, mr);
} else {
auto const diff = input.type().scale() - scale;
auto const scalar = make_fixed_point_scalar<T>(std::pow(10, -diff), scale_type{diff});
return detail::binary_operation(input, *scalar, binary_operator::DIV, {}, stream, mr);
auto const type = cudf::data_type{cudf::type_to_id<T>(), scale};
return detail::binary_operation(input, *scalar, binary_operator::DIV, type, stream, mr);
}
};

Expand Down
280 changes: 207 additions & 73 deletions cpp/tests/binaryop/binop-integration-test.cpp

Large diffs are not rendered by default.

18 changes: 12 additions & 6 deletions cpp/tests/fixed_point/fixed_point_tests.cu
Original file line number Diff line number Diff line change
Expand Up @@ -596,8 +596,9 @@ TEST_F(FixedPointTest, PositiveScaleWithValuesOutsideUnderlyingType32)
auto const expected1 = fp_wrapper{{150000000}, scale_type{6}};
auto const expected2 = fp_wrapper{{50000000}, scale_type{6}};

auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, {});
auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, {});
auto const type = cudf::data_type{cudf::type_id::DECIMAL32, 6};
auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type);
auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type);

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
Expand All @@ -618,8 +619,9 @@ TEST_F(FixedPointTest, PositiveScaleWithValuesOutsideUnderlyingType64)
auto const expected1 = fp_wrapper{{150000000}, scale_type{100}};
auto const expected2 = fp_wrapper{{50000000}, scale_type{100}};

auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, {});
auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, {});
auto const type = cudf::data_type{cudf::type_id::DECIMAL64, 100};
auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type);
auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type);

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
Expand All @@ -630,6 +632,7 @@ TYPED_TEST(FixedPointTestBothReps, ExtremelyLargeNegativeScale)
// This is testing fixed_point values with an extremely large negative scale. The fixed_point
// implementation should be able to handle any scale representable by an int32_t

using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
using fp_wrapper = cudf::test::fixed_point_column_wrapper<TypeParam>;

auto const a = fp_wrapper{{10}, scale_type{-201}};
Expand All @@ -639,8 +642,11 @@ TYPED_TEST(FixedPointTestBothReps, ExtremelyLargeNegativeScale)
auto const expected1 = fp_wrapper{{150}, scale_type{-202}};
auto const expected2 = fp_wrapper{{5}, scale_type{-201}};

auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, {});
auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, {});
auto const type1 = cudf::data_type{cudf::type_to_id<decimalXX>(), -202};
auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type1);

auto const type2 = cudf::data_type{cudf::type_to_id<decimalXX>(), -201};
auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type2);

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
Expand Down
11 changes: 2 additions & 9 deletions python/cudf/cudf/_lib/binaryop.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ from cudf._lib.replace import replace_nulls
from cudf._lib.scalar import as_device_scalar
from cudf._lib.scalar cimport DeviceScalar
from cudf._lib.types import np_to_cudf_types
from cudf._lib.types cimport underlying_type_t_type_id
from cudf._lib.types cimport underlying_type_t_type_id, dtype_to_data_type

from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.scalar.scalar cimport scalar
Expand Down Expand Up @@ -174,15 +174,8 @@ def binaryop(lhs, rhs, op, dtype):
cdef binary_operator c_op = <binary_operator> (
<underlying_type_t_binary_operator> op
)
cdef type_id tid = (
<type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[np.dtype(dtype)]
)
)
)

cdef data_type c_dtype = data_type(tid)
cdef data_type c_dtype = dtype_to_data_type(dtype)

if is_scalar(lhs) or lhs is None:
is_string_col = is_string_dtype(rhs.dtype)
Expand Down
26 changes: 5 additions & 21 deletions python/cudf/cudf/_lib/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ from rmm._lib.device_buffer cimport DeviceBuffer
from cudf._lib.types import np_to_cudf_types, cudf_to_np_types
from cudf._lib.types cimport (
underlying_type_t_type_id,
dtype_from_column_view
dtype_from_column_view,
dtype_to_data_type
)
from cudf._lib.null_mask import bitmask_allocation_size_bytes

Expand Down Expand Up @@ -378,29 +379,12 @@ cdef class Column:
cdef column_view _view(self, libcudf_types.size_type null_count) except *:
if is_categorical_dtype(self.dtype):
col = self.base_children[0]
data_dtype = col.dtype
else:
col = self
data_dtype = self.dtype

data_dtype = col.dtype
cdef libcudf_types.type_id tid

if is_list_dtype(self.dtype):
tid = libcudf_types.type_id.LIST
elif is_struct_dtype(self.dtype):
tid = libcudf_types.type_id.STRUCT
elif is_decimal_dtype(self.dtype):
tid = libcudf_types.type_id.DECIMAL64
else:
tid = <libcudf_types.type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[np.dtype(data_dtype)]
)
)
cdef libcudf_types.data_type dtype = (
libcudf_types.data_type(tid, -self.dtype.scale)
if tid == libcudf_types.type_id.DECIMAL64
else libcudf_types.data_type(tid)
)
cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype)
codereport marked this conversation as resolved.
Show resolved Hide resolved
cdef libcudf_types.size_type offset = self.offset
cdef vector[column_view] children
cdef void* data
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/_lib/types.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ from libc.stdint cimport int32_t
from libcpp cimport bool
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view

cimport cudf._lib.cpp.types as libcudf_types

ctypedef bool underlying_type_t_order
ctypedef bool underlying_type_t_null_order
Expand All @@ -14,3 +14,5 @@ ctypedef int32_t underlying_type_t_type_id
ctypedef bool underlying_type_t_null_policy

cdef dtype_from_column_view(column_view cv)

cdef libcudf_types.data_type dtype_to_data_type(dtype) except *
21 changes: 19 additions & 2 deletions python/cudf/cudf/_lib/types.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ from cudf._lib.types cimport (
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
from cudf.core.dtypes import ListDtype, StructDtype, Decimal64Dtype
from cudf.utils.dtypes import is_decimal_dtype, is_list_dtype, is_struct_dtype

cimport cudf._lib.cpp.types as libcudf_types

Expand Down Expand Up @@ -192,8 +193,7 @@ cdef dtype_from_structs_column_view(column_view cv):

cdef dtype_from_decimal_column_view(column_view cv):
scale = -cv.type().scale()
precision = 18 # max of 64 bit integer
return Decimal64Dtype(precision=precision, scale=scale)
return Decimal64Dtype(precision=Decimal64Dtype.MAX_PRECISION, scale=scale)

cdef dtype_from_column_view(column_view cv):
cdef libcudf_types.type_id tid = cv.type().id()
Expand All @@ -208,3 +208,20 @@ cdef dtype_from_column_view(column_view cv):
"Use decimal64 instead")
else:
return cudf_to_np_types[<underlying_type_t_type_id>(tid)]

cdef libcudf_types.data_type dtype_to_data_type(dtype) except *:
if is_list_dtype(dtype):
tid = libcudf_types.type_id.LIST
elif is_struct_dtype(dtype):
tid = libcudf_types.type_id.STRUCT
elif is_decimal_dtype(dtype):
tid = libcudf_types.type_id.DECIMAL64
else:
tid = <libcudf_types.type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[np.dtype(dtype)]))

if tid == libcudf_types.type_id.DECIMAL64:
return libcudf_types.data_type(tid, -dtype.scale)
else:
return libcudf_types.data_type(tid)
18 changes: 17 additions & 1 deletion python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,11 @@ def to_arrow(self):
def binary_operator(self, op, other, reflect=False):
if reflect:
self, other = other, self
result = libcudf.binaryop.binaryop(self, other, op, "int32")
scale = _binop_scale(self.dtype, other.dtype, op)
output_type = Decimal64Dtype(
scale=scale, precision=18
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it make sense for the Decimal64Dtype constructor to default precision to 18?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So we can avoid the magic number 18 all over our code?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It helps for hooking into libcudf APIS, so as long as it doesn't cause issues in the Python code - sounds like a decent idea to me :)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kkraus14 any objections? 18 being the max precision the type can support. Note that this will deviate from PyArrow which requires the user to specify a precision explicitly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made the changes locally and it cleans up the code nicely. Will wait on @kkraus14 before I push

Copy link
Contributor

@shwina shwina Mar 8, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@codereport can we make MAX_PRECISION a class attribute and use that here instead?

In dtypes.py:

class DecimalDtype:
    MAX_PRECISION = 18

    # rest of class

Here:

Suggested change
scale=scale, precision=18
scale=scale, precision=cudf.Decimal64Dtype.MAX_PRECISION

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pushed the changes. Turns out we already had MAX_PRECISION 🤦

MAX_PRECISION = np.floor(np.log10(np.iinfo("int64").max))

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not seeing these changes, did you push them?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@codereport instead of hardcoding 18 here we should use the dtypes MAX_PRECISION attribute that we defined

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤦 sry, I changed 1 occurrence of 18, missed the 2nd. pushed now.

) # precision will be ignored, libcudf has no notion of precision
result = libcudf.binaryop.binaryop(self, other, op, output_type)
result.dtype.precision = _binop_precision(self.dtype, other.dtype, op)
return result

Expand All @@ -78,6 +82,18 @@ def as_string_column(
)


def _binop_scale(l_dtype, r_dtype, op):
# This should at some point be hooked up to libcudf's
# binary_operation_fixed_point_scale
s1, s2 = l_dtype.scale, r_dtype.scale
if op in ("add", "sub"):
return max(s1, s2)
elif op == "mul":
return s1 + s2
else:
raise NotImplementedError()


def _binop_precision(l_dtype, r_dtype, op):
"""
Returns the result precision when performing the
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ class Decimal64Dtype(ExtensionDtype):

name = "decimal"
_metadata = ("precision", "scale")
_MAX_PRECISION = np.floor(np.log10(np.iinfo("int64").max))
MAX_PRECISION = np.floor(np.log10(np.iinfo("int64").max))
codereport marked this conversation as resolved.
Show resolved Hide resolved

def __init__(self, precision, scale=0):
"""
Expand Down Expand Up @@ -303,10 +303,10 @@ def __hash__(self):

@classmethod
def _validate(cls, precision, scale=0):
if precision > Decimal64Dtype._MAX_PRECISION:
if precision > Decimal64Dtype.MAX_PRECISION:
raise ValueError(
f"Cannot construct a {cls.__name__}"
f" with precision > {cls._MAX_PRECISION}"
f" with precision > {cls.MAX_PRECISION}"
)
if abs(scale) > precision:
raise ValueError(f"scale={scale} exceeds precision={precision}")
Expand Down
7 changes: 5 additions & 2 deletions python/cudf/cudf/utils/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from pandas.core.dtypes.dtypes import CategoricalDtype, CategoricalDtypeType

import cudf
from cudf._lib.scalar import DeviceScalar, _is_null_host_scalar
from cudf._lib.scalar import DeviceScalar
from cudf.core._compat import PANDAS_GE_120

_NA_REP = "<NA>"
Expand Down Expand Up @@ -331,7 +331,10 @@ def to_cudf_compatible_scalar(val, dtype=None):

If `val` is None, returns None.
"""
if _is_null_host_scalar(val) or isinstance(val, cudf.Scalar):

if cudf._lib.scalar._is_null_host_scalar(val) or isinstance(
val, cudf.Scalar
):
return val

if not is_scalar(val):
Expand Down