From fff51b87da2152702c75d159f678f5d985bfeaec Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 8 Feb 2022 15:24:25 -0500 Subject: [PATCH 01/47] Fix strings handling of hex in regex pattern (#10220) Closes #10213 Fixes parsing logic for `\x` hex characters specified in a regex pattern. This also adds a gtest checking for all 127 possible matchable hex characters. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10220 --- cpp/src/strings/regex/regcomp.cpp | 8 ++++---- cpp/tests/strings/contains_tests.cpp | 29 +++++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 065c358d08b..6f36658523b 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -280,15 +280,15 @@ class regex_parser { yy = 0; if (a >= '0' && a <= '9') yy += (a - '0') << 4; - else if (a > 'a' && a <= 'f') + else if (a >= 'a' && a <= 'f') yy += (a - 'a' + 10) << 4; - else if (a > 'A' && a <= 'F') + else if (a >= 'A' && a <= 'F') yy += (a - 'A' + 10) << 4; if (b >= '0' && b <= '9') yy += b - '0'; - else if (b > 'a' && b <= 'f') + else if (b >= 'a' && b <= 'f') yy += b - 'a' + 10; - else if (b > 'A' && b <= 'F') + else if (b >= 'A' && b <= 'F') yy += b - 'A' + 10; break; } diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index a72ec61dd8f..bacd62ac86e 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -14,12 +14,12 @@ * limitations under the License. */ +#include #include #include #include #include #include -#include #include #include @@ -250,6 +250,33 @@ TEST_F(StringsContainsTests, OctalTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringsContainsTests, HexTest) +{ + std::vector ascii_chars( // all possible matchable chars + {thrust::make_counting_iterator(0), thrust::make_counting_iterator(127)}); + auto const count = static_cast(ascii_chars.size()); + std::vector offsets( + {thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + count + 1}); + auto d_chars = cudf::detail::make_device_uvector_sync(ascii_chars); + auto d_offsets = cudf::detail::make_device_uvector_sync(offsets); + auto input = cudf::make_strings_column(d_chars, d_offsets); + + auto strings_view = cudf::strings_column_view(input->view()); + for (auto ch : ascii_chars) { + std::stringstream str; + str << "\\x" << std::setfill('0') << std::setw(2) << std::hex << static_cast(ch); + std::string pattern = str.str(); + + auto results = cudf::strings::contains_re(strings_view, pattern); + // only one element in the input should match ch + auto true_dat = cudf::detail::make_counting_transform_iterator( + 0, [ch](auto idx) { return ch == static_cast(idx); }); + cudf::test::fixed_width_column_wrapper expected(true_dat, true_dat + count); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } +} + TEST_F(StringsContainsTests, EmbeddedNullCharacter) { std::vector data(10); From 6e267bdb7c40400fb43b3c32552be55cc4180f47 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 8 Feb 2022 14:43:25 -0800 Subject: [PATCH 02/47] Refactor Series.__array_ufunc__ (#10217) This PR is a first step in addressing #9083. 
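A minimal host-side sketch of the corrected hex-digit handling from the `regcomp.cpp` change in #10220 above (an illustration under assumptions, not the actual libcudf device code): the bug was `>` instead of `>=` on the lower bounds `'a'` and `'A'`, so the digits `a` and `A` themselves were silently dropped from `\x` escapes.

```cpp
constexpr int hex_digit(char c)
{
  if (c >= '0' && c <= '9') return c - '0';
  if (c >= 'a' && c <= 'f') return c - 'a' + 10;  // was `c > 'a'`: silently dropped 'a'
  if (c >= 'A' && c <= 'F') return c - 'A' + 10;  // was `c > 'A'`: silently dropped 'A'
  return 0;  // non-hex characters contribute nothing here
}

// A two-digit escape \xAB evaluates to (A << 4) + B.
constexpr int parse_hex_pair(char a, char b)
{
  return (hex_digit(a) << 4) + hex_digit(b);
}

static_assert(parse_hex_pair('a', '0') == 0xa0, "failed before the fix");
static_assert(parse_hex_pair('A', '2') == 0xA2, "failed before the fix");
static_assert(parse_hex_pair('4', '1') == 0x41, "unaffected digits");
```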
It rewrites Series ufunc dispatch to use a simpler dispatch pattern that removes lookup in the overall cudf namespace, restricting only to Series methods or cupy functions. The changes in this PR also enable proper support for indexing so that ufuncs between two Series objects will work correctly and preserve indexes. Additionally, ufuncs will now support Series that contain nulls, which was not previously the case. Series methods that don't exist in `pandas.Series` that were previously only necessary to support ufunc dispatch have now been removed. Once this PR is merged, I will work on generalizing this approach to also support DataFrame objects, which should enable full support for ufuncs as well as allowing us to deprecate significant chunks of existing code. For the cases that previously worked, this PR does have some performance implications. Any operator that dispatches to cupy is about 3x faster now (assuming no nulls, since the nullable case was not previously supported), with the exception of logical operators for which we previously defined functions in the Series namespace that do not have pandas analogs. I've made a note in the code that we could reintroduce internal versions of these just for ufunc dispatch if that slowdown becomes a bottleneck for users, but for now I would prefer to avoid any more special cases than we really need. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Christopher Harris (https://github.com/cwharris) - Charles Blackmon-Luca (https://github.com/charlesbluca) URL: https://github.com/rapidsai/cudf/pull/10217 --- python/cudf/cudf/core/frame.py | 18 ++ python/cudf/cudf/core/series.py | 136 +++++++++- python/cudf/cudf/testing/_utils.py | 2 +- python/cudf/cudf/tests/test_array_ufunc.py | 302 ++++++++++----------- python/cudf/cudf/utils/utils.py | 5 +- 5 files changed, 294 insertions(+), 169 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d55ab901b59..a9e425f2012 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -669,6 +669,12 @@ def get_column_values_na(col): matrix[:, i] = get_column_values_na(col) return matrix + # TODO: As of now, calling cupy.asarray is _much_ faster than calling + # to_cupy. We should investigate the reasons why and whether we can provide + # a more efficient method here by exploiting __cuda_array_interface__. In + # particular, we need to benchmark how much of the overhead is coming from + # (potentially unavoidable) local copies in to_cupy and how much comes from + # inefficiencies in the implementation. def to_cupy( self, dtype: Union[Dtype, None] = None, @@ -3622,6 +3628,8 @@ def dot(self, other, reflect=False): >>> [1, 2, 3, 4] @ s 10 """ + # TODO: This function does not currently support nulls. + # TODO: This function does not properly support misaligned indexes. lhs = self.values if isinstance(other, Frame): rhs = other.values @@ -3632,6 +3640,16 @@ def dot(self, other, reflect=False): ): rhs = cupy.asarray(other) else: + # TODO: This should raise an exception, not return NotImplemented, + # but __matmul__ relies on the current behavior. We should either + # move this implementation to __matmul__ and call it from here + # (checking for NotImplemented and raising NotImplementedError if + # that's what's returned), or __matmul__ should catch a + # NotImplementedError from here and return NotImplemented. 
The + # latter feels cleaner (putting the implementation in this method + # rather than in the operator) but will be slower in the (highly + # unlikely) case that we're multiplying a cudf object with another + # type of object that somehow supports this behavior. return NotImplemented if reflect: lhs, rhs = rhs, lhs diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 193facefb95..41bf75fb48f 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -7,6 +7,7 @@ import pickle import warnings from collections import abc as abc +from itertools import repeat from numbers import Number from shutil import get_terminal_size from typing import Any, MutableMapping, Optional, Set, Union @@ -958,14 +959,123 @@ def to_frame(self, name=None): def memory_usage(self, index=True, deep=False): return sum(super().memory_usage(index, deep).values()) + # For more detail on this function and how it should work, see + # https://numpy.org/doc/stable/reference/ufuncs.html def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - if method == "__call__": - return get_appropriate_dispatched_func( - cudf, cudf.Series, cupy, ufunc, inputs, kwargs - ) - else: + # We don't currently support reduction, accumulation, etc. We also + # don't support any special kwargs or higher arity ufuncs than binary. + if method != "__call__" or kwargs or ufunc.nin > 2: return NotImplemented + # Binary operations + binary_operations = { + # Arithmetic binary operations. + "add": "add", + "subtract": "sub", + "multiply": "mul", + "matmul": "matmul", + "divide": "truediv", + "true_divide": "truediv", + "floor_divide": "floordiv", + "power": "pow", + "float_power": "pow", + "remainder": "mod", + "mod": "mod", + "fmod": "mod", + # Bitwise binary operations. + "bitwise_and": "and", + "bitwise_or": "or", + "bitwise_xor": "xor", + # Comparison binary operators + "greater": "gt", + "greater_equal": "ge", + "less": "lt", + "less_equal": "le", + "not_equal": "ne", + "equal": "eq", + } + + # First look for methods of the class. + fname = ufunc.__name__ + if fname in binary_operations: + not_reflect = self is inputs[0] + other = inputs[not_reflect] + op = f"__{'' if not_reflect else 'r'}{binary_operations[fname]}__" + + # pandas bitwise operations return bools if indexes are misaligned. + # TODO: Generalize for other types of Frames + if ( + "bitwise" in fname + and isinstance(other, Series) + and not self.index.equals(other.index) + ): + return getattr(self, op)(other).astype(bool) + # Float_power returns float irrespective of the input type. + if fname == "float_power": + return getattr(self, op)(other).astype(float) + return getattr(self, op)(other) + + # Special handling for unary operations. + if fname == "negative": + return self * -1 + if fname == "positive": + return self.copy(deep=True) + if fname == "invert": + return ~self + if fname == "absolute": + return self.abs() + if fname == "fabs": + return self.abs().astype(np.float64) + + # Note: There are some operations that may be supported by libcudf but + # are not supported by pandas APIs. In particular, libcudf binary + # operations support logical and/or operations, but those operations + # are not defined on pd.Series/DataFrame. For now those operations will + # dispatch to cupy, but if ufuncs are ever a bottleneck we could add + # special handling to dispatch those (or any other) functions that we + # could implement without cupy. + + # Attempt to dispatch all other functions to cupy. 
+ cupy_func = getattr(cupy, fname) + if cupy_func: + # Indices must be aligned before converting to arrays. + if ufunc.nin == 2 and all(map(isinstance, inputs, repeat(Series))): + inputs = _align_indices(inputs, allow_non_unique=True) + index = inputs[0].index + else: + index = self.index + + cupy_inputs = [] + mask = None + for inp in inputs: + # TODO: Generalize for other types of Frames + if isinstance(inp, Series) and inp.has_nulls: + new_mask = as_column(inp.nullmask) + + # TODO: This is a hackish way to perform a bitwise and of + # bitmasks. Once we expose cudf::detail::bitwise_and, then + # we can use that instead. + mask = new_mask if mask is None else (mask & new_mask) + + # Arbitrarily fill with zeros. For ufuncs, we assume that + # the end result propagates nulls via a bitwise and, so + # these elements are irrelevant. + inp = inp.fillna(0) + cupy_inputs.append(cupy.asarray(inp)) + + cp_output = cupy_func(*cupy_inputs, **kwargs) + + def make_frame(arr): + return self.__class__._from_data( + {self.name: as_column(arr).set_mask(mask)}, index=index + ) + + if ufunc.nout > 1: + return tuple(make_frame(out) for out in cp_output) + return make_frame(cp_output) + + return NotImplemented + def __array_function__(self, func, types, args, kwargs): handled_types = [cudf.Series] for t in types: @@ -1257,15 +1367,31 @@ def _binaryop( ) def logical_and(self, other): + warnings.warn( + "Series.logical_and is deprecated and will be removed.", + FutureWarning, + ) return self._binaryop(other, "l_and").astype(np.bool_) def remainder(self, other): + warnings.warn( + "Series.remainder is deprecated and will be removed.", + FutureWarning, + ) return self._binaryop(other, "mod") def logical_or(self, other): + warnings.warn( + "Series.logical_or is deprecated and will be removed.", + FutureWarning, + ) return self._binaryop(other, "l_or").astype(np.bool_) def logical_not(self): + warnings.warn( + "Series.logical_not is deprecated and will be removed.", + FutureWarning, + ) return self._unaryop("not") @copy_docstring(CategoricalAccessor) # type: ignore diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 6c602d321eb..b97b2d660d6 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -46,7 +46,7 @@ def set_random_null_mask_inplace(series, null_probability=0.5, seed=None): probs = [null_probability, 1 - null_probability] rng = np.random.default_rng(seed=seed) mask = rng.choice([False, True], size=len(series), p=probs) - series[mask] = None + series.iloc[mask] = None # TODO: This function should be removed. 
Anywhere that it is being used should diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index 3fe0321ec54..2db0e8d29b8 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -1,175 +1,153 @@ +import operator +from functools import reduce + import cupy as cp import numpy as np -import pandas as pd import pytest import cudf -from cudf.testing._utils import assert_eq - - -@pytest.fixture -def np_ar_tup(): - np.random.seed(0) - return (np.random.random(100), np.random.random(100)) +from cudf.testing._utils import assert_eq, set_random_null_mask_inplace - -comparison_ops_ls = [ - np.greater, - np.greater_equal, - np.less, - np.less_equal, - np.equal, - np.not_equal, +_UFUNCS = [ + obj + for obj in (getattr(np, name) for name in dir(np)) + if isinstance(obj, np.ufunc) ] -@pytest.mark.parametrize( - "func", comparison_ops_ls + [np.subtract, np.fmod, np.power] -) -def test_ufunc_cudf_non_nullseries(np_ar_tup, func): - x, y = np_ar_tup[0], np_ar_tup[1] - s_1, s_2 = cudf.Series(x), cudf.Series(y) - expect = func(x, y) - got = func(s_1, s_2) - assert_eq(expect, got.to_numpy()) - - -@pytest.mark.parametrize( - "func", [np.bitwise_and, np.bitwise_or, np.bitwise_xor], -) -def test_ufunc_cudf_series_bitwise(func): - np.random.seed(0) - x = np.random.randint(size=100, low=0, high=100) - y = np.random.randint(size=100, low=0, high=100) - - s_1, s_2 = cudf.Series(x), cudf.Series(y) - expect = func(x, y) - got = func(s_1, s_2) - assert_eq(expect, got.to_numpy()) - - -@pytest.mark.parametrize( - "func", - [ - np.subtract, - np.multiply, - np.floor_divide, - np.true_divide, - np.power, - np.remainder, - np.divide, - ], -) -def test_ufunc_cudf_null_series(np_ar_tup, func): - x, y = np_ar_tup[0].astype(np.float32), np_ar_tup[1].astype(np.float32) - x[0] = np.nan - y[1] = np.nan - s_1, s_2 = cudf.Series(x), cudf.Series(y) - expect = func(x, y) - got = func(s_1, s_2) - assert_eq(expect, got.fillna(np.nan).to_numpy()) - - scalar = 0.5 - expect = func(x, scalar) - got = func(s_1, scalar) - assert_eq(expect, got.fillna(np.nan).to_numpy()) - - expect = func(scalar, x) - got = func(scalar, s_1) - assert_eq(expect, got.fillna(np.nan).to_numpy()) - - -@pytest.mark.xfail( - reason="""cuDF comparison operations with incorrectly - returns False rather than """ -) -@pytest.mark.parametrize( - "func", comparison_ops_ls, -) -def test_ufunc_cudf_null_series_comparison_ops(np_ar_tup, func): - x, y = np_ar_tup[0].astype(np.float32), np_ar_tup[1].astype(np.float32) - x[0] = np.nan - y[1] = np.nan - s_1, s_2 = cudf.Series(x), cudf.Series(y) - expect = func(x, y) - got = func(s_1, s_2) - assert_eq(expect, got.fillna(np.nan).to_numpy()) - - scalar = 0.5 - expect = func(x, scalar) - got = func(s_1, scalar) - assert_eq(expect, got.fillna(np.nan).to_numpy()) - - expect = func(scalar, x) - got = func(scalar, s_1) - assert_eq(expect, got.fillna(np.nan).to_numpy()) - - -@pytest.mark.parametrize( - "func", [np.logaddexp, np.fmax, np.fmod], -) -def test_ufunc_cudf_series_cupy_array(np_ar_tup, func): - x, y = np_ar_tup[0], np_ar_tup[1] - expect = func(x, y) - - cudf_s = cudf.Series(x) - cupy_ar = cp.array(y) - got = func(cudf_s, cupy_ar) - assert_eq(expect, got.to_numpy()) - - -@pytest.mark.parametrize( - "func", - [np.fmod, np.logaddexp, np.bitwise_and, np.bitwise_or, np.bitwise_xor], -) -def test_error_with_null_cudf_series(func): - s_1 = cudf.Series([1, 2]) - s_2 = cudf.Series([1, None]) - - # this thows a value error - # because of 
nulls in cudf.Series - with pytest.raises(ValueError): - func(s_1, s_2) - - s_1 = cudf.Series([1, 2]) - s_2 = cudf.Series([1, 2, None]) - - # this throws a value-error if indexes are not aligned - # following pandas behavior for ufunc numpy dispatching - with pytest.raises( - ValueError, match="Can only compare identically-labeled Series objects" - ): - func(s_1, s_2) - - -@pytest.mark.parametrize( - "func", [np.absolute, np.sign, np.exp2, np.tanh], -) -def test_ufunc_cudf_series_with_index(func): - data = [-1, 2, 3, 0] - index = [2, 3, 1, 0] - cudf_s = cudf.Series(data=data, index=index) - pd_s = pd.Series(data=data, index=index) - - expect = func(pd_s) - got = func(cudf_s) - - assert_eq(got, expect) - - -@pytest.mark.parametrize( - "func", [np.logaddexp2], -) -def test_ufunc_cudf_series_with_nonaligned_index(func): - cudf_s1 = cudf.Series(data=[-1, 2, 3, 0], index=[2, 3, 1, 0]) - cudf_s2 = cudf.Series(data=[-1, 2, 3, 0], index=[3, 1, 0, 2]) - - # this throws a value-error if indexes are not aligned - # following pandas behavior for ufunc numpy dispatching - with pytest.raises( - ValueError, match="Can only compare identically-labeled Series objects" +@pytest.mark.parametrize("ufunc", _UFUNCS) +@pytest.mark.parametrize("has_nulls", [True, False]) +@pytest.mark.parametrize("indexed", [True, False]) +def test_ufunc_series(ufunc, has_nulls, indexed): + # Note: This test assumes that all ufuncs are unary or binary. + fname = ufunc.__name__ + if indexed and fname in ( + "greater", + "greater_equal", + "less", + "less_equal", + "not_equal", + "equal", ): - func(cudf_s1, cudf_s2) + pytest.skip("Comparison operators do not support misaligned indexes.") + + if (indexed or has_nulls) and fname == "matmul": + pytest.xfail("Frame.dot currently does not support indexes or nulls") + + N = 100 + # Avoid zeros in either array to skip division by 0 errors. Also limit the + # scale to avoid issues with overflow, etc. We use ints because some + # operations (like bitwise ops) are not defined for floats. + pandas_args = args = [ + cudf.Series( + cp.random.randint(low=1, high=10, size=N), + index=cp.random.choice(range(N), N, False) if indexed else None, + ) + for _ in range(ufunc.nin) + ] + + if has_nulls: + # Converting nullable integer cudf.Series to pandas will produce a + # float pd.Series, so instead we replace nulls with an arbitrary + # integer value, precompute the mask, and then reapply it afterwards. + for arg in args: + set_random_null_mask_inplace(arg) + pandas_args = [arg.fillna(0) for arg in args] + + # Note: Different indexes must be aligned before the mask is computed. + # This requires using an internal function (_align_indices), and that + # is unlikely to change for the foreseeable future. + aligned = ( + cudf.core.series._align_indices(args, allow_non_unique=True) + if indexed and ufunc.nin == 2 + else args + ) + mask = reduce(operator.or_, (a.isna() for a in aligned)).to_pandas() + + try: + got = ufunc(*args) + except AttributeError as e: + # We xfail if we don't have an explicit dispatch and cupy doesn't have + # the method so that we can easily identify these methods. As of this + # writing, the only missing methods are isnat and heaviside. 
+ if "module 'cupy' has no attribute" in str(e): + pytest.xfail(reason="Operation not supported by cupy") + raise + + expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) + + try: + if ufunc.nout > 1: + for g, e in zip(got, expect): + if has_nulls: + e[mask] = np.nan + assert_eq(g, e) + else: + if has_nulls: + expect[mask] = np.nan + assert_eq(got, expect) + except AssertionError: + # TODO: This branch can be removed when + # https://github.com/rapidsai/cudf/issues/10178 is resolved + if fname in ("power", "float_power"): + not_equal = cudf.from_pandas(expect) != got + not_equal[got.isna()] = False + diffs = got[not_equal] - expect[not_equal.to_pandas()] + if diffs.abs().max() == 1: + pytest.xfail("https://github.com/rapidsai/cudf/issues/10178") + raise + + +@pytest.mark.parametrize("ufunc", [np.add, np.greater, np.logical_and]) +@pytest.mark.parametrize("has_nulls", [True, False]) +@pytest.mark.parametrize("indexed", [True, False]) +@pytest.mark.parametrize("type_", ["cupy", "numpy", "list"]) +def test_binary_ufunc_series_array(ufunc, has_nulls, indexed, type_): + fname = ufunc.__name__ + if fname == "greater" and has_nulls: + pytest.xfail( + "The way cudf casts nans in arrays to nulls during binops with " + "cudf objects is currently incompatible with pandas." + ) + N = 100 + # Avoid zeros in either array to skip division by 0 errors. Also limit the + # scale to avoid issues with overflow, etc. We use ints because some + # operations (like bitwise ops) are not defined for floats. + args = [ + cudf.Series( + cp.random.rand(N), + index=cp.random.choice(range(N), N, False) if indexed else None, + ) + for _ in range(ufunc.nin) + ] + + if has_nulls: + # Converting nullable integer cudf.Series to pandas will produce a + # float pd.Series, so instead we replace nulls with an arbitrary + # integer value, precompute the mask, and then reapply it afterwards. + for arg in args: + set_random_null_mask_inplace(arg) + + # Cupy doesn't support nulls, so we fill with nans before converting. + args[1] = args[1].fillna(cp.nan) + mask = args[0].isna().to_pandas() + + arg1 = args[1].to_cupy() if type_ == "cupy" else args[1].to_numpy() + if type_ == "list": + arg1 = arg1.tolist() + + got = ufunc(args[0], arg1) + expect = ufunc(args[0].to_pandas(), args[1].to_numpy()) + + if ufunc.nout > 1: + for g, e in zip(got, expect): + if has_nulls: + e[mask] = np.nan + assert_eq(g, e) + else: + if has_nulls: + expect[mask] = np.nan + assert_eq(got, expect) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 65a803d6768..8571d9ffed5 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -361,7 +361,10 @@ def get_appropriate_dispatched_func( cupy_compatible_args, index = _get_cupy_compatible_args_index(args) if cupy_compatible_args: cupy_output = cupy_func(*cupy_compatible_args, **kwargs) - return _cast_to_appropriate_cudf_type(cupy_output, index) + if isinstance(cupy_output, cp.ndarray): + return _cast_to_appropriate_cudf_type(cupy_output, index) + else: + return cupy_output return NotImplemented From 3fe168df2a2f664479546aa4ca7ba7267371127c Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 9 Feb 2022 09:08:09 -0500 Subject: [PATCH 03/47] Fix string to decimal128 conversion handling large exponents (#10231) Closes #10221 Fixes exponent handling for large exponents. 
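A minimal sketch of the exact, integer-based power-of-ten adjustment this fix moves to (an illustration under assumptions, not the actual device code in `fixed_point.cuh`; `__int128_t` assumes a GCC/Clang-style 128-bit extension):

```cpp
using int128 = __int128_t;  // assumption: compiler provides a 128-bit integer

// Exact 10^n for n >= 0; callers must keep the result in range.
constexpr int128 ipow10(int n)
{
  int128 r = 1;
  while (n-- > 0) { r *= 10; }
  return r;
}

// Move `value` from its parsed power-of-ten exponent to the target scale.
constexpr int128 shift_to_scale(int128 value, int exp_ten, int scale)
{
  int const n = exp_ten < scale ? scale - exp_ten : exp_ten - scale;
  // A 128-bit integer holds at most ~38 decimal digits, so a larger downward
  // shift can only yield zero (the patch gets the same effect by saturating
  // the divisor to the type's maximum).
  if (n > 38) { return exp_ten < scale ? int128{0} : value; }
  return exp_ten < scale ? value / ipow10(n) : value * ipow10(n);
}

// "0.05" parses to value 5 with exponent -2; at target scale -25 it becomes
// 5 * 10^23, with no double-precision rounding anywhere.
static_assert(shift_to_scale(5, -2, -25) == int128{5} * ipow10(23), "");
```

Because every step stays in integer arithmetic, the adjustment is exact wherever the result is representable, unlike a round-trip through `double`.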
Avoids using the `exp10` function for exponent adjusts since it only accepts and returns `double` which can become imprecise for large exponents. Also adds a gtest using the values from #10221. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Conor Hoekstra (https://github.com/codereport) - Paul Taylor (https://github.com/trxcllnt) URL: https://github.com/rapidsai/cudf/pull/10231 --- .../strings/detail/convert/fixed_point.cuh | 12 ++++++++---- cpp/tests/strings/fixed_point_tests.cpp | 18 +++++++++++++++++- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh index aa3f544202f..7af56f89449 100644 --- a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh +++ b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + #include #include @@ -150,9 +152,11 @@ __device__ DecimalType parse_decimal(char const* iter, char const* iter_end, int exp_ten += exp_offset; // shift the output value based on the exp_ten and the scale values - value = exp_ten < scale - ? value / static_cast(exp10(static_cast(scale - exp_ten))) - : value * static_cast(exp10(static_cast(exp_ten - scale))); + auto const shift_adjust = + abs(scale - exp_ten) > cuda::std::numeric_limits::digits10 + ? cuda::std::numeric_limits::max() + : numeric::detail::exp10(abs(scale - exp_ten)); + value = exp_ten < scale ? value / shift_adjust : value * shift_adjust; return static_cast(value) * (sign == 0 ? 1 : sign); } diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp index 5872a9e5bb7..81122b1c5d8 100644 --- a/cpp/tests/strings/fixed_point_tests.cpp +++ b/cpp/tests/strings/fixed_point_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -112,6 +112,22 @@ TEST_F(StringsConvertTest, ToFixedPointDecimal128) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } +TEST_F(StringsConvertTest, ToFixedPointLargeScale) +{ + using namespace numeric; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + auto const strings = cudf::test::strings_column_wrapper({"0.05", "0.06", "0.50", "5.01"}); + + auto const scale = scale_type{-25}; + auto const type = cudf::data_type{cudf::type_to_id(), scale}; + auto const results = cudf::strings::to_fixed_point(cudf::strings_column_view(strings), type); + + auto const expected = fp_wrapper{{5, 6, 50, 501}, scale_type{-2}}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); +} + TEST_F(StringsConvertTest, FromFixedPointDecimal128) { using namespace numeric; From 19dc46fdb068e8613b6e7e85686c622f92277d0f Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 9 Feb 2022 15:12:07 -0600 Subject: [PATCH 04/47] Upgrade `arrow` & `pyarrow` to `6.0.1` (#9686) Resolves: #9645 This PR upgrades `arrow` & `pyarrow` to `6.0.1` from `5.0.0`. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - https://github.com/nvdbaranec - AJ Schmidt (https://github.com/ajschmidt8) - Robert Maynard (https://github.com/robertmaynard) - Keith Kraus (https://github.com/kkraus14) - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/9686 --- conda/environments/cudf_dev_cuda11.5.yml | 6 +++--- conda/recipes/cudf/meta.yaml | 4 ++-- conda/recipes/libcudf/meta.yaml | 4 ++-- cpp/cmake/thirdparty/get_arrow.cmake | 4 ++-- python/cudf/cudf/core/column/column.py | 18 +++++------------- python/cudf/cudf/core/dataframe.py | 7 +++++++ python/cudf/cudf/core/frame.py | 6 +++++- 7 files changed, 26 insertions(+), 23 deletions(-) diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index b9577d937d9..b926a6cdc99 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. name: cudf_dev channels: @@ -17,7 +17,7 @@ dependencies: - numba>=0.54 - numpy - pandas>=1.0,<1.4.0dev0 - - pyarrow=5.0.0=*cuda + - pyarrow=6.0.1=*cuda - fastavro>=0.22.9 - python-snappy>=0.6.0 - notebook>=0.5.0 @@ -45,7 +45,7 @@ dependencies: - dask>=2021.11.1,<=2022.01.0 - distributed>=2021.11.1,<=2022.01.0 - streamz - - arrow-cpp=5.0.0 + - arrow-cpp=6.0.1 - dlpack>=0.5,<0.6.0a0 - arrow-cpp-proc * cuda - double-conversion diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index bd1412bc611..0145e2e4d01 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} @@ -31,7 +31,7 @@ requirements: - setuptools - numba >=0.54 - dlpack>=0.5,<0.6.0a0 - - pyarrow 5.0.0 *cuda + - pyarrow 6.0.1 *cuda - libcudf {{ version }} - rmm {{ minor_version }} - cudatoolkit {{ cuda_version }} diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 70c020d4abd..57205f22e57 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} @@ -40,7 +40,7 @@ requirements: host: - librmm {{ minor_version }}.* - cudatoolkit {{ cuda_version }}.* - - arrow-cpp 5.0.0 *cuda + - arrow-cpp 6.0.1 *cuda - arrow-cpp-proc * cuda - dlpack>=0.5,<0.6.0a0 run: diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index ae1448da502..83c5e4c3e8f 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -308,7 +308,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endfunction() -set(CUDF_VERSION_Arrow 5.0.0) +set(CUDF_VERSION_Arrow 6.0.1) find_and_configure_arrow( ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3} ${CUDF_ENABLE_ARROW_ORC} diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2c3951c0e5e..ad95c23e395 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2093,24 +2093,16 @@ def as_column( dtype = "bool" np_type = np.dtype(dtype).type pa_type = np_to_pa_dtype(np.dtype(dtype)) - # TODO: A warning is emitted from pyarrow 5.0.0's function - # pyarrow.lib._sequence_to_array: - # "DeprecationWarning: an integer is required (got type float). - # Implicit conversion to integers using __int__ is deprecated, - # and may be removed in a future version of Python." - # This warning does not appear in pyarrow 6.0.1 and will be - # resolved by https://github.com/rapidsai/cudf/pull/9686/. - with warnings.catch_warnings(): - warnings.simplefilter("ignore", DeprecationWarning) - pa_array = pa.array( + data = as_column( + pa.array( arbitrary, type=pa_type, from_pandas=True if nan_as_null is None else nan_as_null, - ) - data = as_column( - pa_array, dtype=dtype, nan_as_null=nan_as_null, + ), + dtype=dtype, + nan_as_null=nan_as_null, ) except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): if is_categorical_dtype(dtype): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d03cd6b1124..8a49efabcde 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4605,10 +4605,17 @@ def to_arrow(self, preserve_index=True): a: int64 b: int64 index: int64 + ---- + a: [[1,2,3]] + b: [[4,5,6]] + index: [[1,2,3]] >>> df.to_arrow(preserve_index=False) pyarrow.Table a: int64 b: int64 + ---- + a: [[1,2,3]] + b: [[4,5,6]] """ data = self.copy(deep=False) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index a9e425f2012..2c5606cb17f 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from __future__ import annotations @@ -1922,6 +1922,10 @@ def to_arrow(self): a: int64 b: int64 index: int64 + ---- + a: [[1,2,3]] + b: [[4,5,6]] + index: [[1,2,3]] """ return pa.Table.from_pydict( {name: col.to_arrow() for name, col in self._data.items()} From 37acb9b9399e139ceab019a645143f28ab03dd6e Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 9 Feb 2022 19:23:44 -0600 Subject: [PATCH 05/47] Remove redundant copies in `fillna` to improve performance (#10241) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes: #10207 This PR removes redundant copy being made in `fillna` and utilizes `nan_count` & `null_count` cached properties before making a decision to make a `copy` or perform a `fillna` operation. This results in **1.7x** speedup of a dataframe with shape `30000000 rows x 5 columns` (with 2 columns having nulls), this should ideally scale with the number of columns having nulls and their size too. ```python In [1]: import cudf In [2]: df = cudf.read_parquet("temp") ``` On `branch-22.04`: ```python In [3]: %timeit df.fillna(3) 51.4 ms ± 58.2 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each) ``` This PR: ```python In [3]: %timeit df.fillna(3) 30.4 ms ± 378 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) ``` ``` In [18]: df.fillna(3) Out[18]: a b c d e 0 1 a 0.00 1 1.2 1 2 b 0.12 2 3.0 2 3 c 10.12 3 2.3 3 1 a 0.00 1 1.2 4 2 b 0.12 2 3.0 ... .. .. ... .. ... 29999995 2 b 0.12 2 3.0 29999996 3 c 10.12 3 2.3 29999997 1 a 0.00 1 1.2 29999998 2 b 0.12 2 3.0 29999999 3 c 10.12 3 2.3 [30000000 rows x 5 columns] ``` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10241 --- python/cudf/cudf/core/column/column.py | 4 ++++ python/cudf/cudf/core/column/numerical.py | 4 ++++ python/cudf/cudf/core/frame.py | 26 +++++++++++++++-------- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index ad95c23e395..e5669fd44aa 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1167,6 +1167,10 @@ def corr(self, other: ColumnBase): def nans_to_nulls(self: T) -> T: return self + @property + def contains_na_entries(self) -> bool: + return self.null_count != 0 + def _process_for_reduction( self, skipna: bool = None, min_count: int = 0 ) -> Union[ColumnBase, ScalarLike]: diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 9b54c4d9acd..0e32c530e69 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -325,6 +325,10 @@ def nan_count(self) -> int: self._nan_count = nan_col.sum() return self._nan_count + @property + def contains_na_entries(self) -> bool: + return (self.nan_count != 0) or (self.null_count != 0) + def _process_values_for_isin( self, values: Sequence ) -> Tuple[ColumnBase, ColumnBase]: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 2c5606cb17f..8d8dd2a1bc0 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1244,7 +1244,7 @@ def fillna( value = value elif not isinstance(value, abc.Mapping): value = {name: copy.deepcopy(value) for name in self._data.names} - elif isinstance(value, abc.Mapping): + else: value = { key: value.reindex(self.index) if isinstance(value, cudf.Series) @@ -1252,18 +1252,26 @@ def fillna( for key, value in value.items() } - copy_data = self._data.copy(deep=True) - - for name in copy_data.keys(): + filled_data = {} + for col_name, col in self._data.items(): + if col_name in value and method is None: + replace_val = value[col_name] + else: + replace_val = None should_fill = ( - name in value - and not libcudf.scalar._is_null_host_scalar(value[name]) + col_name in value + and col.contains_na_entries + and not libcudf.scalar._is_null_host_scalar(replace_val) ) or method is not None if should_fill: - copy_data[name] = copy_data[name].fillna(value[name], method) - result = self._from_data(copy_data, self._index) + filled_data[col_name] = col.fillna(replace_val, method) + else: + filled_data[col_name] = col.copy(deep=True) - return self._mimic_inplace(result, inplace=inplace) + return self._mimic_inplace( + self._from_data(data=filled_data, index=self._index), + inplace=inplace, + ) def _drop_na_columns(self, how="any", subset=None, thresh=None): """ From 927ebcb1d11a8c217a8a26dd9553dc704c84b4fc Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 9 Feb 2022 19:30:40 -0600 Subject: [PATCH 06/47] Add 
`copyright` check in `cudf` (#10253) Fixes: #https://github.com/rapidsai/cudf/issues/9729 This PR adds copyright check to cudf, this is taken from https://github.com/rapidsai/cuml Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ray Douglass (https://github.com/raydouglass) - Vyas Ramasubramani (https://github.com/vyasr) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/10253 --- ci/checks/copyright.py | 233 +++++++++++++++++++++++++++++++++ ci/checks/gitutils.py | 286 +++++++++++++++++++++++++++++++++++++++++ ci/checks/style.sh | 19 ++- 3 files changed, 536 insertions(+), 2 deletions(-) create mode 100644 ci/checks/copyright.py create mode 100644 ci/checks/gitutils.py diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py new file mode 100644 index 00000000000..d72fd95fea3 --- /dev/null +++ b/ci/checks/copyright.py @@ -0,0 +1,233 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import datetime +import re +import argparse +import io +import os +import sys + +SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) + +# Add the scripts dir for gitutils +sys.path.append(os.path.normpath(SCRIPT_DIR)) + +# Now import gitutils. Ignore flake8 error here since there is no other way to +# set up imports +import gitutils # noqa: E402 + +FilesToCheck = [ + re.compile(r"[.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx)$"), + re.compile(r"CMakeLists[.]txt$"), + re.compile(r"CMakeLists_standalone[.]txt$"), + re.compile(r"setup[.]cfg$"), + re.compile(r"[.]flake8[.]cython$"), + re.compile(r"meta[.]yaml$") +] +ExemptFiles = [] + +# this will break starting at year 10000, which is probably OK :) +CheckSimple = re.compile( + r"Copyright *(?:\(c\))? *(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)") +CheckDouble = re.compile( + r"Copyright *(?:\(c\))? *(\d{4})-(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)" # noqa: E501 +) + + +def checkThisFile(f): + # This check covers things like symlinks which point to files that DNE + if not (os.path.exists(f)): + return False + if gitutils and gitutils.isFileEmpty(f): + return False + for exempt in ExemptFiles: + if exempt.search(f): + return False + for checker in FilesToCheck: + if checker.search(f): + return True + return False + + +def getCopyrightYears(line): + res = CheckSimple.search(line) + if res: + return (int(res.group(1)), int(res.group(1))) + res = CheckDouble.search(line) + if res: + return (int(res.group(1)), int(res.group(2))) + return (None, None) + + +def replaceCurrentYear(line, start, end): + # first turn a simple regex into double (if applicable). 
then update years + res = CheckSimple.sub(r"Copyright (c) \1-\1, NVIDIA CORPORATION", line) + res = CheckDouble.sub( + r"Copyright (c) {:04d}-{:04d}, NVIDIA CORPORATION".format(start, end), + res) + return res + + +def checkCopyright(f, update_current_year): + """ + Checks for copyright headers and their years + """ + errs = [] + thisYear = datetime.datetime.now().year + lineNum = 0 + crFound = False + yearMatched = False + with io.open(f, "r", encoding="utf-8") as fp: + lines = fp.readlines() + for line in lines: + lineNum += 1 + start, end = getCopyrightYears(line) + if start is None: + continue + crFound = True + if start > end: + e = [ + f, + lineNum, + "First year after second year in the copyright " + "header (manual fix required)", + None + ] + errs.append(e) + if thisYear < start or thisYear > end: + e = [ + f, + lineNum, + "Current year not included in the " + "copyright header", + None + ] + if thisYear < start: + e[-1] = replaceCurrentYear(line, thisYear, end) + if thisYear > end: + e[-1] = replaceCurrentYear(line, start, thisYear) + errs.append(e) + else: + yearMatched = True + fp.close() + # copyright header itself not found + if not crFound: + e = [ + f, + 0, + "Copyright header missing or formatted incorrectly " + "(manual fix required)", + None + ] + errs.append(e) + # even if the year matches a copyright header, make the check pass + if yearMatched: + errs = [] + + if update_current_year: + errs_update = [x for x in errs if x[-1] is not None] + if len(errs_update) > 0: + print("File: {}. Changing line(s) {}".format( + f, ', '.join(str(x[1]) for x in errs if x[-1] is not None))) + for _, lineNum, __, replacement in errs_update: + lines[lineNum - 1] = replacement + with io.open(f, "w", encoding="utf-8") as out_file: + for new_line in lines: + out_file.write(new_line) + errs = [x for x in errs if x[-1] is None] + + return errs + + +def getAllFilesUnderDir(root, pathFilter=None): + retList = [] + for (dirpath, dirnames, filenames) in os.walk(root): + for fn in filenames: + filePath = os.path.join(dirpath, fn) + if pathFilter(filePath): + retList.append(filePath) + return retList + + +def checkCopyright_main(): + """ + Checks for copyright headers in all the modified files. 
In case of local + repo, this script will just look for uncommitted files and in case of CI + it compares between branches "$PR_TARGET_BRANCH" and "current-pr-branch" + """ + retVal = 0 + global ExemptFiles + + argparser = argparse.ArgumentParser( + "Checks for a consistent copyright header in git's modified files") + argparser.add_argument("--update-current-year", + dest='update_current_year', + action="store_true", + required=False, + help="If set, " + "update the current year if a header is already " + "present and well formatted.") + argparser.add_argument("--git-modified-only", + dest='git_modified_only', + action="store_true", + required=False, + help="If set, " + "only files seen as modified by git will be " + "processed.") + + (args, dirs) = argparser.parse_known_args() + try: + ExemptFiles = [re.compile(file) for file in ExemptFiles] + except re.error as reException: + print("Regular expression error:") + print(reException) + return 1 + + if args.git_modified_only: + files = gitutils.modifiedFiles(pathFilter=checkThisFile) + else: + files = [] + for d in [os.path.abspath(d) for d in dirs]: + if not (os.path.isdir(d)): + raise ValueError(f"{d} is not a directory.") + files += getAllFilesUnderDir(d, pathFilter=checkThisFile) + + errors = [] + for f in files: + errors += checkCopyright(f, args.update_current_year) + + if len(errors) > 0: + print("Copyright headers incomplete in some of the files!") + for e in errors: + print(" %s:%d Issue: %s" % (e[0], e[1], e[2])) + print("") + n_fixable = sum(1 for e in errors if e[-1] is not None) + path_parts = os.path.abspath(__file__).split(os.sep) + file_from_repo = os.sep.join(path_parts[path_parts.index("ci"):]) + if n_fixable > 0: + print(("You can run `python {} --git-modified-only " + "--update-current-year` to fix {} of these " + "errors.\n").format(file_from_repo, n_fixable)) + retVal = 1 + else: + print("Copyright check passed") + + return retVal + + +if __name__ == "__main__": + import sys + sys.exit(checkCopyright_main()) \ No newline at end of file diff --git a/ci/checks/gitutils.py b/ci/checks/gitutils.py new file mode 100644 index 00000000000..0aea1d660cb --- /dev/null +++ b/ci/checks/gitutils.py @@ -0,0 +1,286 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import subprocess +import os +import re + + +def isFileEmpty(f): + return os.stat(f).st_size == 0 + + +def __git(*opts): + """Runs a git command and returns its output""" + cmd = "git " + " ".join(list(opts)) + ret = subprocess.check_output(cmd, shell=True) + return ret.decode("UTF-8").rstrip("\n") + + +def __gitdiff(*opts): + """Runs a git diff command with no pager set""" + return __git("--no-pager", "diff", *opts) + + +def branch(): + """Returns the name of the current branch""" + name = __git("rev-parse", "--abbrev-ref", "HEAD") + name = name.rstrip() + return name + + +def repo_version(): + """ + Determines the version of the repo by using `git describe` + + Returns + ------- + str + The full version of the repo in the format 'v#.#.#{a|b|rc}' + """ + return __git("describe", "--tags", "--abbrev=0") + + +def repo_version_major_minor(): + """ + Determines the version of the repo using `git describe` and returns only + the major and minor portion + + Returns + ------- + str + The partial version of the repo in the format '{major}.{minor}' + """ + + full_repo_version = repo_version() + + match = re.match(r"^v?(?P[0-9]+)(?:\.(?P[0-9]+))?", + full_repo_version) + + if (match is None): + print(" [DEBUG] Could not determine repo major minor version. " + f"Full repo version: {full_repo_version}.") + return None + + out_version = match.group("major") + + if (match.group("minor")): + out_version += "." + match.group("minor") + + return out_version + + +def determine_merge_commit(current_branch="HEAD"): + """ + When running outside of CI, this will estimate the target merge commit hash + of `current_branch` by finding a common ancester with the remote branch + 'branch-{major}.{minor}' where {major} and {minor} are determined from the + repo version. + + Parameters + ---------- + current_branch : str, optional + Which branch to consider as the current branch, by default "HEAD" + + Returns + ------- + str + The common commit hash ID + """ + + try: + # Try to determine the target branch from the most recent tag + head_branch = __git("describe", + "--all", + "--tags", + "--match='branch-*'", + "--abbrev=0") + except subprocess.CalledProcessError: + print(" [DEBUG] Could not determine target branch from most recent " + "tag. Falling back to 'branch-{major}.{minor}.") + head_branch = None + + if (head_branch is not None): + # Convert from head to branch name + head_branch = __git("name-rev", "--name-only", head_branch) + else: + # Try and guess the target branch as "branch-." + version = repo_version_major_minor() + + if (version is None): + return None + + head_branch = "branch-{}".format(version) + + try: + # Now get the remote tracking branch + remote_branch = __git("rev-parse", + "--abbrev-ref", + "--symbolic-full-name", + head_branch + "@{upstream}") + except subprocess.CalledProcessError: + print(" [DEBUG] Could not remote tracking reference for " + f"branch {head_branch}.") + remote_branch = None + + if (remote_branch is None): + return None + + print(f" [DEBUG] Determined TARGET_BRANCH as: '{remote_branch}'. " + "Finding common ancestor.") + + common_commit = __git("merge-base", remote_branch, current_branch) + + return common_commit + + +def uncommittedFiles(): + """ + Returns a list of all changed files that are not yet committed. This + means both untracked/unstaged as well as uncommitted files too. 
+    """
+    files = __git("status", "-u", "-s")
+    ret = []
+    for f in files.splitlines():
+        f = f.strip(" ")
+        f = re.sub("\s+", " ", f)  # noqa: W605
+        tmp = f.split(" ", 1)
+        # only consider staged files or uncommitted files
+        # in other words, ignore untracked files
+        if tmp[0] == "M" or tmp[0] == "A":
+            ret.append(tmp[1])
+    return ret
+
+
+def changedFilesBetween(baseName, branchName, commitHash):
+    """
+    Returns a list of files changed between branches baseName and latest commit
+    of branchName.
+    """
+    current = branch()
+    # checkout "base" branch
+    __git("checkout", "--force", baseName)
+    # checkout branch for comparing
+    __git("checkout", "--force", branchName)
+    # checkout latest commit from branch
+    __git("checkout", "-fq", commitHash)
+
+    files = __gitdiff("--name-only",
+                      "--ignore-submodules",
+                      f"{baseName}..{branchName}")
+
+    # restore the original branch
+    __git("checkout", "--force", current)
+    return files.splitlines()
+
+
+def changesInFileBetween(file, b1, b2, filter=None):
+    """Filters the changed lines to a file between the branches b1 and b2"""
+    current = branch()
+    __git("checkout", "--quiet", b1)
+    __git("checkout", "--quiet", b2)
+    diffs = __gitdiff("--ignore-submodules",
+                      "-w",
+                      "--minimal",
+                      "-U0",
+                      "%s...%s" % (b1, b2),
+                      "--",
+                      file)
+    __git("checkout", "--quiet", current)
+    lines = []
+    for line in diffs.splitlines():
+        if filter is None or filter(line):
+            lines.append(line)
+    return lines
+
+
+def modifiedFiles(pathFilter=None):
+    """
+    If inside a CI env (i.e. TARGET_BRANCH and COMMIT_HASH are defined, and the
+    current branch is "current-pr-branch"), then lists out all files modified
+    between these 2 branches. Locally, TARGET_BRANCH will try to be determined
+    from the current repo version and finding a corresponding branch named
+    'branch-{major}.{minor}'. If this fails, this function will list out all
+    the uncommitted files in the current branch.
+
+    Such a utility function is helpful when putting checker scripts into
+    cmake as well as into the CI process. This way, during development, only
+    the files touched (but not yet committed) by devs can be checked, while
+    during the CI process ALL files modified by the dev, as submitted in the
+    PR, will be checked, all using the same script.
+    """
+    targetBranch = os.environ.get("TARGET_BRANCH")
+    commitHash = os.environ.get("COMMIT_HASH")
+    currentBranch = branch()
+    print(
+        f" [DEBUG] TARGET_BRANCH={targetBranch}, COMMIT_HASH={commitHash}, "
+        f"currentBranch={currentBranch}")
+
+    if targetBranch and commitHash and (currentBranch == "current-pr-branch"):
+        print(" [DEBUG] Assuming a CI environment.")
+        allFiles = changedFilesBetween(targetBranch, currentBranch, commitHash)
+    else:
+        print(" [DEBUG] Did not detect CI environment. "
+              "Determining TARGET_BRANCH locally.")
+
+        common_commit = determine_merge_commit(currentBranch)
+
+        if (common_commit is not None):
+
+            # Now get the diff. Use --staged to get both the diff between
+            # common_commit..HEAD and any locally staged files
+            allFiles = __gitdiff("--name-only",
+                                 "--ignore-submodules",
+                                 "--staged",
+                                 f"{common_commit}").splitlines()
+        else:
+            # Fallback to just uncommitted files
+            allFiles = uncommittedFiles()
+
+    files = []
+    for f in allFiles:
+        if pathFilter is None or pathFilter(f):
+            files.append(f)
+
+    filesToCheckString = "\n\t".join(files) if files else ""
+    print(f" [DEBUG] Found files to check:\n\t{filesToCheckString}\n")
+    return files
+
+
+def listAllFilesInDir(folder):
+    """Utility function to list all files/subdirs in the input folder"""
+    allFiles = []
+    for root, dirs, files in os.walk(folder):
+        for name in files:
+            allFiles.append(os.path.join(root, name))
+    return allFiles
+
+
+def listFilesToCheck(filesDirs, filter=None):
+    """
+    Utility function to filter the input list of files/dirs based on the input
+    filter method and returns all the files that need to be checked
+    """
+    allFiles = []
+    for f in filesDirs:
+        if os.path.isfile(f):
+            if filter is None or filter(f):
+                allFiles.append(f)
+        elif os.path.isdir(f):
+            files = listAllFilesInDir(f)
+            for f_ in files:
+                if filter is None or filter(f_):
+                    allFiles.append(f_)
+    return allFiles
diff --git a/ci/checks/style.sh b/ci/checks/style.sh
index 9fb86b0b3c5..a7ad260758d 100755
--- a/ci/checks/style.sh
+++ b/ci/checks/style.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.
 #####################
 # cuDF Style Tester #
 #####################
@@ -19,9 +19,24 @@ export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.
 mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE})
 wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL}
 
+
 pre-commit run --hook-stage manual --all-files
 PRE_COMMIT_RETVAL=$?
 
+# Check for copyright headers in the files modified currently
+COPYRIGHT=`python ci/checks/copyright.py --git-modified-only 2>&1`
+CR_RETVAL=$?
+
+# Output results if failure otherwise show pass
+if [ "$CR_RETVAL" != "0" ]; then
+  echo -e "\n\n>>>> FAILED: copyright check; begin output\n\n"
+  echo -e "$COPYRIGHT"
+  echo -e "\n\n>>>> FAILED: copyright check; end output\n\n"
+else
+  echo -e "\n\n>>>> PASSED: copyright check\n\n"
+  echo -e "$COPYRIGHT"
+fi
+
 # Run clang-format and check for a consistent code format
 CLANG_FORMAT=`python cpp/scripts/run-clang-format.py 2>&1`
 CLANG_FORMAT_RETVAL=$?
@@ -40,7 +55,7 @@ HEADER_META_RETVAL=$?
 echo -e "$HEADER_META"
 
 RETVALS=(
-  $PRE_COMMIT_RETVAL $CLANG_FORMAT_RETVAL
+  $CR_RETVAL $PRE_COMMIT_RETVAL $CLANG_FORMAT_RETVAL
 )
 IFS=$'\n'
 RETVAL=`echo "${RETVALS[*]}" | sort -nr | head -n1`

From eb5e3e3f8d44e15bf2dd1597397721cf7c9aa64e Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <36027403+codereport@users.noreply.github.com>
Date: Wed, 9 Feb 2022 21:13:17 -0500
Subject: [PATCH 07/47] Remove `std::numeric_limits` specializations for
 timestamp & durations (#10239)

This resolves https://github.com/rapidsai/cudf/issues/9711

We currently inject specializations of `std::numeric_limits` for
[`timestamps`](https://github.com/rapidsai/cudf/blob/60380def2363a4bb5ab6b6059a1b0969c45ae2f6/cpp/include/cudf/wrappers/timestamps.hpp#L82-L107)
and [`durations`](https://github.com/rapidsai/cudf/blob/60380def2363a4bb5ab6b6059a1b0969c45ae2f6/cpp/include/cudf/wrappers/durations.hpp#L72-L97).
We should [not do this](https://eel.is/c++draft/numeric.limits.general#6).
This PR removes this hack.
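A minimal sketch of the pattern that replaces the removed specializations (compare the `DeviceMin`/`DeviceMax` changes below; `is_duration` is a local stand-in for cudf's own chrono trait and an assumption of this sketch):

```cpp
#include <chrono>
#include <cstdint>
#include <limits>
#include <type_traits>

// Stand-in trait: true for std::chrono::duration instantiations.
template <typename T>
struct is_duration : std::false_type {};
template <typename Rep, typename Period>
struct is_duration<std::chrono::duration<Rep, Period>> : std::true_type {};

template <typename T>
constexpr T max_identity()
{
  // chrono types have no std::numeric_limits specialization and carry their
  // own limits, so ask the type directly; numeric types are unchanged.
  if constexpr (is_duration<T>::value) {
    return T::max();
  } else {
    return std::numeric_limits<T>::max();
  }
}

static_assert(max_identity<int32_t>() == std::numeric_limits<int32_t>::max(), "");
static_assert(max_identity<std::chrono::seconds>() == std::chrono::seconds::max(), "");
```

The compile-time branch keeps `numeric_limits` for arithmetic types while chrono types report their own bounds, so no forbidden specialization is needed.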
To Do: * [x] timestamp * [x] duration Authors: - Conor Hoekstra (https://github.com/codereport) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Devavret Makkar (https://github.com/devavret) - David Wendt (https://github.com/davidwendt) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/10239 --- .../detail/utilities/device_operators.cuh | 6 +++- cpp/include/cudf/wrappers/durations.hpp | 33 +---------------- cpp/include/cudf/wrappers/timestamps.hpp | 31 +--------------- cpp/tests/copying/shift_tests.cpp | 36 ++++++++++++------- .../device_atomics/device_atomics_test.cu | 11 ++++-- 5 files changed, 39 insertions(+), 78 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index a59ad4c42ee..d31f85d6603 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -129,6 +129,8 @@ struct DeviceMin { !cudf::is_fixed_point()>* = nullptr> static constexpr T identity() { + // chrono types do not have std::numeric_limits specializations and should use T::max() + // https://eel.is/c++draft/numeric.limits.general#6 if constexpr (cudf::is_chrono()) return T::max(); return cuda::std::numeric_limits::max(); } @@ -171,6 +173,8 @@ struct DeviceMax { !cudf::is_fixed_point()>* = nullptr> static constexpr T identity() { + // chrono types do not have std::numeric_limits specializations and should use T::min() + // https://eel.is/c++draft/numeric.limits.general#6 if constexpr (cudf::is_chrono()) return T::min(); return cuda::std::numeric_limits::lowest(); } diff --git a/cpp/include/cudf/wrappers/durations.hpp b/cpp/include/cudf/wrappers/durations.hpp index 3a27d798487..62aa22c2788 100644 --- a/cpp/include/cudf/wrappers/durations.hpp +++ b/cpp/include/cudf/wrappers/durations.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,8 +16,6 @@ #pragma once -#include - #include namespace cudf { @@ -68,32 +66,3 @@ static_assert(sizeof(duration_ns) == sizeof(typename duration_ns::rep), ""); /** @} */ // end of group } // namespace cudf - -namespace std { -/** - * @brief Specialization of std::numeric_limits for cudf::detail::duration - * - * Pass through to return the limits of the underlying numeric representation. 
- */ -#define DURATION_LIMITS(TypeName) \ - template <> \ - struct numeric_limits { \ - static constexpr TypeName max() noexcept { return TypeName::max(); } \ - static constexpr TypeName lowest() noexcept \ - { \ - return TypeName(std::numeric_limits::lowest()); \ - } \ - static constexpr TypeName min() noexcept { return TypeName::min(); } \ - } - -DURATION_LIMITS(cudf::duration_D); -DURATION_LIMITS(cudf::duration_h); -DURATION_LIMITS(cudf::duration_m); -DURATION_LIMITS(cudf::duration_s); -DURATION_LIMITS(cudf::duration_ms); -DURATION_LIMITS(cudf::duration_us); -DURATION_LIMITS(cudf::duration_ns); - -#undef DURATION_LIMITS - -} // namespace std diff --git a/cpp/include/cudf/wrappers/timestamps.hpp b/cpp/include/cudf/wrappers/timestamps.hpp index 8481068ca05..5a4424112de 100644 --- a/cpp/include/cudf/wrappers/timestamps.hpp +++ b/cpp/include/cudf/wrappers/timestamps.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -78,32 +78,3 @@ static_assert(sizeof(timestamp_ns) == sizeof(typename timestamp_ns::rep), ""); /** @} */ // end of group } // namespace cudf - -namespace std { -/** - * @brief Specialization of std::numeric_limits for cudf::detail::timestamp - * - * Pass through to return the limits of the underlying numeric representation. - */ -#define TIMESTAMP_LIMITS(TypeName) \ - template <> \ - struct numeric_limits { \ - static constexpr TypeName max() noexcept { return TypeName::max(); } \ - static constexpr TypeName lowest() noexcept \ - { \ - return TypeName{TypeName::duration{std::numeric_limits::lowest()}}; \ - } \ - static constexpr TypeName min() noexcept { return TypeName::min(); } \ - } - -TIMESTAMP_LIMITS(cudf::timestamp_D); -TIMESTAMP_LIMITS(cudf::timestamp_h); -TIMESTAMP_LIMITS(cudf::timestamp_m); -TIMESTAMP_LIMITS(cudf::timestamp_s); -TIMESTAMP_LIMITS(cudf::timestamp_ms); -TIMESTAMP_LIMITS(cudf::timestamp_us); -TIMESTAMP_LIMITS(cudf::timestamp_ns); - -#undef TIMESTAMP_LIMITS - -} // namespace std diff --git a/cpp/tests/copying/shift_tests.cpp b/cpp/tests/copying/shift_tests.cpp index 210b4b8f90d..256f9129cbf 100644 --- a/cpp/tests/copying/shift_tests.cpp +++ b/cpp/tests/copying/shift_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
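A related subtlety explains why the shift-test helpers in the next hunk become `constexpr` functions instead of staying variable templates: with the injected specializations gone, `std::numeric_limits` still compiles for chrono types via the unspecialized primary template, but every member silently yields a zero value rather than a real limit. A small sketch of that failure mode, with an illustrative `secs` alias:

```cpp
#include <chrono>
#include <cstdint>
#include <limits>

using secs = std::chrono::duration<std::int32_t>;  // stand-in for cudf::duration_s

// Without an injected specialization, the primary template is used...
static_assert(!std::numeric_limits<secs>::is_specialized);
// ...and its members compile but return zero values, not usable limits:
static_assert(std::numeric_limits<secs>::lowest() == secs{});
static_assert(std::numeric_limits<secs>::max() == secs{});
```

Branching inside a function body (via `if constexpr (cudf::is_chrono<T>())`) sidesteps this silent wrong answer, which is exactly what the updated `highest()`/`lowest()` test helpers below do.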
@@ -54,10 +54,22 @@ std::unique_ptr make_scalar( } template -auto lowest = std::numeric_limits::lowest(); +constexpr auto highest() +{ + // chrono types do not have std::numeric_limits specializations and should use T::max() + // https://eel.is/c++draft/numeric.limits.general#6 + if constexpr (cudf::is_chrono()) return T::max(); + return std::numeric_limits::max(); +} template -auto highest = std::numeric_limits::max(); +constexpr auto lowest() +{ + // chrono types do not have std::numeric_limits specializations and should use T::min() + // https://eel.is/c++draft/numeric.limits.general#6 + if constexpr (cudf::is_chrono()) return T::min(); + return std::numeric_limits::lowest(); +} template struct ShiftTest : public cudf::test::BaseFixture { @@ -101,16 +113,16 @@ TYPED_TEST(ShiftTest, OneColumn) { using T = TypeParam; - auto input = fixed_width_column_wrapper{lowest, + auto input = fixed_width_column_wrapper{lowest(), cudf::test::make_type_param_scalar(1), cudf::test::make_type_param_scalar(2), cudf::test::make_type_param_scalar(3), cudf::test::make_type_param_scalar(4), cudf::test::make_type_param_scalar(5), - highest}; + highest()}; auto expected = fixed_width_column_wrapper{cudf::test::make_type_param_scalar(7), cudf::test::make_type_param_scalar(7), - lowest, + lowest(), cudf::test::make_type_param_scalar(1), cudf::test::make_type_param_scalar(2), cudf::test::make_type_param_scalar(3), @@ -126,16 +138,16 @@ TYPED_TEST(ShiftTest, OneColumnNegativeShift) { using T = TypeParam; - auto input = fixed_width_column_wrapper{lowest, + auto input = fixed_width_column_wrapper{lowest(), cudf::test::make_type_param_scalar(1), cudf::test::make_type_param_scalar(2), cudf::test::make_type_param_scalar(3), cudf::test::make_type_param_scalar(4), cudf::test::make_type_param_scalar(5), - highest}; + highest()}; auto expected = fixed_width_column_wrapper{cudf::test::make_type_param_scalar(4), cudf::test::make_type_param_scalar(5), - highest, + highest(), cudf::test::make_type_param_scalar(7), cudf::test::make_type_param_scalar(7), cudf::test::make_type_param_scalar(7), @@ -151,16 +163,16 @@ TYPED_TEST(ShiftTest, OneColumnNullFill) { using T = TypeParam; - auto input = fixed_width_column_wrapper{lowest, + auto input = fixed_width_column_wrapper{lowest(), cudf::test::make_type_param_scalar(5), cudf::test::make_type_param_scalar(0), cudf::test::make_type_param_scalar(3), cudf::test::make_type_param_scalar(0), cudf::test::make_type_param_scalar(1), - highest}; + highest()}; auto expected = fixed_width_column_wrapper({cudf::test::make_type_param_scalar(0), cudf::test::make_type_param_scalar(0), - lowest, + lowest(), cudf::test::make_type_param_scalar(5), cudf::test::make_type_param_scalar(0), cudf::test::make_type_param_scalar(3), diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index fddaa9d2050..fd065249c4e 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -129,8 +129,13 @@ struct AtomicsTest : public cudf::test::BaseFixture { thrust::host_vector result_init(9); // +3 padding for int8 tests result_init[0] = cudf::test::make_type_param_scalar(0); - result_init[1] = std::numeric_limits::max(); - result_init[2] = std::numeric_limits::min(); + if constexpr (cudf::is_chrono()) { + result_init[1] = T::max(); + result_init[2] = T::min(); + } else { + result_init[1] = std::numeric_limits::max(); + result_init[2] = std::numeric_limits::min(); + } result_init[3] = result_init[0]; result_init[4] = result_init[1]; result_init[5] = result_init[2]; From 6d162b4760a471f622d08e5e6179749f7a2807fa Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 10 Feb 2022 14:31:08 +0000 Subject: [PATCH 08/47] Bump hadoop-common from 3.1.0 to 3.1.4 in /java (#10259) Bumps hadoop-common from 3.1.0 to 3.1.4. [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.hadoop:hadoop-common&package-manager=maven&previous-version=3.1.0&new-version=3.1.4)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself) - `@dependabot use these labels` will set the current labels as the default for future PRs for this repo and language - `@dependabot use these reviewers` will set the current reviewers as the default for future PRs for this repo and language - `@dependabot use these assignees` will set the current assignees as the default for future PRs for this repo and language - `@dependabot use this milestone` will set the current milestone as the default for future PRs for this repo and language You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/rapidsai/cudf/network/alerts).
Authors: - https://github.com/apps/dependabot Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/10259 --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100755 => 100644 java/pom.xml diff --git a/java/pom.xml b/java/pom.xml old mode 100755 new mode 100644 index 8f0fb1000d8..02828a21e67 --- a/java/pom.xml +++ b/java/pom.xml @@ -147,7 +147,7 @@ org.apache.hadoop hadoop-common - 3.1.0 + 3.1.4 test From 2741e6bb2cd1dc0a10dce6fc6792cfdbbc0e2c4e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 10 Feb 2022 13:26:12 -0500 Subject: [PATCH 09/47] Remove probe-time null equality parameters in `cudf::hash_join` (#10260) Closes https://github.com/rapidsai/cudf/issues/9155 This PR removes the probe-time `cudf::null_equality` parameter in `cudf::hash_join` to avoid potential mismatching bugs between building and probing a hash join object. Authors: - Yunsong Wang (https://github.com/PointKernel) - Jason Lowe (https://github.com/jlowe) Approvers: - Conor Hoekstra (https://github.com/codereport) - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/10260 --- cpp/benchmarks/join/join.cu | 8 +- cpp/include/cudf/join.hpp | 20 +---- cpp/src/join/hash_join.cu | 76 +++++++--------- cpp/src/join/hash_join.cuh | 30 +++---- cpp/src/join/join.cu | 28 +++--- cpp/tests/join/join_tests.cpp | 20 ++--- java/src/main/java/ai/rapids/cudf/Table.java | 57 ++++-------- java/src/main/native/src/TableJni.cpp | 92 ++++++++------------ 8 files changed, 128 insertions(+), 203 deletions(-) diff --git a/cpp/benchmarks/join/join.cu b/cpp/benchmarks/join/join.cu index 55a1e524479..f21356aff02 100644 --- a/cpp/benchmarks/join/join.cu +++ b/cpp/benchmarks/join/join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -49,7 +49,7 @@ void nvbench_inner_join(nvbench::state& state, cudf::null_equality compare_nulls, rmm::cuda_stream_view stream) { cudf::hash_join hj_obj(left_input.select(left_on), compare_nulls, stream); - return hj_obj.inner_join(right_input.select(right_on), compare_nulls, std::nullopt, stream); + return hj_obj.inner_join(right_input.select(right_on), std::nullopt, stream); }; BM_join(state, join); @@ -71,7 +71,7 @@ void nvbench_left_join(nvbench::state& state, cudf::null_equality compare_nulls, rmm::cuda_stream_view stream) { cudf::hash_join hj_obj(left_input.select(left_on), compare_nulls, stream); - return hj_obj.left_join(right_input.select(right_on), compare_nulls, std::nullopt, stream); + return hj_obj.left_join(right_input.select(right_on), std::nullopt, stream); }; BM_join(state, join); @@ -93,7 +93,7 @@ void nvbench_full_join(nvbench::state& state, cudf::null_equality compare_nulls, rmm::cuda_stream_view stream) { cudf::hash_join hj_obj(left_input.select(left_on), compare_nulls, stream); - return hj_obj.full_join(right_input.select(right_on), compare_nulls, std::nullopt, stream); + return hj_obj.full_join(right_input.select(right_on), std::nullopt, stream); }; BM_join(state, join); diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index f6efea5f2bb..d56f8f0e904 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -530,7 +530,6 @@ class hash_join { * provided `output_size` is smaller than the actual output size. * * @param probe The probe table, from which the tuples are probed. - * @param compare_nulls Controls whether null join-key values should match or not. * @param output_size Optional value which allows users to specify the exact output size. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device @@ -543,7 +542,6 @@ class hash_join { std::pair>, std::unique_ptr>> inner_join(cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, std::optional output_size = {}, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; @@ -554,7 +552,6 @@ class hash_join { * provided `output_size` is smaller than the actual output size. * * @param probe The probe table, from which the tuples are probed. - * @param compare_nulls Controls whether null join-key values should match or not. * @param output_size Optional value which allows users to specify the exact output size. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device @@ -567,7 +564,6 @@ class hash_join { std::pair>, std::unique_ptr>> left_join(cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, std::optional output_size = {}, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; @@ -578,7 +574,6 @@ class hash_join { * provided `output_size` is smaller than the actual output size. * * @param probe The probe table, from which the tuples are probed. - * @param compare_nulls Controls whether null join-key values should match or not. * @param output_size Optional value which allows users to specify the exact output size. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device @@ -591,7 +586,6 @@ class hash_join { std::pair>, std::unique_ptr>> full_join(cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, std::optional output_size = {}, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; @@ -601,39 +595,32 @@ class hash_join { * probe table. * * @param probe The probe table, from which the tuples are probed. - * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches * * @return The exact number of output when performing an inner join between two tables with * `build` and `probe` as the the join keys . 
*/ [[nodiscard]] std::size_t inner_join_size( - cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; + cudf::table_view const& probe, rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * Returns the exact number of matches (rows) when performing a left join with the specified probe * table. * * @param probe The probe table, from which the tuples are probed. - * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches * * @return The exact number of output when performing a left join between two tables with `build` * and `probe` as the the join keys . */ [[nodiscard]] std::size_t left_join_size( - cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; + cudf::table_view const& probe, rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * Returns the exact number of matches (rows) when performing a full join with the specified probe * table. * * @param probe The probe table, from which the tuples are probed. - * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the intermediate table and columns' device * memory. @@ -643,7 +630,6 @@ class hash_join { */ std::size_t full_join_size( cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 7590c93f0c3..b89bcabf23e 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
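Translated to caller code, the tightened API reads roughly as follows. This is an illustrative sketch, not code from this patch; the table and stream arguments are assumed to exist:

```cpp
#include <cudf/join.hpp>
#include <cudf/table/table_view.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <optional>

// build_keys / probe_keys: key columns of the two tables being joined.
auto do_inner_join(cudf::table_view const& build_keys,
                   cudf::table_view const& probe_keys,
                   rmm::cuda_stream_view stream)
{
  // The null-equality policy is bound exactly once, when the hash table
  // over the build side is created...
  cudf::hash_join hj(build_keys, cudf::null_equality::EQUAL, stream);

  // ...so the probe call can no longer be handed a contradictory policy:
  return hj.inner_join(probe_keys, std::nullopt, stream);
}
```

Previously `inner_join` also accepted its own `null_equality` argument, so a caller could build with `EQUAL` and probe with `UNEQUAL`; that mismatch is now unrepresentable.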
@@ -140,8 +140,8 @@ probe_join_hash_table(cudf::table_device_view build_table, std::size_t get_full_join_size(cudf::table_device_view build_table, cudf::table_device_view probe_table, multimap_type const& hash_table, - bool has_nulls, - null_equality compare_nulls, + bool const has_nulls, + null_equality const compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -235,6 +235,7 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& build, null_equality compare_nulls, rmm::cuda_stream_view stream) : _is_empty{build.num_rows() == 0}, + _nulls_equal{compare_nulls}, _hash_table{compute_hash_table_size(build.num_rows()), std::numeric_limits::max(), cudf::detail::JoinNoneValue, @@ -253,50 +254,43 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& build, if (_is_empty) { return; } - build_join_hash_table(_build, _hash_table, compare_nulls, stream); + cudf::detail::build_join_hash_table(_build, _hash_table, _nulls_equal, stream); } std::pair>, std::unique_ptr>> hash_join::hash_join_impl::inner_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); - return compute_hash_join( - probe, compare_nulls, output_size, stream, mr); + return compute_hash_join(probe, output_size, stream, mr); } std::pair>, std::unique_ptr>> hash_join::hash_join_impl::left_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); - return compute_hash_join( - probe, compare_nulls, output_size, stream, mr); + return compute_hash_join(probe, output_size, stream, mr); } std::pair>, std::unique_ptr>> hash_join::hash_join_impl::full_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); - return compute_hash_join( - probe, compare_nulls, output_size, stream, mr); + return compute_hash_join(probe, output_size, stream, mr); } std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream) const { CUDF_FUNC_RANGE(); @@ -316,12 +310,11 @@ std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const& p *flattened_probe_table_ptr, _hash_table, cudf::has_nulls(flattened_probe_table) | cudf::has_nulls(_build), - compare_nulls, + _nulls_equal, stream); } std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream) const { CUDF_FUNC_RANGE(); @@ -341,12 +334,11 @@ std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& pr *flattened_probe_table_ptr, _hash_table, cudf::has_nulls(flattened_probe_table) | cudf::has_nulls(_build), - compare_nulls, + _nulls_equal, stream); } std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { @@ -362,20 +354,20 @@ std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& pr auto build_table_ptr = cudf::table_device_view::create(_build, stream); auto flattened_probe_table_ptr = cudf::table_device_view::create(flattened_probe_table, stream); - return get_full_join_size(*build_table_ptr, - 
*flattened_probe_table_ptr, - _hash_table, - cudf::has_nulls(flattened_probe_table) | cudf::has_nulls(_build), - compare_nulls, - stream, - mr); + return cudf::detail::get_full_join_size( + *build_table_ptr, + *flattened_probe_table_ptr, + _hash_table, + cudf::has_nulls(flattened_probe_table) | cudf::has_nulls(_build), + _nulls_equal, + stream, + mr); } template std::pair>, std::unique_ptr>> hash_join::hash_join_impl::compute_hash_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const @@ -403,42 +395,40 @@ hash_join::hash_join_impl::compute_hash_join(cudf::table_view const& probe, [](const auto& b, const auto& p) { return b.type() == p.type(); }), "Mismatch in joining column data types"); - return probe_join_indices( - flattened_probe_table, compare_nulls, output_size, stream, mr); + return probe_join_indices(flattened_probe_table, output_size, stream, mr); } template std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::probe_join_indices(cudf::table_view const& probe, - null_equality compare_nulls, +hash_join::hash_join_impl::probe_join_indices(cudf::table_view const& probe_table, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { // Trivial left join case - exit early if (_is_empty and JoinKind != cudf::detail::join_kind::INNER_JOIN) { - return get_trivial_left_join_indices(probe, stream, mr); + return get_trivial_left_join_indices(probe_table, stream, mr); } CUDF_EXPECTS(!_is_empty, "Hash table of hash join is null."); auto build_table_ptr = cudf::table_device_view::create(_build, stream); - auto probe_table_ptr = cudf::table_device_view::create(probe, stream); - - auto join_indices = - cudf::detail::probe_join_hash_table(*build_table_ptr, - *probe_table_ptr, - _hash_table, - cudf::has_nulls(probe) | cudf::has_nulls(_build), - compare_nulls, - output_size, - stream, - mr); + auto probe_table_ptr = cudf::table_device_view::create(probe_table, stream); + + auto join_indices = cudf::detail::probe_join_hash_table( + *build_table_ptr, + *probe_table_ptr, + _hash_table, + cudf::has_nulls(probe_table) | cudf::has_nulls(_build), + _nulls_equal, + output_size, + stream, + mr); if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN) { auto complement_indices = detail::get_left_join_indices_complement( - join_indices.second, probe.num_rows(), _build.num_rows(), stream, mr); + join_indices.second, probe_table.num_rows(), _build.num_rows(), stream, mr); join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); } return join_indices; diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index 21bfd8120f7..9c44aeebd59 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -89,7 +89,7 @@ class make_pair_function { * @param probe_table The left hand table * @param hash_table A hash table built on the build table that maps the index * of every row to the hash value of that row. - * @param compare_nulls Controls whether null join-key values should match or not. + * @param nulls_equal Flag to denote nulls are equal or not. 
* @param stream CUDA stream used for device memory operations and kernel launches * * @return The exact size of the output of the join operation @@ -98,8 +98,8 @@ template std::size_t compute_join_output_size(table_device_view build_table, table_device_view probe_table, multimap_type const& hash_table, - bool has_nulls, - null_equality compare_nulls, + bool const has_nulls, + cudf::null_equality const nulls_equal, rmm::cuda_stream_view stream) { const size_type build_table_num_rows{build_table.num_rows()}; @@ -121,7 +121,7 @@ std::size_t compute_join_output_size(table_device_view build_table, } auto const probe_nulls = cudf::nullate::DYNAMIC{has_nulls}; - pair_equality equality{probe_table, build_table, probe_nulls, compare_nulls}; + pair_equality equality{probe_table, build_table, probe_nulls, nulls_equal}; row_hash hash_probe{probe_nulls, probe_table}; auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); @@ -152,14 +152,14 @@ std::unique_ptr combine_table_pair(std::unique_ptr&& l * * @param build Table of columns used to build join hash. * @param hash_table Build hash table. - * @param compare_nulls Controls whether null join-key values should match or not. + * @param nulls_equal Flag to denote nulls are equal or not. * @param stream CUDA stream used for device memory operations and kernel launches. * */ template void build_join_hash_table(cudf::table_view const& build, MultimapType& hash_table, - null_equality compare_nulls, + null_equality const nulls_equal, rmm::cuda_stream_view stream) { auto build_table_ptr = cudf::table_device_view::create(build, stream); @@ -174,7 +174,7 @@ void build_join_hash_table(cudf::table_view const& build, auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); size_type const build_table_num_rows{build_table_ptr->num_rows()}; - if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { + if (nulls_equal == cudf::null_equality::EQUAL or (not nullable(build))) { hash_table.insert(iter, iter + build_table_num_rows, stream.value()); } else { thrust::counting_iterator stencil(0); @@ -197,7 +197,8 @@ struct hash_join::hash_join_impl { hash_join_impl& operator=(hash_join_impl&&) = delete; private: - bool _is_empty; + bool const _is_empty; + cudf::null_equality const _nulls_equal; cudf::table_view _build; std::vector> _created_null_columns; cudf::structs::detail::flattened_table _flattened_build_table; @@ -221,7 +222,6 @@ struct hash_join::hash_join_impl { std::pair>, std::unique_ptr>> inner_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; @@ -229,7 +229,6 @@ struct hash_join::hash_join_impl { std::pair>, std::unique_ptr>> left_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; @@ -237,21 +236,17 @@ struct hash_join::hash_join_impl { std::pair>, std::unique_ptr>> full_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; [[nodiscard]] std::size_t inner_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream) const; [[nodiscard]] std::size_t left_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream) const; std::size_t full_join_size(cudf::table_view const& probe, - null_equality 
compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; @@ -260,7 +255,6 @@ struct hash_join::hash_join_impl { std::pair>, std::unique_ptr>> compute_hash_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; @@ -276,7 +270,6 @@ struct hash_join::hash_join_impl { * @tparam JoinKind The type of join to be performed. * * @param probe_table Table of probe side columns to join. - * @param compare_nulls Controls whether null join-key values should match or not. * @param output_size Optional value which allows users to specify the exact output size. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned vectors. @@ -286,8 +279,7 @@ struct hash_join::hash_join_impl { template std::pair>, std::unique_ptr>> - probe_join_indices(cudf::table_view const& probe, - null_equality compare_nulls, + probe_join_indices(cudf::table_view const& probe_table, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index db79075d864..ef9e7867a2d 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,11 +51,11 @@ inner_join(table_view const& left_input, // build the hash map from the smaller table. if (right.num_rows() > left.num_rows()) { cudf::hash_join hj_obj(left, compare_nulls, stream); - auto result = hj_obj.inner_join(right, compare_nulls, std::nullopt, stream, mr); + auto result = hj_obj.inner_join(right, std::nullopt, stream, mr); return std::make_pair(std::move(result.second), std::move(result.first)); } else { cudf::hash_join hj_obj(right, compare_nulls, stream); - return hj_obj.inner_join(left, compare_nulls, std::nullopt, stream, mr); + return hj_obj.inner_join(left, std::nullopt, stream, mr); } } @@ -113,7 +113,7 @@ left_join(table_view const& left_input, table_view const right = matched.second.back(); cudf::hash_join hj_obj(right, compare_nulls, stream); - return hj_obj.left_join(left, compare_nulls, std::nullopt, stream, mr); + return hj_obj.left_join(left, std::nullopt, stream, mr); } std::unique_ptr left_join(table_view const& left_input, @@ -176,7 +176,7 @@ full_join(table_view const& left_input, table_view const right = matched.second.back(); cudf::hash_join hj_obj(right, compare_nulls, stream); - return hj_obj.full_join(left, compare_nulls, std::nullopt, stream, mr); + return hj_obj.full_join(left, std::nullopt, stream, mr); } std::unique_ptr
full_join(table_view const& left_input, @@ -234,56 +234,50 @@ hash_join::hash_join(cudf::table_view const& build, std::pair>, std::unique_ptr>> hash_join::inner_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - return impl->inner_join(probe, compare_nulls, output_size, stream, mr); + return impl->inner_join(probe, output_size, stream, mr); } std::pair>, std::unique_ptr>> hash_join::left_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - return impl->left_join(probe, compare_nulls, output_size, stream, mr); + return impl->left_join(probe, output_size, stream, mr); } std::pair>, std::unique_ptr>> hash_join::full_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - return impl->full_join(probe, compare_nulls, output_size, stream, mr); + return impl->full_join(probe, output_size, stream, mr); } std::size_t hash_join::inner_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream) const { - return impl->inner_join_size(probe, compare_nulls, stream); + return impl->inner_join_size(probe, stream); } std::size_t hash_join::left_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream) const { - return impl->left_join_size(probe, compare_nulls, stream); + return impl->left_join_size(probe, stream); } std::size_t hash_join::full_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - return impl->full_join_size(probe, compare_nulls, stream, mr); + return impl->full_join_size(probe, stream, mr); } // external APIs diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index e6ae709f009..57041e448a2 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
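The join tests that follow also exercise passing a precomputed row count back into the join through the optional `output_size` parameter, which pairs with the `*_join_size` methods. Roughly, the intended pattern looks like this sketch (the helper name and arguments are illustrative):

```cpp
#include <cudf/join.hpp>
#include <cudf/table/table_view.hpp>

#include <rmm/cuda_stream_view.hpp>

auto sized_inner_join(cudf::hash_join const& hj,
                      cudf::table_view const& probe_keys,
                      rmm::cuda_stream_view stream)
{
  // One explicit sizing pass...
  auto const output_size = hj.inner_join_size(probe_keys, stream);

  // ...whose result is handed back so the join can use the exact output
  // size instead of recomputing it:
  return hj.inner_join(probe_keys, output_size, stream);
}
```

Because both calls consult the null-equality policy stored in the `hash_join` object, the size computed here is always consistent with the join that consumes it, which is the mismatch this PR closes off.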
@@ -1004,7 +1004,7 @@ TEST_F(JoinTest, EmptyRightTableInnerJoin) std::size_t const size_gold = 0; EXPECT_EQ(output_size, size_gold); - auto result = hash_join.inner_join(t0, cudf::null_equality::EQUAL, optional_size); + auto result = hash_join.inner_join(t0, optional_size); column_wrapper col_gold_0{}; column_wrapper col_gold_1{}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1043,7 +1043,7 @@ TEST_F(JoinTest, EmptyRightTableLeftJoin) std::size_t const size_gold = 5; EXPECT_EQ(output_size, size_gold); - auto result = hash_join.left_join(t0, cudf::null_equality::EQUAL, optional_size); + auto result = hash_join.left_join(t0, optional_size); column_wrapper col_gold_0{{0, 1, 2, 3, 4}}; column_wrapper col_gold_1{{NoneValue, NoneValue, NoneValue, NoneValue, NoneValue}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1082,7 +1082,7 @@ TEST_F(JoinTest, EmptyRightTableFullJoin) std::size_t const size_gold = 5; EXPECT_EQ(output_size, size_gold); - auto result = hash_join.full_join(t0, cudf::null_equality::EQUAL, optional_size); + auto result = hash_join.full_join(t0, optional_size); column_wrapper col_gold_0{{0, 1, 2, 3, 4}}; column_wrapper col_gold_1{{NoneValue, NoneValue, NoneValue, NoneValue, NoneValue}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1310,7 +1310,7 @@ TEST_F(JoinTest, HashJoinSequentialProbes) std::size_t const size_gold = 9; EXPECT_EQ(output_size, size_gold); - auto result = hash_join.full_join(t0, cudf::null_equality::EQUAL, optional_size); + auto result = hash_join.full_join(t0, optional_size); column_wrapper col_gold_0{{NoneValue, NoneValue, NoneValue, NoneValue, 4, 0, 1, 2, 3}}; column_wrapper col_gold_1{{0, 1, 2, 3, 4, NoneValue, NoneValue, NoneValue, NoneValue}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1330,7 +1330,7 @@ TEST_F(JoinTest, HashJoinSequentialProbes) std::size_t const size_gold = 5; EXPECT_EQ(output_size, size_gold); - auto result = hash_join.left_join(t0, cudf::null_equality::EQUAL, optional_size); + auto result = hash_join.left_join(t0, optional_size); column_wrapper col_gold_0{{0, 1, 2, 3, 4}}; column_wrapper col_gold_1{{NoneValue, NoneValue, NoneValue, NoneValue, 4}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1350,7 +1350,7 @@ TEST_F(JoinTest, HashJoinSequentialProbes) std::size_t const size_gold = 3; EXPECT_EQ(output_size, size_gold); - auto result = hash_join.inner_join(t0, cudf::null_equality::EQUAL, optional_size); + auto result = hash_join.inner_join(t0, optional_size); column_wrapper col_gold_0{{2, 4, 0}}; column_wrapper col_gold_1{{1, 1, 4}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1390,7 +1390,7 @@ TEST_F(JoinTest, HashJoinWithStructsAndNulls) { auto output_size = hash_join.left_join_size(t0); EXPECT_EQ(5, output_size); - auto result = hash_join.left_join(t0, cudf::null_equality::EQUAL, output_size); + auto result = hash_join.left_join(t0, output_size); column_wrapper col_gold_0{{0, 1, 2, 3, 4}}; column_wrapper col_gold_1{{0, NoneValue, 2, NoneValue, NoneValue}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1400,7 +1400,7 @@ TEST_F(JoinTest, HashJoinWithStructsAndNulls) { auto output_size = hash_join.inner_join_size(t0); EXPECT_EQ(2, output_size); - auto 
result = hash_join.inner_join(t0, cudf::null_equality::EQUAL, output_size); + auto result = hash_join.inner_join(t0, output_size); column_wrapper col_gold_0{{0, 2}}; column_wrapper col_gold_1{{0, 2}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1410,7 +1410,7 @@ TEST_F(JoinTest, HashJoinWithStructsAndNulls) { auto output_size = hash_join.full_join_size(t0); EXPECT_EQ(8, output_size); - auto result = hash_join.full_join(t0, cudf::null_equality::EQUAL, output_size); + auto result = hash_join.full_join(t0, output_size); column_wrapper col_gold_0{{NoneValue, NoneValue, NoneValue, 0, 1, 2, 3, 4}}; column_wrapper col_gold_1{{1, 3, 4, 0, NoneValue, 2, NoneValue, NoneValue}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index bb0321d0a16..17e10933b65 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -539,14 +539,11 @@ private static native long[] leftJoin(long leftTable, int[] leftJoinCols, long r private static native long[] leftJoinGatherMaps(long leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; - private static native long leftJoinRowCount(long leftTable, long rightHashJoin, - boolean nullsEqual) throws CudfException; + private static native long leftJoinRowCount(long leftTable, long rightHashJoin) throws CudfException; - private static native long[] leftHashJoinGatherMaps(long leftTable, long rightHashJoin, - boolean nullsEqual) throws CudfException; + private static native long[] leftHashJoinGatherMaps(long leftTable, long rightHashJoin) throws CudfException; private static native long[] leftHashJoinGatherMapsWithCount(long leftTable, long rightHashJoin, - boolean nullsEqual, long outputRowCount) throws CudfException; private static native long[] innerJoin(long leftTable, int[] leftJoinCols, long rightTable, @@ -555,14 +552,11 @@ private static native long[] innerJoin(long leftTable, int[] leftJoinCols, long private static native long[] innerJoinGatherMaps(long leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; - private static native long innerJoinRowCount(long table, long hashJoin, - boolean nullsEqual) throws CudfException; + private static native long innerJoinRowCount(long table, long hashJoin) throws CudfException; - private static native long[] innerHashJoinGatherMaps(long table, long hashJoin, - boolean nullsEqual) throws CudfException; + private static native long[] innerHashJoinGatherMaps(long table, long hashJoin) throws CudfException; private static native long[] innerHashJoinGatherMapsWithCount(long table, long hashJoin, - boolean nullsEqual, long outputRowCount) throws CudfException; private static native long[] fullJoin(long leftTable, int[] leftJoinCols, long rightTable, @@ -571,14 +565,11 @@ private static native long[] fullJoin(long leftTable, int[] leftJoinCols, long r private static native long[] fullJoinGatherMaps(long leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; - private static native long fullJoinRowCount(long leftTable, long rightHashJoin, - boolean nullsEqual) throws CudfException; + private static native long fullJoinRowCount(long leftTable, long rightHashJoin) throws CudfException; - private static native long[] fullHashJoinGatherMaps(long leftTable, long rightHashJoin, - boolean nullsEqual) throws CudfException; + 
private static native long[] fullHashJoinGatherMaps(long leftTable, long rightHashJoin) throws CudfException; private static native long[] fullHashJoinGatherMapsWithCount(long leftTable, long rightHashJoin, - boolean nullsEqual, long outputRowCount) throws CudfException; private static native long[] leftSemiJoin(long leftTable, int[] leftJoinCols, long rightTable, @@ -2318,8 +2309,7 @@ public long leftJoinRowCount(HashJoin rightHash) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - return leftJoinRowCount(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls()); + return leftJoinRowCount(getNativeView(), rightHash.getNativeView()); } /** @@ -2337,9 +2327,7 @@ public GatherMap[] leftJoinGatherMaps(HashJoin rightHash) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - long[] gatherMapData = - leftHashJoinGatherMaps(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls()); + long[] gatherMapData = leftHashJoinGatherMaps(getNativeView(), rightHash.getNativeView()); return buildJoinGatherMaps(gatherMapData); } @@ -2363,9 +2351,8 @@ public GatherMap[] leftJoinGatherMaps(HashJoin rightHash, long outputRowCount) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - long[] gatherMapData = - leftHashJoinGatherMapsWithCount(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls(), outputRowCount); + long[] gatherMapData = leftHashJoinGatherMapsWithCount(getNativeView(), + rightHash.getNativeView(), outputRowCount); return buildJoinGatherMaps(gatherMapData); } @@ -2545,8 +2532,7 @@ public long innerJoinRowCount(HashJoin otherHash) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "otherKeys: " + otherHash.getNumberOfColumns()); } - return innerJoinRowCount(getNativeView(), otherHash.getNativeView(), - otherHash.getCompareNulls()); + return innerJoinRowCount(getNativeView(), otherHash.getNativeView()); } /** @@ -2564,9 +2550,7 @@ public GatherMap[] innerJoinGatherMaps(HashJoin rightHash) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - long[] gatherMapData = - innerHashJoinGatherMaps(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls()); + long[] gatherMapData = innerHashJoinGatherMaps(getNativeView(), rightHash.getNativeView()); return buildJoinGatherMaps(gatherMapData); } @@ -2590,9 +2574,8 @@ public GatherMap[] innerJoinGatherMaps(HashJoin rightHash, long outputRowCount) throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - long[] gatherMapData = - innerHashJoinGatherMapsWithCount(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls(), outputRowCount); + long[] gatherMapData = innerHashJoinGatherMapsWithCount(getNativeView(), + rightHash.getNativeView(), outputRowCount); return buildJoinGatherMaps(gatherMapData); } @@ -2778,8 +2761,7 @@ public long fullJoinRowCount(HashJoin rightHash) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - return fullJoinRowCount(getNativeView(), rightHash.getNativeView(), - 
rightHash.getCompareNulls()); + return fullJoinRowCount(getNativeView(), rightHash.getNativeView()); } /** @@ -2797,9 +2779,7 @@ public GatherMap[] fullJoinGatherMaps(HashJoin rightHash) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - long[] gatherMapData = - fullHashJoinGatherMaps(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls()); + long[] gatherMapData = fullHashJoinGatherMaps(getNativeView(), rightHash.getNativeView()); return buildJoinGatherMaps(gatherMapData); } @@ -2823,9 +2803,8 @@ public GatherMap[] fullJoinGatherMaps(HashJoin rightHash, long outputRowCount) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - long[] gatherMapData = - fullHashJoinGatherMapsWithCount(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls(), outputRowCount); + long[] gatherMapData = fullHashJoinGatherMapsWithCount(getNativeView(), + rightHash.getNativeView(), outputRowCount); return buildJoinGatherMaps(gatherMapData); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index aeac1856db0..eac76222475 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -812,15 +812,14 @@ jlongArray join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_keys, // a hash table built from the join's right table. template jlongArray hash_join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_hash_join, - jboolean compare_nulls_equal, T join_func) { + T join_func) { JNI_NULL_CHECK(env, j_left_keys, "left table is null", NULL); JNI_NULL_CHECK(env, j_right_hash_join, "hash join is null", NULL); try { cudf::jni::auto_set_device(env); auto left_keys = reinterpret_cast(j_left_keys); auto hash_join = reinterpret_cast(j_right_hash_join); - auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - return gather_maps_to_java(env, join_func(*left_keys, *hash_join, nulleq)); + return gather_maps_to_java(env, join_func(*left_keys, *hash_join)); } CATCH_STD(env, NULL); } @@ -2172,41 +2171,36 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftJoinGatherMaps( JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_leftJoinRowCount(JNIEnv *env, jclass, jlong j_left_table, - jlong j_right_hash_join, - jboolean compare_nulls_equal) { + jlong j_right_hash_join) { JNI_NULL_CHECK(env, j_left_table, "left table is null", 0); JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0); try { cudf::jni::auto_set_device(env); auto left_table = reinterpret_cast(j_left_table); auto hash_join = reinterpret_cast(j_right_hash_join); - auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - auto row_count = hash_join->left_join_size(*left_table, nulleq); + auto row_count = hash_join->left_join_size(*left_table); return static_cast(row_count); } CATCH_STD(env, 0); } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftHashJoinGatherMaps( - JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, - jboolean compare_nulls_equal) { + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join) { return cudf::jni::hash_join_gather_maps( - env, j_left_table, j_right_hash_join, compare_nulls_equal, - [](cudf::table_view const &left, cudf::hash_join const &hash, cudf::null_equality nulleq) { - return hash.left_join(left, nulleq); + env, j_left_table, j_right_hash_join, + [](cudf::table_view const &left, cudf::hash_join const &hash) { + return hash.left_join(left); }); } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftHashJoinGatherMapsWithCount( - JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jboolean compare_nulls_equal, - jlong j_output_row_count) { + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jlong j_output_row_count) { auto output_row_count = static_cast(j_output_row_count); - return cudf::jni::hash_join_gather_maps(env, j_left_table, j_right_hash_join, compare_nulls_equal, - [output_row_count](cudf::table_view const &left, - cudf::hash_join const &hash, - cudf::null_equality nulleq) { - return hash.left_join(left, nulleq, output_row_count); - }); + return cudf::jni::hash_join_gather_maps( + env, j_left_table, j_right_hash_join, + [output_row_count](cudf::table_view const &left, cudf::hash_join const &hash) { + return hash.left_join(left, output_row_count); + }); } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalLeftJoinRowCount(JNIEnv *env, jclass, @@ -2305,41 +2299,36 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerJoinGatherMaps( JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_innerJoinRowCount(JNIEnv *env, jclass, jlong j_left_table, - jlong j_right_hash_join, - jboolean compare_nulls_equal) { + jlong j_right_hash_join) { JNI_NULL_CHECK(env, j_left_table, "left table is null", 0); JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0); try { cudf::jni::auto_set_device(env); auto left_table = reinterpret_cast(j_left_table); auto hash_join = reinterpret_cast(j_right_hash_join); - auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - auto row_count = hash_join->inner_join_size(*left_table, nulleq); + auto row_count = hash_join->inner_join_size(*left_table); return static_cast(row_count); } CATCH_STD(env, 0); } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerHashJoinGatherMaps( - JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, - jboolean compare_nulls_equal) { + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join) { return cudf::jni::hash_join_gather_maps( - env, j_left_table, j_right_hash_join, compare_nulls_equal, - [](cudf::table_view const &left, cudf::hash_join const &hash, cudf::null_equality nulleq) { - return hash.inner_join(left, nulleq); + env, j_left_table, j_right_hash_join, + [](cudf::table_view const &left, cudf::hash_join const &hash) { + return hash.inner_join(left); }); } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerHashJoinGatherMapsWithCount( - JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jboolean compare_nulls_equal, - jlong j_output_row_count) { + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jlong j_output_row_count) { auto output_row_count = static_cast(j_output_row_count); - return cudf::jni::hash_join_gather_maps(env, j_left_table, j_right_hash_join, compare_nulls_equal, - [output_row_count](cudf::table_view const &left, - cudf::hash_join const &hash, - cudf::null_equality nulleq) { - return hash.inner_join(left, nulleq, output_row_count); - }); + return cudf::jni::hash_join_gather_maps( + env, j_left_table, j_right_hash_join, + [output_row_count](cudf::table_view const &left, cudf::hash_join const &hash) { + return hash.inner_join(left, output_row_count); + }); } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalInnerJoinRowCount(JNIEnv *env, jclass, @@ -2438,41 +2427,36 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullJoinGatherMaps( JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_fullJoinRowCount(JNIEnv *env, jclass, jlong j_left_table, - jlong j_right_hash_join, - jboolean compare_nulls_equal) { + jlong j_right_hash_join) { JNI_NULL_CHECK(env, j_left_table, "left table is null", 0); JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0); try { cudf::jni::auto_set_device(env); auto left_table = reinterpret_cast(j_left_table); auto hash_join = reinterpret_cast(j_right_hash_join); - auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - auto row_count = hash_join->full_join_size(*left_table, nulleq); + auto row_count = hash_join->full_join_size(*left_table); return static_cast(row_count); } CATCH_STD(env, 0); } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullHashJoinGatherMaps( - JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, - jboolean compare_nulls_equal) { + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join) { return cudf::jni::hash_join_gather_maps( - env, j_left_table, j_right_hash_join, compare_nulls_equal, - [](cudf::table_view const &left, cudf::hash_join const &hash, cudf::null_equality nulleq) { - return hash.full_join(left, nulleq); + env, j_left_table, j_right_hash_join, + [](cudf::table_view const &left, cudf::hash_join const &hash) { + return hash.full_join(left); }); } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullHashJoinGatherMapsWithCount( - JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jboolean compare_nulls_equal, - jlong j_output_row_count) { + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jlong j_output_row_count) { auto output_row_count = static_cast(j_output_row_count); - return cudf::jni::hash_join_gather_maps(env, j_left_table, j_right_hash_join, compare_nulls_equal, - [output_row_count](cudf::table_view const &left, - cudf::hash_join const &hash, - cudf::null_equality nulleq) { - return hash.full_join(left, nulleq, output_row_count); - }); + return cudf::jni::hash_join_gather_maps( + env, j_left_table, j_right_hash_join, + [output_row_count](cudf::table_view const &left, cudf::hash_join const &hash) { + return hash.full_join(left, output_row_count); + }); } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalFullJoinGatherMaps( From 8f0f6f851e4498ec99baad0345c52bf1e5769621 Mon Sep 17 00:00:00 2001 From: MithunR Date: Thu, 10 Feb 2022 14:10:36 -0800 Subject: [PATCH 10/47] Support `percent_rank()` aggregation (#10227) Fixes #9644. The `percent_rank()` aggregation is a ranking function, similar to `rank()` and `dense_rank()`. It calculates the fractional/relative rank of a row within a group of rows in a column, and returns a double value in the range [0.0, 1.0] for each row in the group/column. This commit includes the `libcudf` changes to support `percent_rank()` aggregations, and the supporting JNI bindings. Note that `percent_rank()` is typically used as a window aggregation. It is implemented as a (grouped) scan aggregation, just as `rank()` and `dense_rank()` are, because it operates across the whole group of rows. (i.e. the window specification is fixed, spanning the entire group.) References: 1. [SQL Server](https://docs.microsoft.com/en-us/sql/t-sql/functions/percent-rank-transact-sql) 2. [PostgreSQL](https://www.postgresql.org/docs/10/functions-window.html) 3.
[SparkSQL](https://sparkbyexamples.com/spark/spark-sql-window-functions/#ranking-functions) Authors: - MithunR (https://github.com/mythrocks) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Bradley Dice (https://github.com/bdice) - Conor Hoekstra (https://github.com/codereport) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/10227 --- cpp/include/cudf/aggregation.hpp | 147 +++-- .../cudf/detail/aggregation/aggregation.hpp | 29 +- cpp/include/cudf/detail/scan.hpp | 14 +- cpp/include/cudf/rolling.hpp | 4 +- cpp/src/aggregation/aggregation.cpp | 23 +- cpp/src/groupby/sort/group_rank_scan.cu | 47 +- cpp/src/groupby/sort/group_scan.hpp | 17 +- cpp/src/groupby/sort/scan.cpp | 19 +- cpp/src/reductions/scan/rank_scan.cu | 30 +- cpp/src/reductions/scan/scan.cpp | 11 +- cpp/tests/groupby/rank_scan_tests.cpp | 505 ++++++++++-------- cpp/tests/reductions/rank_tests.cpp | 395 ++++++++------ .../main/java/ai/rapids/cudf/Aggregation.java | 20 +- .../rapids/cudf/GroupByScanAggregation.java | 9 +- .../java/ai/rapids/cudf/ScanAggregation.java | 9 +- java/src/main/native/src/AggregationJni.cpp | 8 +- .../java/ai/rapids/cudf/ColumnVectorTest.java | 22 + .../test/java/ai/rapids/cudf/TableTest.java | 14 +- 18 files changed, 862 insertions(+), 461 deletions(-) diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 23587f49334..fb5b968671f 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -79,6 +79,7 @@ class aggregation { ROW_NUMBER, ///< get row-number of current index (relative to rolling window) RANK, ///< get rank of current index DENSE_RANK, ///< get dense rank of current index + PERCENT_RANK, ///< get percent (i.e. fractional) rank of current index COLLECT_LIST, ///< collect values into a list COLLECT_SET, ///< collect values into a list without duplicate entries LEAD, ///< window function, accesses row at specified offset following current row @@ -305,34 +306,34 @@ std::unique_ptr make_row_number_aggregation(); * 3. `RANK` aggregations are not compatible with exclusive scans. * * @code{.pseudo} - * Example: Consider an motor-racing statistics dataset, containing the following columns: - * 1. driver_name: (STRING) Name of the car driver - * 2. num_overtakes: (INT32) Number of times the driver overtook another car in a lap - * 3. lap_number: (INT32) The number of the lap + * Example: Consider a motor-racing statistics dataset, containing the following columns: + * 1. venue: (STRING) Location of the race event + * 2. driver: (STRING) Name of the car driver (abbreviated to 3 characters) + * 3. time: (INT32) Time taken to complete the circuit * * For the following presorted data: * - * [ // driver_name, num_overtakes, lap_number - * { "bottas", 2, 3 }, - * { "bottas", 2, 7 }, - * { "bottas", 2, 7 }, - * { "bottas", 1, 1 }, - * { "bottas", 1, 2 }, - * { "hamilton", 4, 1 }, - * { "hamilton", 4, 1 }, - * { "hamilton", 3, 4 }, - * { "hamilton", 2, 4 } + * [ // venue, driver, time + * { "silverstone", "HAM" ("hamilton"), 15823}, + * { "silverstone", "LEC" ("leclerc"), 15827}, + * { "silverstone", "BOT" ("bottas"), 15834}, // <-- Tied for 3rd place. 
+ * { "silverstone", "NOR" ("norris"), 15834}, // <-- Tied for 3rd place. + * { "silverstone", "RIC" ("ricciardo"), 15905}, + * { "monza", "RIC" ("ricciardo"), 12154}, + * { "monza", "NOR" ("norris"), 12156}, // <-- Tied for 2nd place. + * { "monza", "BOT" ("bottas"), 12156}, // <-- Tied for 2nd place. + * { "monza", "LEC" ("leclerc"), 12201}, + * { "monza", "PER" ("perez"), 12203} * ] * * A grouped rank aggregation scan with: - * groupby column : driver_name - * input orderby column: struct_column{num_overtakes, lap_number} - * result: column{1, 2, 2, 4, 5, 1, 1, 3, 4} - * - * A grouped rank aggregation scan with: - * groupby column : driver_name - * input orderby column: num_overtakes - * result: column{1, 1, 1, 4, 4, 1, 1, 3, 4} + * groupby column : venue + * input orderby column: time + * Produces the following rank column: + * { 1, 2, 3, 3, 5, 1, 2, 2, 4, 5} + * (This corresponds to the following grouping and `driver` rows:) + * { "HAM", "LEC", "BOT", "NOR", "RIC", "RIC", "NOR", "BOT", "LEC", "PER" } + * <----------silverstone----------->|<-------------monza--------------> * @endcode */ template @@ -357,39 +358,95 @@ std::unique_ptr make_rank_aggregation(); * 3. `DENSE_RANK` aggregations are not compatible with exclusive scans. * * @code{.pseudo} - * Example: Consider an motor-racing statistics dataset, containing the following columns: - * 1. driver_name: (STRING) Name of the car driver - * 2. num_overtakes: (INT32) Number of times the driver overtook another car in a lap - * 3. lap_number: (INT32) The number of the lap + * Example: Consider a motor-racing statistics dataset, containing the following columns: + * 1. venue: (STRING) Location of the race event + * 2. driver: (STRING) Name of the car driver (abbreviated to 3 characters) + * 3. time: (INT32) Time taken to complete the circuit * * For the following presorted data: * - * [ // driver_name, num_overtakes, lap_number - * { "bottas", 2, 3 }, - * { "bottas", 2, 7 }, - * { "bottas", 2, 7 }, - * { "bottas", 1, 1 }, - * { "bottas", 1, 2 }, - * { "hamilton", 4, 1 }, - * { "hamilton", 4, 1 }, - * { "hamilton", 3, 4 }, - * { "hamilton", 2, 4 } + * [ // venue, driver, time + * { "silverstone", "HAM" ("hamilton"), 15823}, + * { "silverstone", "LEC" ("leclerc"), 15827}, + * { "silverstone", "BOT" ("bottas"), 15834}, // <-- Tied for 3rd place. + * { "silverstone", "NOR" ("norris"), 15834}, // <-- Tied for 3rd place. + * { "silverstone", "RIC" ("ricciardo"), 15905}, + * { "monza", "RIC" ("ricciardo"), 12154}, + * { "monza", "NOR" ("norris"), 12156}, // <-- Tied for 2nd place. + * { "monza", "BOT" ("bottas"), 12156}, // <-- Tied for 2nd place. 
+ * { "monza", "LEC" ("leclerc"), 12201}, + * { "monza", "PER" ("perez"), 12203} * ] * * A grouped dense rank aggregation scan with: - * groupby column : driver_name - * input orderby column: struct_column{num_overtakes, lap_number} - * result: column{1, 2, 2, 3, 4, 1, 1, 2, 3} - * - * A grouped dense rank aggregation scan with: - * groupby column : driver_name - * input orderby column: num_overtakes - * result: column{1, 1, 1, 2, 2, 1, 1, 2, 3} + * groupby column : venue + * input orderby column: time + * Produces the following dense rank column: + * { 1, 2, 3, 3, 4, 1, 2, 2, 3, 4} + * (This corresponds to the following grouping and `driver` rows:) + * { "HAM", "LEC", "BOT", "NOR", "RIC", "RIC", "NOR", "BOT", "LEC", "PER" } + * <----------silverstone----------->|<-------------monza--------------> * @endcode */ template std::unique_ptr make_dense_rank_aggregation(); +/** + * @brief Factory to create a PERCENT_RANK aggregation + * + * `PERCENT_RANK` returns a non-nullable column of double precision "fractional" ranks. + * For row index `i`, the percent rank of row `i` is defined as: + * percent_rank = (rank - 1) / (group_row_count - 1) + * where, + * 1. rank is the `RANK` of the row within the group + * 2. group_row_count is the number of rows in the group + * + * This aggregation only works with "scan" algorithms. The input to the grouped or + * ungrouped scan is an orderby column that orders the rows that the aggregate function ranks. + * If rows are ordered by more than one column, the orderby input column should be a struct + * column containing the ordering columns. + * + * Note: + * 1. This method requires that the rows are presorted by the group keys and order_by columns. + * 2. `PERCENT_RANK` aggregations will return a fully valid column regardless of null_handling + * policy specified in the scan. + * 3. `PERCENT_RANK` aggregations are not compatible with exclusive scans. + * + * @code{.pseudo} + * Example: Consider a motor-racing statistics dataset, containing the following columns: + * 1. venue: (STRING) Location of the race event + * 2. driver: (STRING) Name of the car driver (abbreviated to 3 characters) + * 3. time: (INT32) Time taken to complete the circuit + * + * For the following presorted data: + * + * [ // venue, driver, time + * { "silverstone", "HAM" ("hamilton"), 15823}, + * { "silverstone", "LEC" ("leclerc"), 15827}, + * { "silverstone", "BOT" ("bottas"), 15834}, // <-- Tied for 3rd place. + * { "silverstone", "NOR" ("norris"), 15834}, // <-- Tied for 3rd place. + * { "silverstone", "RIC" ("ricciardo"), 15905}, + * { "monza", "RIC" ("ricciardo"), 12154}, + * { "monza", "NOR" ("norris"), 12156}, // <-- Tied for 2nd place. + * { "monza", "BOT" ("bottas"), 12156}, // <-- Tied for 2nd place. 
+ * { "monza", "LEC" ("leclerc"), 12201}, + * { "monza", "PER" ("perez"), 12203} + * ] + * + * A grouped percent rank aggregation scan with: + * groupby column : venue + * input orderby column: time + * Produces the following percent rank column: + * { 0.00, 0.25, 0.50, 0.50, 1.00, 0.00, 0.25, 0.25, 0.75, 1.00 } + * + * (This corresponds to the following grouping and `driver` rows:) + * { "HAM", "LEC", "BOT", "NOR", "RIC", "RIC", "NOR", "BOT", "LEC", "PER" } + * <----------silverstone----------->|<-------------monza--------------> + * @endcode + */ +template +std::unique_ptr make_percent_rank_aggregation(); + /** * @brief Factory to create a COLLECT_LIST aggregation * diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index fbf315776f4..a234f4d3715 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -77,6 +77,8 @@ class simple_aggregations_collector { // Declares the interface for the simple class rank_aggregation const& agg); virtual std::vector> visit(data_type col_type, class dense_rank_aggregation const& agg); + virtual std::vector> visit( + data_type col_type, class percent_rank_aggregation const& agg); virtual std::vector> visit( data_type col_type, class collect_list_aggregation const& agg); virtual std::vector> visit(data_type col_type, @@ -126,6 +128,7 @@ class aggregation_finalizer { // Declares the interface for the finalizer virtual void visit(class row_number_aggregation const& agg); virtual void visit(class rank_aggregation const& agg); virtual void visit(class dense_rank_aggregation const& agg); + virtual void visit(class percent_rank_aggregation const& agg); virtual void visit(class collect_list_aggregation const& agg); virtual void visit(class collect_set_aggregation const& agg); virtual void visit(class lead_lag_aggregation const& agg); @@ -654,6 +657,22 @@ class dense_rank_aggregation final : public rolling_aggregation, public groupby_ void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; +class percent_rank_aggregation final : public rolling_aggregation, public groupby_scan_aggregation { + public: + percent_rank_aggregation() : aggregation{PERCENT_RANK} {} + + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + /** * @brief Derived aggregation class for specifying COLLECT_LIST aggregation */ @@ -1243,6 +1262,12 @@ struct target_type_impl { using type = size_type; }; +// Always use double for PERCENT_RANK +template +struct target_type_impl { + using type = double; +}; + // Always use list for COLLECT_LIST template struct target_type_impl { @@ -1405,6 +1430,8 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind return f.template operator()(std::forward(args)...); case aggregation::DENSE_RANK: return f.template operator()(std::forward(args)...); + case aggregation::PERCENT_RANK: + return f.template 
operator()(std::forward(args)...); case aggregation::COLLECT_LIST: return f.template operator()(std::forward(args)...); case aggregation::COLLECT_SET: diff --git a/cpp/include/cudf/detail/scan.hpp b/cpp/include/cudf/detail/scan.hpp index 8e3db1c7b10..36dce6caf0b 100644 --- a/cpp/include/cudf/detail/scan.hpp +++ b/cpp/include/cudf/detail/scan.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -102,5 +102,17 @@ std::unique_ptr inclusive_dense_rank_scan(column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Generate row percent ranks for a column. + * + * @param order_by Input column to generate ranks for. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return rank values. + */ +std::unique_ptr inclusive_percent_rank_scan(column_view const& order_by, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/rolling.hpp b/cpp/include/cudf/rolling.hpp index 3f35b796e58..ba34e20398e 100644 --- a/cpp/include/cudf/rolling.hpp +++ b/cpp/include/cudf/rolling.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -451,7 +451,7 @@ std::unique_ptr grouped_time_range_rolling_window( * should be the exact same type (`INT32`). * * @code{.pseudo} - * Example: Consider an motor-racing statistics dataset, containing the following columns: + * Example: Consider a motor-racing statistics dataset, containing the following columns: * 1. driver_name: (STRING) Name of the car driver * 2. num_overtakes: (INT32) Number of times the driver overtook another car in a lap * 3. lap_number: (INT32) The number of the lap diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index c00b3d6db85..8e2a167f7b2 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -160,6 +160,12 @@ std::vector> simple_aggregations_collector::visit( return visit(col_type, static_cast(agg)); } +std::vector> simple_aggregations_collector::visit( + data_type col_type, percent_rank_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + std::vector> simple_aggregations_collector::visit( data_type col_type, collect_list_aggregation const& agg) { @@ -333,6 +339,11 @@ void aggregation_finalizer::visit(dense_rank_aggregation const& agg) visit(static_cast(agg)); } +void aggregation_finalizer::visit(percent_rank_aggregation const& agg) +{ + visit(static_cast(agg)); +} + void aggregation_finalizer::visit(collect_list_aggregation const& agg) { visit(static_cast(agg)); @@ -616,6 +627,16 @@ template std::unique_ptr make_dense_rank_aggregation() template std::unique_ptr make_dense_rank_aggregation(); +/// Factory to create a PERCENT_RANK aggregation +template +std::unique_ptr make_percent_rank_aggregation() +{ + return std::make_unique(); +} +template std::unique_ptr make_percent_rank_aggregation(); +template std::unique_ptr +make_percent_rank_aggregation(); + /// Factory to create a COLLECT_LIST aggregation template std::unique_ptr make_collect_list_aggregation(null_policy null_handling) diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu index 62aa3df8e5c..eae7d0b6129 100644 --- a/cpp/src/groupby/sort/group_rank_scan.cu +++ b/cpp/src/groupby/sort/group_rank_scan.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,8 @@ #include #include +#include +#include #include #include #include @@ -98,7 +100,9 @@ std::unique_ptr rank_scan(column_view const& order_by, order_by, group_labels, group_offsets, - [] __device__(bool equality, auto row_index) { return equality ? row_index + 1 : 0; }, + [] __device__(bool unequal, auto row_index_in_group) { + return unequal ? row_index_in_group + 1 : 0; + }, DeviceMax{}, has_nested_nulls(table_view{{order_by}}), stream, @@ -115,13 +119,50 @@ std::unique_ptr dense_rank_scan(column_view const& order_by, order_by, group_labels, group_offsets, - [] __device__(bool equality, auto row_index) { return equality; }, + [] __device__(bool const unequal, size_type const) { return unequal ? 1 : 0; }, DeviceSum{}, has_nested_nulls(table_view{{order_by}}), stream, mr); } +std::unique_ptr percent_rank_scan(column_view const& order_by, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const rank_column = rank_scan( + order_by, group_labels, group_offsets, stream, rmm::mr::get_current_device_resource()); + auto const rank_view = rank_column->view(); + auto const group_size_iter = cudf::detail::make_counting_transform_iterator( + 0, + [labels = group_labels.begin(), + offsets = group_offsets.begin()] __device__(size_type row_index) { + auto const group_label = labels[row_index]; + auto const group_start = offsets[group_label]; + auto const group_end = offsets[group_label + 1]; + return group_end - group_start; + }); + + // Result type for PERCENT_RANK is independent of input type. 
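+ // Worked example of the transform below, hand-computed from the example in aggregation.hpp: the "monza" group has ranks {1, 2, 2, 4, 5} and group_size = 5, so (rank - 1.0) / (group_size - 1) yields {0.00, 0.25, 0.25, 0.75, 1.00}. + // A group of size 1 maps to 0.0 outright, avoiding a division by zero.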
+ using result_type = cudf::detail::target_type_t; + + auto percent_rank_result = cudf::make_fixed_width_column( + data_type{type_to_id()}, rank_view.size(), mask_state::UNALLOCATED, stream, mr); + + thrust::transform(rmm::exec_policy(stream), + rank_view.begin(), + rank_view.end(), + group_size_iter, + percent_rank_result->mutable_view().begin(), + [] __device__(auto const rank, auto const group_size) { + return group_size == 1 ? 0.0 : ((rank - 1.0) / (group_size - 1)); + }); + + return percent_rank_result; +} + } // namespace detail } // namespace groupby } // namespace cudf diff --git a/cpp/src/groupby/sort/group_scan.hpp b/cpp/src/groupby/sort/group_scan.hpp index 82ef0e25380..76a7f3f73c7 100644 --- a/cpp/src/groupby/sort/group_scan.hpp +++ b/cpp/src/groupby/sort/group_scan.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -116,6 +116,21 @@ std::unique_ptr dense_rank_scan(column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Internal API to calculate groupwise percent rank value + * + * @param order_by column or struct column by which the rows within a group are sorted + * @param group_labels ID of group to which the row belongs + * @param group_offsets group index offsets with group ID indices + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Column of type `double` of percent rank values + */ +std::unique_ptr percent_rank_scan(column_view const& order_by, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace groupby } // namespace cudf diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp index 402ff8c47ed..8c4959da35b 100644 --- a/cpp/src/groupby/sort/scan.cpp +++ b/cpp/src/groupby/sort/scan.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -144,6 +144,23 @@ void scan_result_functor::operator()(aggregation const& detail::dense_rank_scan( order_by, helper.group_labels(stream), helper.group_offsets(stream), stream, mr)); } + +template <> +void scan_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(values, agg)) return; + CUDF_EXPECTS(helper.is_presorted(), + "Percent rank aggregate in groupby scan requires the keys to be presorted"); + auto const order_by = get_grouped_values(); + CUDF_EXPECTS(!cudf::structs::detail::is_or_has_nested_lists(order_by), + "Unsupported list type in grouped percent_rank scan."); + + cache.add_result( + values, + agg, + detail::percent_rank_scan( + order_by, helper.group_labels(stream), helper.group_offsets(stream), stream, mr)); +} } // namespace detail // Sort-based groupby diff --git a/cpp/src/reductions/scan/rank_scan.cu b/cpp/src/reductions/scan/rank_scan.cu index 9ac4db3a34b..464a8688a2d 100644 --- a/cpp/src/reductions/scan/rank_scan.cu +++ b/cpp/src/reductions/scan/rank_scan.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -90,7 +91,7 @@ std::unique_ptr inclusive_dense_rank_scan(column_view const& order_by, return rank_generator( order_by, has_nested_nulls(table_view{{order_by}}), - [] __device__(bool equality, auto row_index) { return equality; }, + [] __device__(bool const unequal, size_type const) { return unequal ? 1 : 0; }, DeviceSum{}, stream, mr); @@ -105,11 +106,34 @@ std::unique_ptr inclusive_rank_scan(column_view const& order_by, return rank_generator( order_by, has_nested_nulls(table_view{{order_by}}), - [] __device__(bool equality, auto row_index) { return equality ? row_index + 1 : 0; }, + [] __device__(bool unequal, auto row_index) { return unequal ? row_index + 1 : 0; }, DeviceMax{}, stream, mr); } +std::unique_ptr inclusive_percent_rank_scan(column_view const& order_by, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const rank_column = + inclusive_rank_scan(order_by, stream, rmm::mr::get_current_device_resource()); + auto const rank_view = rank_column->view(); + + // Result type for PERCENT_RANK is independent of input type. + using result_type = cudf::detail::target_type_t; + auto percent_rank_result = cudf::make_fixed_width_column( + data_type{type_to_id()}, rank_view.size(), mask_state::UNALLOCATED, stream, mr); + + thrust::transform(rmm::exec_policy(stream), + rank_view.begin(), + rank_view.end(), + percent_rank_result->mutable_view().begin(), + [n_rows = rank_view.size()] __device__(auto const rank) { + return n_rows == 1 ? 0.0 : ((rank - 1.0) / (n_rows - 1)); + }); + return percent_rank_result; +} + } // namespace detail } // namespace cudf diff --git a/cpp/src/reductions/scan/scan.cpp b/cpp/src/reductions/scan/scan.cpp index 5f10707d7fb..d73fc862380 100644 --- a/cpp/src/reductions/scan/scan.cpp +++ b/cpp/src/reductions/scan/scan.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,14 +34,19 @@ std::unique_ptr scan(column_view const& input, if (agg->kind == aggregation::RANK) { CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE, - "Unsupported rank aggregation operator for exclusive scan"); + "Rank aggregation operator requires an inclusive scan"); return inclusive_rank_scan(input, rmm::cuda_stream_default, mr); } if (agg->kind == aggregation::DENSE_RANK) { CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE, - "Unsupported dense rank aggregation operator for exclusive scan"); + "Dense rank aggregation operator requires an inclusive scan"); return inclusive_dense_rank_scan(input, rmm::cuda_stream_default, mr); } + if (agg->kind == aggregation::PERCENT_RANK) { + CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE, + "Percent rank aggregation operator requires an inclusive scan"); + return inclusive_percent_rank_scan(input, rmm::cuda_stream_default, mr); + } return inclusive == scan_type::EXCLUSIVE ? detail::scan_exclusive(input, agg, null_handling, rmm::cuda_stream_default, mr) diff --git a/cpp/tests/groupby/rank_scan_tests.cpp b/cpp/tests/groupby/rank_scan_tests.cpp index 1b1b12ea69e..81369beb2ec 100644 --- a/cpp/tests/groupby/rank_scan_tests.cpp +++ b/cpp/tests/groupby/rank_scan_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,17 +23,27 @@ #include -using namespace cudf::test::iterators; - namespace cudf { namespace test { -inline void test_pair_rank_scans(column_view const& keys, - column_view const& order, - column_view const& expected_dense, - column_view const& expected_rank, - null_policy include_null_keys = null_policy::INCLUDE, - sorted keys_are_sorted = sorted::YES) +using namespace iterators; + +template +using input = fixed_width_column_wrapper; +using rank_result_col = fixed_width_column_wrapper; +using percent_result_t = + cudf::detail::target_type_t; +using percent_result_col = fixed_width_column_wrapper; +using null_iter_t = decltype(nulls_at({})); + +auto constexpr X = int32_t{0}; // Placeholder for NULL rows. +auto const all_valid = nulls_at({}); + +inline void test_rank_scans(column_view const& keys, + column_view const& order, + column_view const& expected_dense, + column_view const& expected_rank, + column_view const& expected_percent_rank) { test_single_scan(keys, order, @@ -49,6 +59,13 @@ inline void test_pair_rank_scans(column_view const& keys, make_rank_aggregation(), null_policy::INCLUDE, sorted::YES); + test_single_scan(keys, + order, + keys, + expected_percent_rank, + make_percent_rank_aggregation(), + null_policy::INCLUDE, + sorted::YES); } struct groupby_rank_scan_test : public BaseFixture { @@ -70,248 +87,283 @@ TYPED_TEST(typed_groupby_rank_scan_test, empty_cols) { using T = TypeParam; - fixed_width_column_wrapper keys{}; - fixed_width_column_wrapper order_col{}; - structs_column_wrapper struct_order{}; + auto const keys = input{}; + auto const order_by = input{}; + auto const order_by_struct = structs_column_wrapper{}; - fixed_width_column_wrapper expected_dense_vals{}; - fixed_width_column_wrapper expected_rank_vals{}; + auto const expected_dense = rank_result_col{}; + auto const expected_rank = rank_result_col{}; + auto const expected_percent = percent_result_col{}; - test_pair_rank_scans(keys, order_col, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); + test_rank_scans(keys, order_by, expected_dense, expected_rank, expected_percent); + test_rank_scans(keys, order_by_struct, expected_dense, expected_rank, expected_percent); } TYPED_TEST(typed_groupby_rank_scan_test, zero_valid_keys) { using T = TypeParam; - fixed_width_column_wrapper keys{{1, 2, 3}, all_nulls()}; - fixed_width_column_wrapper order_col1{3, 3, 1}; - fixed_width_column_wrapper order_col2{3, 3, 1}; - fixed_width_column_wrapper order_col3{3, 3, 1}; - structs_column_wrapper struct_order{order_col2, order_col3}; + auto const keys = input{{X, X, X}, all_nulls()}; + auto const order_by = input{{3, 3, 1}}; + auto const order_by_struct = [] { + auto member_1 = input{{3, 3, 1}}; + auto member_2 = input{{3, 3, 1}}; + return structs_column_wrapper{member_1, member_2}; + }(); - fixed_width_column_wrapper expected_dense_vals{1, 1, 2}; - fixed_width_column_wrapper expected_rank_vals{1, 1, 3}; + auto const dense_rank_results = rank_result_col{1, 1, 2}; + auto const rank_results = rank_result_col{1, 1, 3}; + auto const percent_rank_result = percent_result_col{0, 0, 1}; - test_pair_rank_scans(keys, order_col1, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); + test_rank_scans(keys, order_by, 
dense_rank_results, rank_results, percent_rank_result); + test_rank_scans(keys, order_by_struct, dense_rank_results, rank_results, percent_rank_result); } TYPED_TEST(typed_groupby_rank_scan_test, zero_valid_orders) { - using T = TypeParam; - - fixed_width_column_wrapper keys{1, 1, 3, 3}; - fixed_width_column_wrapper order_col1{{5, 6, 7, 8}, all_nulls()}; - fixed_width_column_wrapper order_col2{{5, 6, 7, 8}, all_nulls()}; - fixed_width_column_wrapper order_col3{{5, 6, 7, 8}, all_nulls()}; - fixed_width_column_wrapper order_col4{{5, 6, 7, 8}, all_nulls()}; - fixed_width_column_wrapper order_col5{{5, 6, 7, 8}, all_nulls()}; - structs_column_wrapper struct_order{order_col2, order_col3}; - structs_column_wrapper struct_order_with_nulls{{order_col4, order_col5}, all_nulls()}; - - fixed_width_column_wrapper expected_dense_vals{1, 1, 1, 1}; - fixed_width_column_wrapper expected_rank_vals{1, 1, 1, 1}; - - test_pair_rank_scans(keys, order_col1, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order_with_nulls, expected_dense_vals, expected_rank_vals); + using T = TypeParam; + using null_iter_t = decltype(all_nulls()); + + auto const keys = input{{1, 1, 3, 3}}; + auto const make_order_by = [&] { return input{{X, X, X, X}, all_nulls()}; }; + auto const make_struct_order_by = [&](null_iter_t const& null_iter = no_nulls()) { + auto member1 = make_order_by(); + auto member2 = make_order_by(); + return structs_column_wrapper{{member1, member2}, null_iter}; + }; + auto const order_by = make_order_by(); + auto const order_by_struct = make_struct_order_by(); + auto const order_by_struct_all_nulls = make_struct_order_by(all_nulls()); + + auto const expected_dense = rank_result_col{1, 1, 1, 1}; + auto const expected_rank = rank_result_col{1, 1, 1, 1}; + auto const expected_percent = percent_result_col{0, 0, 0, 0}; + + test_rank_scans(keys, order_by, expected_dense, expected_rank, expected_percent); + test_rank_scans(keys, order_by_struct, expected_dense, expected_rank, expected_percent); + test_rank_scans(keys, order_by_struct_all_nulls, expected_dense, expected_rank, expected_percent); } TYPED_TEST(typed_groupby_rank_scan_test, basic) { using T = TypeParam; - fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; - fixed_width_column_wrapper order_col1{5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1}; - fixed_width_column_wrapper order_col2{5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1}; - fixed_width_column_wrapper order_col3{5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1}; - structs_column_wrapper struct_order{order_col2, order_col3}; - - fixed_width_column_wrapper expected_dense_vals = { - {1, 1, 1, 2, 2, 2, 3, 1, 2, 2, 3, 3}}; - fixed_width_column_wrapper expected_rank_vals = - fixed_width_column_wrapper{{1, 1, 1, 4, 4, 4, 7, 1, 2, 2, 4, 4}}; - - test_pair_rank_scans(keys, order_col1, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); + auto const keys = input{0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; + auto const make_order_by = [&] { return input{5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1}; }; + auto const order_by = make_order_by(); + auto const order_by_struct = [&] { + auto order2 = make_order_by(); + auto order3 = make_order_by(); + return structs_column_wrapper{order2, order3}; + }(); + + auto const expected_dense = rank_result_col{1, 1, 1, 2, 2, 2, 3, 1, 2, 2, 3, 3}; + auto const expected_rank = rank_result_col{1, 1, 1, 4, 4, 4, 7, 1, 2, 2, 4, 
4}; + auto const expected_percent = percent_result_col{ + 0.0, 0.0, 0.0, 3.0 / 6, 3.0 / 6, 3.0 / 6, 6.0 / 6, 0.0, 1.0 / 4, 1.0 / 4, 3.0 / 4, 3.0 / 4}; + + test_rank_scans(keys, order_by, expected_dense, expected_rank, expected_percent); + test_rank_scans(keys, order_by_struct, expected_dense, expected_rank, expected_percent); } TYPED_TEST(typed_groupby_rank_scan_test, null_orders) { using T = TypeParam; - fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1}; - fixed_width_column_wrapper order_col1{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -5}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col2{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -5}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col3{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -5}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col4{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -5}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col5{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -5}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - structs_column_wrapper struct_order{order_col2, order_col3}; - structs_column_wrapper struct_order_with_nulls{{order_col4, order_col5}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - - fixed_width_column_wrapper expected_dense_vals{{1, 2, 3, 4, 5, 5, 1, 1, 2, 3, 3, 3}}; - fixed_width_column_wrapper expected_rank_vals{{1, 2, 3, 4, 5, 5, 1, 1, 3, 4, 4, 4}}; - - test_pair_rank_scans(keys, order_col1, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); + auto const null_mask = nulls_at({2, 8}); + auto const keys = input{{0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1}}; + auto const make_order_by = [&] { + return input{{-1, -2, X, -2, -3, -3, -4, -4, X, -5, -5, -5}, null_mask}; + }; + auto const make_struct_order_by = [&](null_iter_t const& null_iter = all_valid) { + auto member1 = make_order_by(); + auto member2 = make_order_by(); + return structs_column_wrapper{{member1, member2}, null_iter}; + }; + auto const order_by = make_order_by(); + auto const order_by_struct = make_struct_order_by(); + auto const order_by_struct_with_nulls = make_struct_order_by(null_mask); + + auto const expected_dense = rank_result_col{1, 2, 3, 4, 5, 5, 1, 1, 2, 3, 3, 3}; + auto const expected_rank = rank_result_col{1, 2, 3, 4, 5, 5, 1, 1, 3, 4, 4, 4}; + auto const expected_percent = percent_result_col{ + 0.0, 1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 4.0 / 5, 0.0, 0.0, 2.0 / 5, 3.0 / 5, 3.0 / 5, 3.0 / 5}; + + test_rank_scans(keys, order_by, expected_dense, expected_rank, expected_percent); + test_rank_scans(keys, order_by_struct, expected_dense, expected_rank, expected_percent); + test_rank_scans( + keys, order_by_struct_with_nulls, expected_dense, expected_rank, expected_percent); } TYPED_TEST(typed_groupby_rank_scan_test, null_orders_and_keys) { using T = TypeParam; - fixed_width_column_wrapper keys = {{0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}; - fixed_width_column_wrapper order_col1{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col2{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col3{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col4{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, 
-5, -6}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col5{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - structs_column_wrapper struct_order{order_col2, order_col3}; - structs_column_wrapper struct_order_with_nulls{{order_col4, order_col5}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - - fixed_width_column_wrapper expected_dense_vals{{1, 2, 3, 4, 5, 5, 1, 1, 2, 1, 1, 2}}; - fixed_width_column_wrapper expected_rank_vals{{1, 2, 3, 4, 5, 5, 1, 1, 3, 1, 1, 3}}; - - test_pair_rank_scans(keys, order_col1, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order_with_nulls, expected_dense_vals, expected_rank_vals); + auto const null_mask = nulls_at({2, 8}); + auto const keys = input{{0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1}, nulls_at({9, 10, 11})}; + auto const make_order_by = [&] { + return input{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6}, null_mask}; + }; + auto const make_struct_order_by = [&](null_iter_t const& null_iter = all_valid) { + auto member1 = make_order_by(); + auto member2 = make_order_by(); + return structs_column_wrapper{{member1, member2}, null_iter}; + }; + auto const order_by = make_order_by(); + auto const order_by_struct = make_struct_order_by(); + auto const order_by_struct_with_nulls = make_struct_order_by(null_mask); + + auto const expected_dense = rank_result_col{{1, 2, 3, 4, 5, 5, 1, 1, 2, 1, 1, 2}}; + auto const expected_rank = rank_result_col{{1, 2, 3, 4, 5, 5, 1, 1, 3, 1, 1, 3}}; + auto const expected_percent = percent_result_col{ + {0.0, 1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 4.0 / 5, 0.0, 0.0, 2.0 / 2, 0.0, 0.0, 2.0 / 2}}; + + test_rank_scans(keys, order_by, expected_dense, expected_rank, expected_percent); + test_rank_scans(keys, order_by_struct, expected_dense, expected_rank, expected_percent); + test_rank_scans( + keys, order_by_struct_with_nulls, expected_dense, expected_rank, expected_percent); } TYPED_TEST(typed_groupby_rank_scan_test, mixedStructs) { - auto col = fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; - auto strings = strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - auto struct_col = structs_column_wrapper{{col, strings}, null_at(11)}.release(); - - strings_column_wrapper keys = {{"0", "0", "0", "0", "0", "0", "1", "1", "1", "1", "0", "1"}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}; - auto expected_dense_vals = - fixed_width_column_wrapper{1, 1, 2, 2, 3, 4, 1, 1, 2, 1, 1, 2}; - auto expected_rank_vals = - fixed_width_column_wrapper{1, 1, 3, 3, 5, 6, 1, 1, 3, 1, 1, 3}; + auto const struct_col = [] { + auto nums = input{{0, 0, 7, 7, 7, X, 4, 4, 4, 9, 9, 9}, null_at(5)}; + auto strings = strings_column_wrapper{ + {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "XX", "9", "9", "10d"}, null_at(8)}; + return structs_column_wrapper{{nums, strings}, null_at(11)}.release(); + }(); + + auto const keys = strings_column_wrapper{ + {"0", "0", "0", "0", "0", "0", "1", "1", "1", "X", "X", "X"}, nulls_at({9, 10, 11})}; + + auto const expected_dense = rank_result_col{1, 1, 2, 2, 3, 4, 1, 1, 2, 1, 1, 2}; + auto const expected_rank = rank_result_col{1, 1, 3, 3, 5, 6, 1, 1, 3, 1, 1, 3}; + auto const expected_percent = percent_result_col{ + 0.0, 0.0, 2.0 / 5, 2.0 / 5, 4.0 / 5, 5.0 / 5, 0.0, 0.0, 2.0 / 2, 0.0, 0.0, 2.0 / 2}; std::vector requests; requests.emplace_back(groupby::scan_request()); 
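// Each scan_request pairs one values column with a list of scan aggregations; pushing PERCENT_RANK below alongside DENSE_RANK and RANK returns all three results from a single gb_obj.scan(...) call over the same presorted keys.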
requests[0].values = *struct_col; requests[0].aggregations.push_back(make_dense_rank_aggregation()); requests[0].aggregations.push_back(make_rank_aggregation()); + requests[0].aggregations.push_back(make_percent_rank_aggregation()); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES); - auto result = gb_obj.scan(requests); + auto [result_keys, agg_results] = gb_obj.scan(requests); - CUDF_TEST_EXPECT_TABLES_EQUAL(table_view({keys}), result.first->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[0].results[0], expected_dense_vals); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[0].results[1], expected_rank_vals); + CUDF_TEST_EXPECT_TABLES_EQUAL(table_view({keys}), result_keys->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[0], expected_dense); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[1], expected_rank); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[2], expected_percent); } TYPED_TEST(typed_groupby_rank_scan_test, nestedStructs) { using T = TypeParam; - auto col1 = fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; - auto col2 = fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; - auto col3 = fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; - auto col4 = fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; - auto strings1 = strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - auto strings2 = strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - auto struct_col = structs_column_wrapper{col1, strings1}; - auto nested_col = structs_column_wrapper{struct_col, col2}.release(); - auto flattened_col = structs_column_wrapper{col3, strings2, col4}.release(); - - strings_column_wrapper keys = {{"0", "0", "0", "0", "0", "0", "1", "1", "1", "1", "0", "1"}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}; + auto nested_structs = [] { + auto structs_member = [] { + auto nums_member = input{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; + auto strings_member = strings_column_wrapper{ + {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; + return structs_column_wrapper{nums_member, strings_member}; + }(); + auto nums_member = input{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; + return structs_column_wrapper{structs_member, nums_member}.release(); + }(); + + auto flat_struct = [] { + auto nums_member = input{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; + auto strings_member = strings_column_wrapper{ + {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; + auto nuther_nums = + fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; + return structs_column_wrapper{nums_member, strings_member, nuther_nums}.release(); + }(); + + auto const keys = strings_column_wrapper{ + {"0", "0", "0", "0", "0", "0", "1", "1", "1", "1", "0", "1"}, nulls_at({9, 10, 11})}; std::vector requests; requests.emplace_back(groupby::scan_request()); requests.emplace_back(groupby::scan_request()); - requests[0].values = *nested_col; + requests[0].values = *nested_structs; requests[0].aggregations.push_back(make_dense_rank_aggregation()); requests[0].aggregations.push_back(make_rank_aggregation()); - requests[1].values = *flattened_col; + requests[0].aggregations.push_back(make_percent_rank_aggregation()); + 
requests[1].values = *flat_struct; requests[1].aggregations.push_back(make_dense_rank_aggregation()); requests[1].aggregations.push_back(make_rank_aggregation()); + requests[1].aggregations.push_back(make_percent_rank_aggregation()); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES); - auto result = gb_obj.scan(requests); + auto [result_keys, agg_results] = gb_obj.scan(requests); - CUDF_TEST_EXPECT_TABLES_EQUAL(table_view({keys}), result.first->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[0].results[0], *result.second[1].results[0]); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[0].results[1], *result.second[1].results[1]); + CUDF_TEST_EXPECT_TABLES_EQUAL(table_view({keys}), result_keys->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[0], *agg_results[1].results[0]); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[1], *agg_results[1].results[1]); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[2], *agg_results[1].results[2]); } TYPED_TEST(typed_groupby_rank_scan_test, structsWithNullPushdown) { using T = TypeParam; - auto col1 = fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; - auto col2 = fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; - auto strings1 = strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - auto strings2 = strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - - std::vector> struct_columns; - struct_columns.push_back(col1.release()); - struct_columns.push_back(strings1.release()); - auto struct_col = - cudf::make_structs_column(12, std::move(struct_columns), 0, rmm::device_buffer{}); - auto const struct_nulls = - thrust::host_vector(std::vector{1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}); - struct_col->set_null_mask( - cudf::test::detail::make_null_mask(struct_nulls.begin(), struct_nulls.end())); - - std::vector> null_struct_columns; - null_struct_columns.push_back(col2.release()); - null_struct_columns.push_back(strings2.release()); - auto null_col = - cudf::make_structs_column(12, std::move(null_struct_columns), 0, rmm::device_buffer{}); - null_col->set_null_mask(create_null_mask(12, cudf::mask_state::ALL_NULL)); - - strings_column_wrapper keys = {{"0", "0", "0", "0", "0", "0", "1", "1", "1", "1", "0", "1"}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}; + auto constexpr num_rows = 12; + + auto get_struct_column = [] { + auto nums_member = + fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; + auto strings_member = strings_column_wrapper{ + {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; + auto struct_column = structs_column_wrapper{nums_member, strings_member}.release(); + // Reset null-mask, a posteriori. Nulls will not be pushed down to children. 
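+ // (By contrast, passing a validity iterator to structs_column_wrapper at construction superimposes the parent nulls onto the child columns; set_null_mask() on the released column replaces only the parent's mask, which is the un-pushed-down case this test exercises.)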
+ auto const null_iter = nulls_at({1, 2, 11}); + struct_column->set_null_mask( + cudf::test::detail::make_null_mask(null_iter, null_iter + num_rows)); + return struct_column; + }; + + auto const possibly_null_structs = get_struct_column(); + + auto const definitely_null_structs = [&] { + auto struct_column = get_struct_column(); + struct_column->set_null_mask(create_null_mask(num_rows, mask_state::ALL_NULL)); + return struct_column; + }(); + + strings_column_wrapper keys = {{"0", "0", "0", "0", "0", "0", "1", "1", "1", "X", "X", "X"}, + nulls_at({9, 10, 11})}; std::vector requests; requests.emplace_back(groupby::scan_request()); requests.emplace_back(groupby::scan_request()); - requests[0].values = *struct_col; + requests[0].values = *possibly_null_structs; requests[0].aggregations.push_back(make_dense_rank_aggregation()); requests[0].aggregations.push_back(make_rank_aggregation()); - requests[1].values = *null_col; + requests[0].aggregations.push_back(make_percent_rank_aggregation()); + requests[1].values = *definitely_null_structs; requests[1].aggregations.push_back(make_dense_rank_aggregation()); requests[1].aggregations.push_back(make_rank_aggregation()); + requests[1].aggregations.push_back(make_percent_rank_aggregation()); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES); - auto result = gb_obj.scan(requests); - - auto expected_dense_vals = - fixed_width_column_wrapper{1, 2, 2, 3, 4, 5, 1, 1, 2, 1, 1, 2}; - auto expected_rank_vals = - fixed_width_column_wrapper{1, 2, 2, 4, 5, 6, 1, 1, 3, 1, 1, 3}; - auto expected_null_result = - fixed_width_column_wrapper{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[0].results[0], expected_dense_vals); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[0].results[1], expected_rank_vals); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[1].results[0], expected_null_result); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[1].results[1], expected_null_result); + auto [result_keys, agg_results] = gb_obj.scan(requests); + + auto expected_dense = rank_result_col{1, 2, 2, 3, 4, 5, 1, 1, 2, 1, 1, 2}; + auto expected_rank = rank_result_col{1, 2, 2, 4, 5, 6, 1, 1, 3, 1, 1, 3}; + auto expected_percent = percent_result_col{ + 0.0, 1.0 / 5, 1.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5, 0.0, 0.0, 2.0 / 2, 0.0, 0.0, 2.0 / 2}; + auto expected_rank_for_null = rank_result_col{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + auto expected_percent_for_null = + percent_result_col{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[0], expected_dense); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[1], expected_rank); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[2], expected_percent); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[1].results[0], expected_rank_for_null); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[1].results[1], expected_rank_for_null); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[1].results[2], expected_percent_for_null); } /* List support dependent on https://github.com/rapidsai/cudf/issues/8683 @@ -372,67 +424,72 @@ TYPED_TEST(list_groupby_rank_scan_test, lists) TEST(groupby_rank_scan_test, bools) { - fixed_width_column_wrapper keys = {{0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}; - fixed_width_column_wrapper order_col1{{0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1}, - {1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper 
order_col2{{0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1}, - {1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col3{{0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1}, - {1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col4{{0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1}, - {1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col5{{0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1}, - {1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1}}; - structs_column_wrapper struct_order{order_col2, order_col3}; - structs_column_wrapper struct_order_with_nulls{{order_col4, order_col5}, - {1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1}}; - - fixed_width_column_wrapper expected_dense_vals{{1, 1, 2, 2, 2, 2, 1, 2, 3, 1, 1, 2}}; - fixed_width_column_wrapper expected_rank_vals{{1, 1, 3, 3, 3, 3, 1, 2, 3, 1, 1, 3}}; - - test_pair_rank_scans(keys, order_col1, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order_with_nulls, expected_dense_vals, expected_rank_vals); + using bools = fixed_width_column_wrapper; + using null_iter_t = decltype(nulls_at({})); + + auto const keys = bools{{0, 0, 0, 0, 0, 0, 1, 1, 1, X, X, X}, nulls_at({9, 10, 11})}; + auto const nulls_6_8 = nulls_at({6, 8}); + auto const make_order_by = [&] { return bools{{0, 0, 1, 1, 1, 1, X, 1, X, 0, 0, 1}, nulls_6_8}; }; + auto const make_structs = [&](null_iter_t const& null_iter = all_valid) { + auto member_1 = make_order_by(); + auto member_2 = make_order_by(); + return structs_column_wrapper{{member_1, member_2}, null_iter}; + }; + + auto const order_by = make_order_by(); + auto const order_by_structs = make_structs(); + auto const order_by_structs_with_nulls = make_structs(nulls_6_8); + + auto const expected_dense = rank_result_col{{1, 1, 2, 2, 2, 2, 1, 2, 3, 1, 1, 2}}; + auto const expected_rank = rank_result_col{{1, 1, 3, 3, 3, 3, 1, 2, 3, 1, 1, 3}}; + auto const expected_percent = percent_result_col{ + {0.0, 0.0, 2.0 / 5, 2.0 / 5, 2.0 / 5, 2.0 / 5, 0.0, 1.0 / 2, 2.0 / 2, 0.0, 0.0, 2.0 / 2}}; + + test_rank_scans(keys, order_by, expected_dense, expected_rank, expected_percent); + test_rank_scans(keys, order_by_structs, expected_dense, expected_rank, expected_percent); + test_rank_scans( + keys, order_by_structs_with_nulls, expected_dense, expected_rank, expected_percent); } TEST(groupby_rank_scan_test, strings) { - strings_column_wrapper keys = {{"0", "0", "0", "0", "0", "0", "1", "1", "1", "1", "0", "1"}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}; - strings_column_wrapper order_col1{ - {"-1", "-2", "-2", "-2", "-3", "-3", "-4", "-4", "-4", "-5", "-5", "-6"}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - strings_column_wrapper order_col2{ - {"-1", "-2", "-2", "-2", "-3", "-3", "-4", "-4", "-4", "-5", "-5", "-6"}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - strings_column_wrapper order_col3{ - {"-1", "-2", "-2", "-2", "-3", "-3", "-4", "-4", "-4", "-5", "-5", "-6"}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - strings_column_wrapper order_col4{ - {"-1", "-2", "-2", "-2", "-3", "-3", "-4", "-4", "-4", "-5", "-5", "-6"}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - strings_column_wrapper order_col5{ - {"-1", "-2", "-2", "-2", "-3", "-3", "-4", "-4", "-4", "-5", "-5", "-6"}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - structs_column_wrapper struct_order{order_col2, order_col3}; - structs_column_wrapper struct_order_with_nulls{{order_col4, order_col5}, - {1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0}}; - - fixed_width_column_wrapper expected_dense_vals{{1, 2, 
3, 4, 5, 5, 1, 1, 2, 1, 1, 2}}; - fixed_width_column_wrapper expected_rank_vals{{1, 2, 3, 4, 5, 5, 1, 1, 3, 1, 1, 3}}; - - test_pair_rank_scans(keys, order_col1, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order_with_nulls, expected_dense_vals, expected_rank_vals); + using strings = strings_column_wrapper; + using null_iter_t = decltype(nulls_at({})); + + auto const keys = + strings{{"0", "0", "0", "0", "0", "0", "1", "1", "1", "X", "X", "X"}, nulls_at({9, 10, 11})}; + auto const nulls_2_8 = nulls_at({2, 8}); + auto const make_order_by = [&] { + return strings{{"-1", "-2", "X", "-2", "-3", "-3", "-4", "-4", "X", "-5", "-5", "-6"}, + nulls_2_8}; + }; + auto const make_structs = [&](null_iter_t const& null_iter = all_valid) { + auto member_1 = make_order_by(); + auto member_2 = make_order_by(); + return structs_column_wrapper{{member_1, member_2}, null_iter}; + }; + + auto const order_by = make_order_by(); + auto const order_by_structs = make_structs(); + auto const order_by_structs_with_nulls = make_structs(nulls_at({4, 5, 11})); + + auto const expected_dense = rank_result_col{{1, 2, 3, 4, 5, 5, 1, 1, 2, 1, 1, 2}}; + auto const expected_rank = rank_result_col{{1, 2, 3, 4, 5, 5, 1, 1, 3, 1, 1, 3}}; + auto const expected_percent = percent_result_col{ + {0.0, 1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 4.0 / 5, 0.0, 0.0, 2.0 / 2, 0.0, 0.0, 2.0 / 2}}; + + test_rank_scans(keys, order_by, expected_dense, expected_rank, expected_percent); + test_rank_scans(keys, order_by_structs, expected_dense, expected_rank, expected_percent); + test_rank_scans( + keys, order_by_structs_with_nulls, expected_dense, expected_rank, expected_percent); } TEST_F(groupby_rank_scan_test_failures, test_exception_triggers) { using T = uint32_t; - fixed_width_column_wrapper keys{{1, 2, 3}, {1, 1, 0}}; - fixed_width_column_wrapper col{3, 3, 1}; + auto const keys = input{{1, 2, 3}, null_at(2)}; + auto const col = input{3, 3, 1}; CUDF_EXPECT_THROW_MESSAGE( test_single_scan(keys, diff --git a/cpp/tests/reductions/rank_tests.cpp b/cpp/tests/reductions/rank_tests.cpp index 2249fac4e2e..ffcbce08163 100644 --- a/cpp/tests/reductions/rank_tests.cpp +++ b/cpp/tests/reductions/rank_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,22 +20,40 @@ #include #include +#include #include #include +#include using aggregation = cudf::aggregation; using cudf::null_policy; using cudf::scan_type; -using namespace cudf::test::iterators; + +namespace cudf::test { + +using namespace iterators; + +template +using input = fixed_width_column_wrapper; +using rank_result_col = fixed_width_column_wrapper; +using percent_result_t = + cudf::detail::target_type_t; +using percent_result_col = fixed_width_column_wrapper; + +auto const rank = cudf::make_rank_aggregation(); +auto const dense_rank = cudf::make_dense_rank_aggregation(); +auto const percent_rank = cudf::make_percent_rank_aggregation(); + +auto constexpr INCLUSIVE_SCAN = cudf::scan_type::INCLUSIVE; +auto constexpr INCLUDE_NULLS = cudf::null_policy::INCLUDE; template struct TypedRankScanTest : BaseScanTest { inline void test_ungrouped_rank_scan(cudf::column_view const& input, cudf::column_view const& expect_vals, - std::unique_ptr const& agg, - null_policy null_handling) + std::unique_ptr const& agg) { - auto col_out = cudf::scan(input, agg, scan_type::INCLUSIVE, null_handling); + auto col_out = cudf::scan(input, agg, INCLUSIVE_SCAN, INCLUDE_NULLS); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expect_vals, col_out->view()); } }; @@ -55,16 +73,25 @@ TYPED_TEST(TypedRankScanTest, Rank) return make_vector({-120, -120, -120, -16, -16, 5, 6, 6, 6, 6, 34, 113}); return make_vector({5, 5, 5, 6, 6, 9, 11, 11, 11, 11, 14, 34}); }(); - auto col = this->make_column(v); - - auto const expected_dense_vals = - cudf::test::fixed_width_column_wrapper{1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 5, 6}; - auto const expected_rank_vals = - cudf::test::fixed_width_column_wrapper{1, 1, 1, 4, 4, 6, 7, 7, 7, 7, 11, 12}; - this->test_ungrouped_rank_scan( - *col, expected_dense_vals, cudf::make_dense_rank_aggregation(), null_policy::INCLUDE); - this->test_ungrouped_rank_scan( - *col, expected_rank_vals, cudf::make_rank_aggregation(), null_policy::INCLUDE); + auto const col = this->make_column(v); + + auto const expected_dense = rank_result_col{1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 5, 6}; + auto const expected_rank = rank_result_col{1, 1, 1, 4, 4, 6, 7, 7, 7, 7, 11, 12}; + auto const expected_percent = percent_result_col{0.0, + 0.0, + 0.0, + 3.0 / 11, + 3.0 / 11, + 5.0 / 11, + 6.0 / 11, + 6.0 / 11, + 6.0 / 11, + 6.0 / 11, + 10.0 / 11, + 11.0 / 11}; + this->test_ungrouped_rank_scan(*col, expected_dense, dense_rank); + this->test_ungrouped_rank_scan(*col, expected_rank, rank); + this->test_ungrouped_rank_scan(*col, expected_percent, percent_rank); } TYPED_TEST(TypedRankScanTest, RankWithNulls) @@ -74,132 +101,159 @@ TYPED_TEST(TypedRankScanTest, RankWithNulls) return make_vector({-120, -120, -120, -16, -16, 5, 6, 6, 6, 6, 34, 113}); return make_vector({5, 5, 5, 6, 6, 9, 11, 11, 11, 11, 14, 34}); }(); - auto const b = thrust::host_vector(std::vector{1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0}); - auto col = this->make_column(v, b); - - auto const expected_dense_vals = - cudf::test::fixed_width_column_wrapper{1, 1, 1, 2, 3, 4, 5, 5, 6, 6, 7, 8}; - auto const expected_rank_vals = - cudf::test::fixed_width_column_wrapper{1, 1, 1, 4, 5, 6, 7, 7, 9, 9, 11, 12}; - this->test_ungrouped_rank_scan( - *col, expected_dense_vals, cudf::make_dense_rank_aggregation(), null_policy::INCLUDE); - this->test_ungrouped_rank_scan( - *col, expected_rank_vals, cudf::make_rank_aggregation(), null_policy::INCLUDE); + auto const null_iter = nulls_at({3, 6, 7, 11}); + auto const b = thrust::host_vector(null_iter, null_iter + v.size()); + auto col = this->make_column(v, 
b); + + auto const expected_dense = rank_result_col{1, 1, 1, 2, 3, 4, 5, 5, 6, 6, 7, 8}; + auto const expected_rank = rank_result_col{1, 1, 1, 4, 5, 6, 7, 7, 9, 9, 11, 12}; + auto const expected_percent = percent_result_col{0.0, + 0.0, + 0.0, + 3.0 / 11, + 4.0 / 11, + 5.0 / 11, + 6.0 / 11, + 6.0 / 11, + 8.0 / 11, + 8.0 / 11, + 10.0 / 11, + 11.0 / 11}; + this->test_ungrouped_rank_scan(*col, expected_dense, dense_rank); + this->test_ungrouped_rank_scan(*col, expected_rank, rank); + this->test_ungrouped_rank_scan(*col, expected_percent, percent_rank); } -TYPED_TEST(TypedRankScanTest, MixedStructs) +namespace { +template +auto make_input_column() { - auto const v = [] { - if (std::is_signed::value) - return make_vector({-1, -1, -4, -4, -4, 5, 7, 7, 7, 9, 9, 9}); - return make_vector({0, 0, 4, 4, 4, 5, 7, 7, 7, 9, 9, 9}); - }(); - auto const b = thrust::host_vector(std::vector{1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1}); - auto col = this->make_column(v, b); - auto strings = cudf::test::strings_column_wrapper{ + if constexpr (std::is_same_v) { + return strings_column_wrapper{{"0", "0", "4", "4", "4", "5", "7", "7", "7", "9", "9", "9"}, + null_at(5)}; + } else { + return (std::is_signed_v) + ? input{{-1, -1, -4, -4, -4, 5, 7, 7, 7, 9, 9, 9}, null_at(5)} + : input{{0, 0, 4, 4, 4, 5, 7, 7, 7, 9, 9, 9}, null_at(5)}; + } +} + +auto make_strings_column() +{ + return strings_column_wrapper{ {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - std::vector> vector_of_columns; - vector_of_columns.push_back(std::move(col)); - vector_of_columns.push_back(strings.release()); - auto struct_col = cudf::test::structs_column_wrapper{std::move(vector_of_columns)}.release(); - - auto expected_dense_vals = - cudf::test::fixed_width_column_wrapper{1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8}; - auto expected_rank_vals = - cudf::test::fixed_width_column_wrapper{1, 1, 3, 3, 5, 6, 7, 7, 9, 10, 10, 12}; - - this->test_ungrouped_rank_scan( - *struct_col, expected_dense_vals, cudf::make_dense_rank_aggregation(), null_policy::INCLUDE); - this->test_ungrouped_rank_scan( - *struct_col, expected_rank_vals, cudf::make_rank_aggregation(), null_policy::INCLUDE); +} + +template +auto make_mixed_structs_column() +{ + auto col = make_input_column(); + auto strings = make_strings_column(); + return structs_column_wrapper{{col, strings}}; +} +} // namespace + +TYPED_TEST(TypedRankScanTest, MixedStructs) +{ + auto const struct_col = make_mixed_structs_column(); + auto const expected_dense = rank_result_col{1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8}; + auto const expected_rank = rank_result_col{1, 1, 3, 3, 5, 6, 7, 7, 9, 10, 10, 12}; + auto const expected_percent = percent_result_col{0.0, + 0.0, + 2.0 / 11, + 2.0 / 11, + 4.0 / 11, + 5.0 / 11, + 6.0 / 11, + 6.0 / 11, + 8.0 / 11, + 9.0 / 11, + 9.0 / 11, + 11.0 / 11}; + + this->test_ungrouped_rank_scan(struct_col, expected_dense, dense_rank); + this->test_ungrouped_rank_scan(struct_col, expected_rank, rank); + this->test_ungrouped_rank_scan(struct_col, expected_percent, percent_rank); } TYPED_TEST(TypedRankScanTest, NestedStructs) { - auto const v = [] { - if (std::is_signed::value) - return make_vector({-1, -1, -4, -4, -4, 5, 7, 7, 7, 9, 9, 9}); - return make_vector({0, 0, 4, 4, 4, 5, 7, 7, 7, 9, 9, 9}); + auto const nested_col = [&] { + auto struct_col = [&] { + auto col = make_input_column(); + auto strings = make_strings_column(); + return structs_column_wrapper{{col, strings}}; + }(); + auto col = make_input_column(); + return structs_column_wrapper{{struct_col, col}}; 
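+ // Note: flat_col below carries the same leaf columns without the extra nesting; the ranked results should agree, since struct rows are compared member-by-member in the same leaf order either way.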
}(); - auto const b = thrust::host_vector(std::vector{1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1}); - auto col1 = this->make_column(v, b); - auto col2 = this->make_column(v, b); - auto col3 = this->make_column(v, b); - auto col4 = this->make_column(v, b); - auto strings1 = cudf::test::strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - auto strings2 = cudf::test::strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - std::vector> struct_columns; - struct_columns.push_back(std::move(col1)); - struct_columns.push_back(strings1.release()); - auto struct_col = cudf::test::structs_column_wrapper{std::move(struct_columns)}; - std::vector> nested_columns; - nested_columns.push_back(struct_col.release()); - nested_columns.push_back(std::move(col2)); - auto nested_col = cudf::test::structs_column_wrapper{std::move(nested_columns)}; - std::vector> flat_columns; - flat_columns.push_back(std::move(col3)); - flat_columns.push_back(strings2.release()); - flat_columns.push_back(std::move(col4)); - auto flat_col = cudf::test::structs_column_wrapper{std::move(flat_columns)}; - - auto dense_out = cudf::scan( - nested_col, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto dense_expected = cudf::scan( - flat_col, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto rank_out = cudf::scan( - nested_col, cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto rank_expected = - cudf::scan(flat_col, cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); + auto const flat_col = [&] { + auto col = make_input_column(); + auto strings_col = make_strings_column(); + auto nuther_col = make_input_column(); + return structs_column_wrapper{{col, strings_col, nuther_col}}; + }(); + auto const dense_out = cudf::scan(nested_col, dense_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const dense_expected = cudf::scan(flat_col, dense_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(dense_out->view(), dense_expected->view()); + + auto const rank_out = cudf::scan(nested_col, rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const rank_expected = cudf::scan(flat_col, rank, INCLUSIVE_SCAN, INCLUDE_NULLS); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(rank_out->view(), rank_expected->view()); + + auto const percent_out = cudf::scan(nested_col, percent_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const percent_expected = cudf::scan(flat_col, percent_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(percent_out->view(), percent_expected->view()); } -TYPED_TEST(TypedRankScanTest, structsWithNullPushdown) +TYPED_TEST(TypedRankScanTest, StructsWithNullPushdown) { - auto const v = [] { - if (std::is_signed::value) - return make_vector({-1, -1, -4, -4, -4, 5, 7, 7, 7, 9, 9, 9}); - return make_vector({0, 0, 4, 4, 4, 5, 7, 7, 7, 9, 9, 9}); - }(); - auto const b = thrust::host_vector(std::vector{1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1}); - auto col = this->make_column(v, b); - auto strings = cudf::test::strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - std::vector> struct_columns; - struct_columns.push_back(std::move(col)); - struct_columns.push_back(strings.release()); - - auto struct_col = - cudf::make_structs_column(12, std::move(struct_columns), 0, rmm::device_buffer{}); - - struct_col->set_null_mask(create_null_mask(12, 
cudf::mask_state::ALL_NULL));
-  auto expected_null_result =
-    cudf::test::fixed_width_column_wrapper<size_type>{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
-  auto dense_null_out = cudf::scan(
-    *struct_col, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE);
-  auto rank_null_out = cudf::scan(
-    *struct_col, cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(dense_null_out->view(), expected_null_result);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(rank_null_out->view(), expected_null_result);
-
-  auto const struct_nulls =
-    thrust::host_vector<bool>(std::vector<bool>{1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  struct_col->set_null_mask(
-    cudf::test::detail::make_null_mask(struct_nulls.begin(), struct_nulls.end()));
-  auto expected_dense_vals =
-    cudf::test::fixed_width_column_wrapper<size_type>{1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 8, 9};
-  auto expected_rank_vals =
-    cudf::test::fixed_width_column_wrapper<size_type>{1, 2, 2, 4, 5, 6, 7, 7, 9, 10, 10, 12};
-  auto dense_out = cudf::scan(
-    *struct_col, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE);
-  auto rank_out = cudf::scan(
-    *struct_col, cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(dense_out->view(), expected_dense_vals);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(rank_out->view(), expected_rank_vals);
+  auto struct_col = make_mixed_structs_column().release();
+
+  // First, verify that if the structs column has only nulls, all output rows are ranked 1.
+  {
+    struct_col->set_null_mask(
+      create_null_mask(12, cudf::mask_state::ALL_NULL));  // Null mask not pushed down to members.
+    auto const expected_null_result = rank_result_col{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+    auto const expected_percent_rank_null_result =
+      percent_result_col{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+    auto const dense_out   = cudf::scan(*struct_col, dense_rank, INCLUSIVE_SCAN, INCLUDE_NULLS);
+    auto const rank_out    = cudf::scan(*struct_col, rank, INCLUSIVE_SCAN, INCLUDE_NULLS);
+    auto const percent_out = cudf::scan(*struct_col, percent_rank, INCLUSIVE_SCAN, INCLUDE_NULLS);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(dense_out->view(), expected_null_result);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(rank_out->view(), expected_null_result);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(percent_out->view(), expected_percent_rank_null_result);
+  }
+
+  // Next, verify that if the structs column has a null mask that is NOT pushed down to members,
+  // the ranks are still correct.
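+  // (Rows 1 and 2 are made null at the struct level below; they compare equal and
+  // therefore tie in rank, even though their non-null member values differ.)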
+ { + auto const null_iter = nulls_at({1, 2}); + struct_col->set_null_mask( + cudf::test::detail::make_null_mask(null_iter, null_iter + struct_col->size())); + auto const expected_dense = rank_result_col{1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 8, 9}; + auto const expected_rank = rank_result_col{1, 2, 2, 4, 5, 6, 7, 7, 9, 10, 10, 12}; + auto const expected_percent = percent_result_col{0.0, + 1.0 / 11, + 1.0 / 11, + 3.0 / 11, + 4.0 / 11, + 5.0 / 11, + 6.0 / 11, + 6.0 / 11, + 8.0 / 11, + 9.0 / 11, + 9.0 / 11, + 11.0 / 11}; + auto const dense_out = cudf::scan(*struct_col, dense_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const rank_out = cudf::scan(*struct_col, rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const percent_out = cudf::scan(*struct_col, percent_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(dense_out->view(), expected_dense); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(rank_out->view(), expected_rank); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(percent_out->view(), expected_percent); + } } struct RankScanTest : public cudf::test::BaseFixture { @@ -207,49 +261,66 @@ struct RankScanTest : public cudf::test::BaseFixture { TEST(RankScanTest, BoolRank) { - cudf::test::fixed_width_column_wrapper vals{0, 0, 0, 6, 6, 9, 11, 11, 11, 11, 14, 34}; - cudf::test::fixed_width_column_wrapper expected_dense_vals{ - 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2}; - cudf::test::fixed_width_column_wrapper expected_rank_vals{ - 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4}; - - auto dense_out = cudf::scan( - vals, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto rank_out = - cudf::scan(vals, cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_dense_vals, dense_out->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_rank_vals, rank_out->view()); + auto const vals = input{0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + auto const expected_dense = rank_result_col{1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2}; + auto const expected_rank = rank_result_col{1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4}; + auto const expected_percent = percent_result_col{0.0, + 0.0, + 0.0, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11}; + + auto const dense_out = cudf::scan(vals, dense_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const rank_out = cudf::scan(vals, rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const percent_out = cudf::scan(vals, percent_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_dense, dense_out->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_rank, rank_out->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_percent, percent_out->view()); } TEST(RankScanTest, BoolRankWithNull) { - cudf::test::fixed_width_column_wrapper vals{{0, 0, 0, 6, 6, 9, 11, 11, 11, 11, 14, 34}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0}}; - cudf::table_view order_table{std::vector{vals}}; - cudf::test::fixed_width_column_wrapper expected_dense_vals{ - 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3}; - cudf::test::fixed_width_column_wrapper expected_rank_vals{ - 1, 1, 1, 4, 4, 4, 4, 4, 9, 9, 9, 9}; - - auto nullable_dense_out = cudf::scan( - vals, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto nullable_rank_out = - cudf::scan(vals, cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_dense_vals, nullable_dense_out->view()); - 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_rank_vals, nullable_rank_out->view()); + auto const vals = input{{0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1}, nulls_at({8, 9, 10, 11})}; + auto const expected_dense = rank_result_col{1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3}; + auto const expected_rank = rank_result_col{1, 1, 1, 4, 4, 4, 4, 4, 9, 9, 9, 9}; + auto const expected_percent = percent_result_col{0.0, + 0.0, + 0.0, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 8.0 / 11, + 8.0 / 11, + 8.0 / 11, + 8.0 / 11}; + + auto nullable_dense_out = cudf::scan(vals, dense_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto nullable_rank_out = cudf::scan(vals, rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto nullable_percent_out = cudf::scan(vals, percent_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_dense, nullable_dense_out->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_rank, nullable_rank_out->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_percent, nullable_percent_out->view()); } TEST(RankScanTest, ExclusiveScan) { - cudf::test::fixed_width_column_wrapper vals{3, 4, 5}; - cudf::test::fixed_width_column_wrapper order_col{3, 3, 1}; - cudf::table_view order_table{std::vector{order_col}}; - - CUDF_EXPECT_THROW_MESSAGE( - cudf::scan( - vals, cudf::make_dense_rank_aggregation(), scan_type::EXCLUSIVE, null_policy::INCLUDE), - "Unsupported dense rank aggregation operator for exclusive scan"); - CUDF_EXPECT_THROW_MESSAGE( - cudf::scan(vals, cudf::make_rank_aggregation(), scan_type::EXCLUSIVE, null_policy::INCLUDE), - "Unsupported rank aggregation operator for exclusive scan"); + auto const vals = input{3, 4, 5}; + + CUDF_EXPECT_THROW_MESSAGE(cudf::scan(vals, dense_rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), + "Dense rank aggregation operator requires an inclusive scan"); + CUDF_EXPECT_THROW_MESSAGE(cudf::scan(vals, rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), + "Rank aggregation operator requires an inclusive scan"); + CUDF_EXPECT_THROW_MESSAGE(cudf::scan(vals, percent_rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), + "Percent rank aggregation operator requires an inclusive scan"); } + +} // namespace cudf::test diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation.java b/java/src/main/java/ai/rapids/cudf/Aggregation.java index 1d73bd71246..d10329ca0f2 100644 --- a/java/src/main/java/ai/rapids/cudf/Aggregation.java +++ b/java/src/main/java/ai/rapids/cudf/Aggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -66,8 +66,9 @@ enum Kind { MERGE_M2(27), RANK(28), DENSE_RANK(29), - TDIGEST(30), // This can take a delta argument for accuracy level - MERGE_TDIGEST(31); // This can take a delta argument for accuracy level + PERCENT_RANK(30), + TDIGEST(31), // This can take a delta argument for accuracy level + MERGE_TDIGEST(32); // This can take a delta argument for accuracy level final int nativeId; @@ -754,6 +755,19 @@ static DenseRankAggregation denseRank() { return new DenseRankAggregation(); } + static final class PercentRankAggregation extends NoParamAggregation { + private PercentRankAggregation() { + super(Kind.PERCENT_RANK); + } + } + + /** + * Get the row's percent ranking. + */ + static PercentRankAggregation percentRank() { + return new PercentRankAggregation(); + } + /** * Collect the values into a list. Nulls will be skipped. 
*/ diff --git a/java/src/main/java/ai/rapids/cudf/GroupByScanAggregation.java b/java/src/main/java/ai/rapids/cudf/GroupByScanAggregation.java index 219b6dde05d..006941deec6 100644 --- a/java/src/main/java/ai/rapids/cudf/GroupByScanAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/GroupByScanAggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -115,4 +115,11 @@ public static GroupByScanAggregation rank() { public static GroupByScanAggregation denseRank() { return new GroupByScanAggregation(Aggregation.denseRank()); } + + /** + * Get the row's percent ranking. + */ + public static GroupByScanAggregation percentRank() { + return new GroupByScanAggregation(Aggregation.percentRank()); + } } diff --git a/java/src/main/java/ai/rapids/cudf/ScanAggregation.java b/java/src/main/java/ai/rapids/cudf/ScanAggregation.java index 08489562adc..752fd57355b 100644 --- a/java/src/main/java/ai/rapids/cudf/ScanAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/ScanAggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -97,4 +97,11 @@ public static ScanAggregation rank() { public static ScanAggregation denseRank() { return new ScanAggregation(Aggregation.denseRank()); } + + /** + * Get the row's percent rank. + */ + public static ScanAggregation percentRank() { + return new ScanAggregation(Aggregation.percentRank()); + } } diff --git a/java/src/main/native/src/AggregationJni.cpp b/java/src/main/native/src/AggregationJni.cpp index 93a01854ced..f8c448566c8 100644 --- a/java/src/main/native/src/AggregationJni.cpp +++ b/java/src/main/native/src/AggregationJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
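For orientation, the PERCENT_RANK kind wired up below lands on the same libcudf scan
call exercised by the C++ tests earlier in this series; a minimal sketch (the header
choices and the column name `vals` are illustrative assumptions, not from this patch):

    #include <cudf/aggregation.hpp>
    #include <cudf/reduction.hpp>

    // Percent rank is only defined for inclusive scans. The result is a FLOAT64
    // column holding (rank - 1) / (row_count - 1) for each row, which matches the
    // k/11 values in the 12-row test expectations above.
    auto const agg    = cudf::make_percent_rank_aggregation();
    auto const result = cudf::scan(vals, agg, cudf::scan_type::INCLUSIVE,
                                   cudf::null_policy::INCLUDE);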
@@ -85,6 +85,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createNoParamAgg(JNIEnv return cudf::make_rank_aggregation(); case 29: // DENSE_RANK return cudf::make_dense_rank_aggregation(); + case 30: // PERCENT_RANK + return cudf::make_percent_rank_aggregation(); default: throw std::logic_error("Unsupported No Parameter Aggregation Operation"); } }(); @@ -139,10 +141,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createTDigestAgg(JNIEnv std::unique_ptr ret; // These numbers come from Aggregation.java and must stay in sync switch (kind) { - case 30: // TDIGEST + case 31: // TDIGEST ret = cudf::make_tdigest_aggregation(delta); break; - case 31: // MERGE_TDIGEST + case 32: // MERGE_TDIGEST ret = cudf::make_merge_tdigest_aggregation(delta); break; default: throw std::logic_error("Unsupported TDigest Aggregation Operation"); diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index f9c8029ed84..9c00cdbc084 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3099,6 +3099,28 @@ void testScanDenseRank() { } } + @Test + void testScanPercentRank() { + try (ColumnVector col1 = ColumnVector.fromBoxedInts(-97, -97, -97, null, -16, 5, null, null, 6, 6, 34, null); + ColumnVector col2 = ColumnVector.fromBoxedInts( 3, 3, 4, 7, 7, 7, 7, 7, 8, 8, 8, 9); + ColumnVector struct_order = ColumnVector.makeStruct(col1, col2); + ColumnVector expected = ColumnVector.fromBoxedDoubles( + 0.0, 0.0, 2.0/11, 3.0/11, 4.0/11, 5.0/11, 6.0/11, 6.0/11, 8.0/11, 8.0/11, 10.0/11, 1.0)) { + try (ColumnVector result = struct_order.scan(ScanAggregation.percentRank(), + ScanType.INCLUSIVE, NullPolicy.INCLUDE)) { + assertColumnsAreEqual(expected, result); + } + + // Exclude should have identical results + try (ColumnVector result = struct_order.scan(ScanAggregation.percentRank(), + ScanType.INCLUSIVE, NullPolicy.EXCLUDE)) { + assertColumnsAreEqual(expected, result); + } + + // Percent rank aggregations do not support ScanType.EXCLUSIVE + } + } + @Test void testWindowStatic() { try (Scalar one = Scalar.fromInt(1); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index db1327c5471..f309b1ee703 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -3930,8 +3930,8 @@ void testValidityFill() { @Test void testGroupByScan() { try (Table t1 = new Table.TestBuilder() - .column( "1", "1", "1", "1", "1", "1", "1", "2", "2", "2", "2") - .column( 0, 1, 3, 3, 5, 5, 5, 5, 5, 5, 5) + .column( "1", "1", "1", "1", "1", "1", "1", "2", "2", "2", "2") // GBY Key#0 + .column( 0, 1, 3, 3, 5, 5, 5, 5, 5, 5, 5) // GBY Key#1 .column(12.0, 14.0, 13.0, 17.0, 17.0, 17.0, null, null, 11.0, null, 10.0) .column( -9, null, -5, 0, 4, 4, 8, 2, 2, 2, null) .build()) { @@ -3945,16 +3945,18 @@ void testGroupByScan() { GroupByScanAggregation.min().onColumn(2), GroupByScanAggregation.max().onColumn(2), GroupByScanAggregation.rank().onColumn(3), - GroupByScanAggregation.denseRank().onColumn(3)); + GroupByScanAggregation.denseRank().onColumn(3), + GroupByScanAggregation.percentRank().onColumn(3)); Table expected = new Table.TestBuilder() - .column( "1", "1", "1", "1", "1", "1", "1", "2", "2", "2", "2") + .column( "1", "1", "1", "1", "1", "1", "1", "2", "2", "2", "2") .column( 0, 1, 3, 3, 5, 5, 5, 5, 5, 5, 5) .column(12.0, 14.0, 13.0, 30.0, 17.0, 34.0, null, null, 
11.0, null, 21.0)
          .column( 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 3) // odd why is this not 1 based?
          .column(12.0, 14.0, 13.0, 13.0, 17.0, 17.0, null, null, 11.0, null, 10.0)
          .column(12.0, 14.0, 13.0, 17.0, 17.0, 17.0, null, null, 11.0, null, 11.0)
-         .column(1, 1, 1, 2, 1, 1, 3, 1, 1, 1, 4)
-         .column(1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2)
+         .column( 1, 1, 1, 2, 1, 1, 3, 1, 1, 1, 4)
+         .column( 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2)
+         .column( 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
          .build()) {
        assertTablesAreEqual(expected, result);
      }

From d5b2448fc69f17509304d594f029d0df56984962 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 10 Feb 2022 17:32:54 -0500
Subject: [PATCH 11/47] Add regex flags to strings extract function (#10192)

Add the `regex_flags` parameter to the strings `extract()` functions so that
matching regex patterns is consistent with other strings regex APIs.
This also renames the `cudf::strings::extract_all` API function to
`cudf::strings::extract_all_record` to be consistent with the other `_record`
APIs that return a List column. This is in anticipation of supporting
`extractall` as requested in #7908.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Devavret Makkar (https://github.com/devavret)

URL: https://github.com/rapidsai/cudf/pull/10192
---
 cpp/include/cudf/strings/extract.hpp | 11 ++++--
 cpp/src/strings/extract/extract.cu | 9 +++--
 cpp/src/strings/extract/extract_all.cu | 17 ++++----
 cpp/tests/strings/extract_tests.cpp | 39 +++++++++++++++++--
 .../cudf/cudf/_lib/cpp/strings/contains.pxd | 11 +-----
 python/cudf/cudf/_lib/cpp/strings/extract.pxd | 6 ++-
 .../cudf/_lib/cpp/strings/regex_flags.pxd | 9 +++++
 python/cudf/cudf/_lib/strings/contains.pyx | 4 +-
 python/cudf/cudf/_lib/strings/extract.pyx | 10 +++--
 python/cudf/cudf/_lib/strings/findall.pyx | 2 +-
 python/cudf/cudf/core/column/string.py | 26 +++++++++----
 python/cudf/cudf/tests/test_string.py | 8 ++--
 12 files changed, 107 insertions(+), 45 deletions(-)
 create mode 100644 python/cudf/cudf/_lib/cpp/strings/regex_flags.pxd

diff --git a/cpp/include/cudf/strings/extract.hpp b/cpp/include/cudf/strings/extract.hpp
index 466f71aace0..94e9f36d7d3 100644
--- a/cpp/include/cudf/strings/extract.hpp
+++ b/cpp/include/cudf/strings/extract.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
 */
 #pragma once

+#include <cudf/strings/regex/flags.hpp>
 #include
 #include

@@ -48,12 +49,14 @@ namespace strings {
 *
 * @param strings Strings instance for this operation.
 * @param pattern The regular expression pattern with group indicators.
+ * @param flags Regex flags for interpreting special characters in the pattern.
 * @param mr Device memory resource used to allocate the returned table's device memory.
 * @return Columns of strings extracted from the input column.
 */
std::unique_ptr<table>
extract( strings_column_view const& strings, std::string const& pattern, + regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -69,7 +72,7 @@ std::unique_ptr
extract( * @code{.pseudo} * Example: * s = ["a1 b4", "b2", "c3 a5", "b", null] - * r = extract_all(s,"([ab])(\\d)") + * r = extract_all_record(s,"([ab])(\\d)") * r is now [ ["a", "1", "b", "4"], * ["b", "2"], * ["a", "5"], @@ -81,12 +84,14 @@ std::unique_ptr
extract( * * @param strings Strings instance for this operation. * @param pattern The regular expression pattern with group indicators. + * @param flags Regex flags for interpreting special characters in the pattern. * @param mr Device memory resource used to allocate any returned device memory. * @return Lists column containing strings extracted from the input column. */ -std::unique_ptr extract_all( +std::unique_ptr extract_all_record( strings_column_view const& strings, std::string const& pattern, + regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/src/strings/extract/extract.cu b/cpp/src/strings/extract/extract.cu index c4076dd61c1..a67af9442f0 100644 --- a/cpp/src/strings/extract/extract.cu +++ b/cpp/src/strings/extract/extract.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -83,6 +83,7 @@ struct extract_fn { std::unique_ptr
extract( strings_column_view const& strings, std::string const& pattern, + regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { @@ -91,7 +92,8 @@ std::unique_ptr
extract( auto const d_strings = *strings_column; // compile regex into device object - auto prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto prog = + reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); auto d_prog = *prog; // extract should include groups auto const groups = d_prog.group_counts(); @@ -150,10 +152,11 @@ std::unique_ptr
extract( std::unique_ptr
extract(strings_column_view const& strings, std::string const& pattern, + regex_flags const flags, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract(strings, pattern, rmm::cuda_stream_default, mr); + return detail::extract(strings, pattern, flags, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index c4749eae003..e27dccb9338 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -89,13 +89,14 @@ struct extract_fn { } // namespace /** - * @copydoc cudf::strings::extract_all + * @copydoc cudf::strings::extract_all_record * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr extract_all( +std::unique_ptr extract_all_record( strings_column_view const& strings, std::string const& pattern, + regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { @@ -103,7 +104,8 @@ std::unique_ptr extract_all( auto const d_strings = column_device_view::create(strings.parent(), stream); // Compile regex into device object. - auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto d_prog = + reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); // The extract pattern should always include groups. auto const groups = d_prog->group_counts(); CUDF_EXPECTS(groups > 0, "extract_all requires group indicators in the regex pattern."); @@ -179,12 +181,13 @@ std::unique_ptr extract_all( // external API -std::unique_ptr extract_all(strings_column_view const& strings, - std::string const& pattern, - rmm::mr::device_memory_resource* mr) +std::unique_ptr extract_all_record(strings_column_view const& strings, + std::string const& pattern, + regex_flags const flags, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_all(strings, pattern, rmm::cuda_stream_default, mr); + return detail::extract_all_record(strings, pattern, flags, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index 516882bd8ad..9a28dbf0697 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
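Before the new tests, a reminder of what the two supported flags change: MULTILINE
lets `^` and `$` anchor at embedded newlines, and DOTALL lets `.` match a newline
character. A minimal call looks like this (a sketch; `input` stands in for a strings
column and is not from this patch):

    auto const sv = cudf::strings_column_view(input);
    // With MULTILINE, "abc\nfff\nabc" has lines matching ^[a-c]+$, so "abc" is
    // extracted; with the default flags the anchors only match the whole string.
    auto const matched =
      cudf::strings::extract(sv, "(^[a-c]+$)", cudf::strings::regex_flags::MULTILINE);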
@@ -150,6 +150,39 @@ TEST_F(StringsExtractTests, ExtractEventTest) } } +TEST_F(StringsExtractTests, MultiLine) +{ + auto input = + cudf::test::strings_column_wrapper({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"}); + auto view = cudf::strings_column_view(input); + + auto results = cudf::strings::extract(view, "(^[a-c]+$)", cudf::strings::regex_flags::MULTILINE); + cudf::test::strings_column_wrapper expected_multiline({"abc", "abc", "abc", "", "abc"}, + {1, 1, 1, 0, 1}); + auto expected = cudf::table_view{{expected_multiline}}; + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); + results = cudf::strings::extract(view, "^([a-c]+)$"); + cudf::test::strings_column_wrapper expected_default({"", "", "abc", "", ""}, {0, 0, 1, 0, 0}); + expected = cudf::table_view{{expected_default}}; + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); +} + +TEST_F(StringsExtractTests, DotAll) +{ + auto input = cudf::test::strings_column_wrapper({"abc\nfa\nef", "fff\nabbc\nfff", "abcdef", ""}); + auto view = cudf::strings_column_view(input); + + auto results = cudf::strings::extract(view, "(a.*f)", cudf::strings::regex_flags::DOTALL); + cudf::test::strings_column_wrapper expected_dotall({"abc\nfa\nef", "abbc\nfff", "abcdef", ""}, + {1, 1, 1, 0}); + auto expected = cudf::table_view{{expected_dotall}}; + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); + results = cudf::strings::extract(view, "(a.*f)"); + cudf::test::strings_column_wrapper expected_default({"", "", "abcdef", ""}, {0, 0, 1, 0}); + expected = cudf::table_view{{expected_default}}; + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); +} + TEST_F(StringsExtractTests, EmptyExtractTest) { std::vector h_strings{nullptr, "AAA", "AAA_A", "AAA_AAA_", "A__", ""}; @@ -181,7 +214,7 @@ TEST_F(StringsExtractTests, ExtractAllTest) cudf::test::strings_column_wrapper input(h_input.begin(), h_input.end(), validity); auto sv = cudf::strings_column_view(input); - auto results = cudf::strings::extract_all(sv, "(\\d+) (\\w+)"); + auto results = cudf::strings::extract_all_record(sv, "(\\d+) (\\w+)"); bool valids[] = {true, true, true, false, false, false, true}; using LCW = cudf::test::lists_column_wrapper; @@ -201,7 +234,7 @@ TEST_F(StringsExtractTests, Errors) cudf::test::strings_column_wrapper input({"this column intentionally left blank"}); auto sv = cudf::strings_column_view(input); EXPECT_THROW(cudf::strings::extract(sv, "\\w+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::extract_all(sv, "\\w+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::extract_all_record(sv, "\\w+"), cudf::logic_error); } TEST_F(StringsExtractTests, MediumRegex) diff --git a/python/cudf/cudf/_lib/cpp/strings/contains.pxd b/python/cudf/cudf/_lib/cpp/strings/contains.pxd index 8014a60617d..59131f6e1aa 100644 --- a/python/cudf/cudf/_lib/cpp/strings/contains.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/contains.pxd @@ -1,20 +1,13 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from libcpp.string cimport string from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.strings.regex_flags cimport regex_flags -cdef extern from "cudf/strings/regex/flags.hpp" \ - namespace "cudf::strings" nogil: - - ctypedef enum regex_flags: - DEFAULT 'cudf::strings::regex_flags::DEFAULT' - MULTILINE 'cudf::strings::regex_flags::MULTILINE' - DOTALL 'cudf::strings::regex_flags::DOTALL' - cdef extern from "cudf/strings/contains.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] contains_re( diff --git a/python/cudf/cudf/_lib/cpp/strings/extract.pxd b/python/cudf/cudf/_lib/cpp/strings/extract.pxd index 606369c8994..43a43594997 100644 --- a/python/cudf/cudf/_lib/cpp/strings/extract.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/extract.pxd @@ -1,10 +1,11 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.string cimport string from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.strings.contains cimport regex_flags from cudf._lib.cpp.table.table cimport table @@ -12,4 +13,5 @@ cdef extern from "cudf/strings/extract.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[table] extract( column_view source_strings, - string pattern) except + + string pattern, + regex_flags flags) except + diff --git a/python/cudf/cudf/_lib/cpp/strings/regex_flags.pxd b/python/cudf/cudf/_lib/cpp/strings/regex_flags.pxd new file mode 100644 index 00000000000..2a5701fa6a3 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/strings/regex_flags.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +cdef extern from "cudf/strings/regex/flags.hpp" \ + namespace "cudf::strings" nogil: + + ctypedef enum regex_flags: + DEFAULT 'cudf::strings::regex_flags::DEFAULT' + MULTILINE 'cudf::strings::regex_flags::MULTILINE' + DOTALL 'cudf::strings::regex_flags::DOTALL' diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx index f18d0eb7f36..31f0fb2ec04 100644 --- a/python/cudf/cudf/_lib/strings/contains.pyx +++ b/python/cudf/cudf/_lib/strings/contains.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libc.stdint cimport uint32_t from libcpp.memory cimport unique_ptr @@ -12,8 +12,8 @@ from cudf._lib.cpp.strings.contains cimport ( contains_re as cpp_contains_re, count_re as cpp_count_re, matches_re as cpp_matches_re, - regex_flags as regex_flags, ) +from cudf._lib.cpp.strings.regex_flags cimport regex_flags from cudf._lib.scalar cimport DeviceScalar diff --git a/python/cudf/cudf/_lib/strings/extract.pyx b/python/cudf/cudf/_lib/strings/extract.pyx index 9d554aa2162..bac282dccc5 100644 --- a/python/cudf/cudf/_lib/strings/extract.pyx +++ b/python/cudf/cudf/_lib/strings/extract.pyx @@ -1,5 +1,6 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
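+# Note: `flags` arrives from the Python layer as a plain int (e.g. the value of
+# re.MULTILINE), is carried here as a uint32_t, and is cast to the C++ regex_flags
+# enum before the call into libcudf.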
+from libc.stdint cimport uint32_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move @@ -8,12 +9,13 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.strings.extract cimport extract as cpp_extract +from cudf._lib.cpp.strings.regex_flags cimport regex_flags from cudf._lib.cpp.table.table cimport table from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport data_from_unique_ptr -def extract(Column source_strings, object pattern): +def extract(Column source_strings, object pattern, uint32_t flags): """ Returns data which contains extracted capture groups provided in `pattern` for all `source_strings`. @@ -24,11 +26,13 @@ def extract(Column source_strings, object pattern): cdef column_view source_view = source_strings.view() cdef string pattern_string = str(pattern).encode() + cdef regex_flags c_flags = flags with nogil: c_result = move(cpp_extract( source_view, - pattern_string + pattern_string, + c_flags )) return data_from_unique_ptr( diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx index c4e4b6c38d8..2d62a3f9a4b 100644 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ b/python/cudf/cudf/_lib/strings/findall.pyx @@ -9,11 +9,11 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.scalar.scalar cimport string_scalar -from cudf._lib.cpp.strings.contains cimport regex_flags from cudf._lib.cpp.strings.findall cimport ( findall as cpp_findall, findall_record as cpp_findall_record, ) +from cudf._lib.cpp.strings.regex_flags cimport regex_flags from cudf._lib.cpp.table.table cimport table from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport data_from_unique_ptr diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index ee1ddb58abc..415200521ac 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -574,6 +574,8 @@ def extract( ---------- pat : str Regular expression pattern with capturing groups. + flags : int, default 0 (no flags) + Flags to pass through to the regex engine (e.g. re.MULTILINE) expand : bool, default True If True, return DataFrame with one column per capture group. If False, return a Series/Index if there is one capture group or @@ -588,8 +590,8 @@ def extract( Notes ----- - The `flags` parameter is not yet supported and will raise a - NotImplementedError if anything other than the default value is passed. + The `flags` parameter currently only supports re.DOTALL and + re.MULTILINE. 
Examples -------- @@ -618,10 +620,12 @@ def extract( 2 dtype: object """ # noqa W605 - if flags != 0: - raise NotImplementedError("`flags` parameter is not yet supported") + if not _is_supported_regex_flags(flags): + raise NotImplementedError( + "unsupported value for `flags` parameter" + ) - data, index = libstrings.extract(self._column, pat) + data, index = libstrings.extract(self._column, pat, flags) if len(data) == 1 and expand is False: data = next(iter(data.values())) else: @@ -752,7 +756,9 @@ def contains( flags = pat.flags & ~re.U pat = pat.pattern if not _is_supported_regex_flags(flags): - raise ValueError("invalid `flags` parameter value") + raise NotImplementedError( + "unsupported value for `flags` parameter" + ) if pat is None: result_col = column.column_empty( @@ -3393,7 +3399,9 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: flags = pat.flags & ~re.U pat = pat.pattern if not _is_supported_regex_flags(flags): - raise ValueError("invalid `flags` parameter value") + raise NotImplementedError( + "unsupported value for `flags` parameter" + ) return self._return_or_inplace( libstrings.count_re(self._column, pat, flags) @@ -3969,7 +3977,9 @@ def match( flags = pat.flags & ~re.U pat = pat.pattern if not _is_supported_regex_flags(flags): - raise ValueError("invalid `flags` parameter value") + raise NotImplementedError( + "unsupported value for `flags` parameter" + ) return self._return_or_inplace( libstrings.match_re(self._column, pat, flags) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 653c79fe603..efe8e523d4e 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -829,7 +829,9 @@ def test_string_join(ps_gs, sep): @pytest.mark.parametrize("pat", [r"(a)", r"(f)", r"([a-z])", r"([A-Z])"]) @pytest.mark.parametrize("expand", [True, False]) -@pytest.mark.parametrize("flags,flags_raise", [(0, 0), (1, 1)]) +@pytest.mark.parametrize( + "flags,flags_raise", [(0, 0), (re.M | re.S, 0), (re.I, 1)] +) def test_string_extract(ps_gs, pat, expand, flags, flags_raise): ps, gs = ps_gs expectation = raise_builder([flags_raise], NotImplementedError) @@ -862,9 +864,7 @@ def test_string_contains(ps_gs, pat, regex, flags, flags_raise, na, na_raise): ps, gs = ps_gs expectation = does_not_raise() - if flags_raise: - expectation = pytest.raises(ValueError) - if na_raise: + if flags_raise or na_raise: expectation = pytest.raises(NotImplementedError) with expectation: From dcac052bd90c53865203ccda1ae4829da9d5db1f Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 10 Feb 2022 22:02:56 -0500 Subject: [PATCH 12/47] Adding string row size iterator for row to column and column to row conversion (#10157) This is the first step to supporting variable-width strings in the row to column and column to row code. It adds an iterator that reads the offset columns inside string columns to compute the row sizes of this variable-width data. Note that this doesn't add support for strings yet, but is the first step in that direction. 
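The core of the change, sketched on the host for clarity (the device version in this
patch does the same arithmetic with thrust; the function name and the 8-byte row
alignment constant here are illustrative assumptions):

    #include <cstdint>
    #include <vector>

    using size_type = int32_t;

    // For each string column, row i occupies offsets[i + 1] - offsets[i] bytes.
    // Summing that across the string columns, adding the fixed-width-plus-validity
    // size, and rounding up to the row alignment yields the size of each JCUDF row.
    std::vector<size_type> string_row_sizes(
        std::vector<std::vector<size_type>> const &offset_columns,
        size_type fixed_width_and_validity_size,
        size_type alignment = 8) {
      auto const num_rows = offset_columns.empty() ?
          0 : static_cast<size_type>(offset_columns.front().size()) - 1;
      std::vector<size_type> row_sizes(num_rows, 0);
      for (auto const &offsets : offset_columns) {
        for (size_type row = 0; row < num_rows; ++row) {
          row_sizes[row] += offsets[row + 1] - offsets[row];
        }
      }
      for (auto &size : row_sizes) {
        size = (size + fixed_width_and_validity_size + alignment - 1) / alignment * alignment;
      }
      return row_sizes;
    }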
closes #10111 Authors: - Mike Wilson (https://github.com/hyperbolic2346) Approvers: - MithunR (https://github.com/mythrocks) - Nghia Truong (https://github.com/ttnghia) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/10157 --- java/src/main/native/src/row_conversion.cu | 244 +++++++++++++++------ 1 file changed, 183 insertions(+), 61 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 4a5265b1d2e..5a2aa44261d 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -187,8 +189,82 @@ struct batch_data { std::vector row_batches; // information about each batch such as byte count }; -struct row_offset_functor { - row_offset_functor(size_type fixed_width_only_row_size) +/** + * @brief builds row size information for tables that contain strings + * + * @param tbl table from which to compute row size information + * @param fixed_width_and_validity_size size of fixed-width and validity data in this table + * @param stream cuda stream on which to operate + * @return device vector of size_types of the row sizes of the table + */ +rmm::device_uvector build_string_row_sizes(table_view const &tbl, + size_type fixed_width_and_validity_size, + rmm::cuda_stream_view stream) { + auto const num_rows = tbl.num_rows(); + rmm::device_uvector d_row_sizes(num_rows, stream); + thrust::uninitialized_fill(rmm::exec_policy(stream), d_row_sizes.begin(), d_row_sizes.end(), 0); + + auto d_offsets_iterators = [&]() { + std::vector offsets_iterators; + auto offsets_iter = thrust::make_transform_iterator( + tbl.begin(), [](auto const &col) -> strings_column_view::offset_iterator { + if (!is_fixed_width(col.type())) { + CUDF_EXPECTS(col.type().id() == type_id::STRING, "only string columns are supported!"); + return strings_column_view(col).offsets_begin(); + } else { + return nullptr; + } + }); + std::copy_if(offsets_iter, offsets_iter + tbl.num_columns(), + std::back_inserter(offsets_iterators), + [](auto const &offset_ptr) { return offset_ptr != nullptr; }); + return make_device_uvector_async(offsets_iterators, stream); + }(); + + auto const num_columns = static_cast(d_offsets_iterators.size()); + + thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_columns * num_rows), + [d_offsets_iterators = d_offsets_iterators.data(), num_columns, num_rows, + d_row_sizes = d_row_sizes.data()] __device__(auto element_idx) { + auto const row = element_idx % num_rows; + auto const col = element_idx / num_rows; + auto const val = + d_offsets_iterators[col][row + 1] - d_offsets_iterators[col][row]; + atomicAdd(&d_row_sizes[row], val); + }); + + // transform the row sizes to include fixed width size and alignment + thrust::transform(rmm::exec_policy(stream), d_row_sizes.begin(), d_row_sizes.end(), + d_row_sizes.begin(), [fixed_width_and_validity_size] __device__(auto row_size) { + return util::round_up_unsafe(fixed_width_and_validity_size + row_size, + JCUDF_ROW_ALIGNMENT); + }); + + return d_row_sizes; +} + +/** + * @brief functor to return the offset of a row in a table with string columns + * + */ +struct string_row_offset_functor { + string_row_offset_functor(device_span _d_row_offsets) + : d_row_offsets(_d_row_offsets){}; + + 
__device__ inline size_type operator()(int row_number, int) const { + return d_row_offsets[row_number]; + } + + device_span d_row_offsets; +}; + +/** + * @brief functor to return the offset of a row in a table with only fixed-width columns + * + */ +struct fixed_width_row_offset_functor { + fixed_width_row_offset_functor(size_type fixed_width_only_row_size) : _fixed_width_only_row_size(fixed_width_only_row_size){}; __device__ inline size_type operator()(int row_number, int tile_row_start) const { @@ -542,6 +618,10 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum auto const relative_col = el / num_fetch_rows; auto const relative_row = el % num_fetch_rows; auto const absolute_col = relative_col + fetch_tile.start_col; + if (input_data[absolute_col] == nullptr) { + // variable-width data + continue; + } auto const absolute_row = relative_row + fetch_tile.start_row; auto const col_size = col_sizes[absolute_col]; auto const col_offset = col_offsets[absolute_col]; @@ -1194,10 +1274,8 @@ static size_type compute_column_information(iterator begin, iterator end, auto validity_offset = fixed_width_size_per_row; column_starts.push_back(validity_offset); - return util::round_up_unsafe( - fixed_width_size_per_row + - util::div_rounding_up_safe(static_cast(std::distance(begin, end)), CHAR_BIT), - JCUDF_ROW_ALIGNMENT); + return fixed_width_size_per_row + + util::div_rounding_up_safe(static_cast(std::distance(begin, end)), CHAR_BIT); } /** @@ -1512,20 +1590,27 @@ void determine_tiles(std::vector const &column_sizes, } } -#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - -} // namespace detail - -std::vector> convert_to_rows(table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - auto const num_columns = tbl.num_columns(); - auto const num_rows = tbl.num_rows(); - - auto const fixed_width_only = std::all_of( - tbl.begin(), tbl.end(), [](column_view const &c) { return is_fixed_width(c.type()); }); - +/** + * @brief convert cudf table into JCUDF row format + * + * @tparam offsetFunctor functor type for offset functor + * @param tbl table to convert to JCUDF row format + * @param batch_info information about the batches of data + * @param offset_functor functor that returns the starting offset of each row + * @param column_starts starting offset of a column in a row + * @param column_sizes size of each element in a column + * @param fixed_width_size_per_row size of fixed-width data in a row of this table + * @param stream stream used + * @param mr selected memory resource for returned data + * @return vector of list columns containing byte columns of the JCUDF row data + */ +template +std::vector> +convert_to_rows(table_view const &tbl, batch_data &batch_info, offsetFunctor offset_functor, + std::vector const &column_starts, + std::vector const &column_sizes, + size_type const fixed_width_size_per_row, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { int device_id; CUDA_TRY(cudaGetDevice(&device_id)); int total_shmem_in_bytes; @@ -1537,23 +1622,12 @@ std::vector> convert_to_rows(table_view const &tbl, sizeof(cuda::barrier) * NUM_TILES_PER_KERNEL_LOADED; auto const shmem_limit_per_tile = total_shmem_in_bytes / NUM_TILES_PER_KERNEL_LOADED; - // break up the work into tiles, which are a starting and ending row/col #. 
- // this tile size is calculated based on the shared memory size available - // we want a single tile to fill up the entire shared memory space available - // for the transpose-like conversion. - - // There are two different processes going on here. The GPU conversion of the data - // and the writing of the data into the list of byte columns that are a maximum of - // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand - // this limitation because the column must own the data inside and as a result it must be - // a distinct allocation for that column. Copying the data into these final buffers would - // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. - // The tiles are broken at the boundaries of specific rows based on the row sizes up - // to that point. These are row batches and they are decided first before building the - // tiles so the tiles can be properly cut around them. + auto const num_rows = tbl.num_rows(); + auto const num_columns = tbl.num_columns(); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); // Get the pointers to the input columnar data ready - auto data_begin = thrust::make_transform_iterator( tbl.begin(), [](auto const &c) { return c.template data(); }); std::vector input_data(data_begin, data_begin + tbl.num_columns()); @@ -1565,27 +1639,6 @@ std::vector> convert_to_rows(table_view const &tbl, auto dev_input_data = make_device_uvector_async(input_data, stream, mr); auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - column_sizes.reserve(num_columns); - column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - - auto schema_column_iter = - thrust::make_transform_iterator(thrust::make_counting_iterator(0), - [&tbl](auto i) -> std::tuple { - return {tbl.column(i).type(), tbl.column(i)}; - }); - - auto const fixed_width_size_per_row = detail::compute_column_information( - schema_column_iter, schema_column_iter + num_columns, column_starts, column_sizes); - - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - - // total encoded row size. This includes fixed-width data, validity, and variable-width data. 
- auto row_size_iter = thrust::make_constant_iterator(fixed_width_size_per_row); - auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); - // the first batch always exists unless we were sent an empty table auto const first_batch_size = batch_info.row_batches[0].row_count; @@ -1636,8 +1689,6 @@ std::vector> convert_to_rows(table_view const &tbl, util::div_rounding_up_unsafe(validity_tile_infos.size(), NUM_VALIDITY_TILES_PER_KERNEL)); dim3 validity_threads(std::min(validity_tile_infos.size() * 32, 128lu)); - detail::row_offset_functor offset_functor(fixed_width_size_per_row); - detail::copy_to_rows<<>>( num_rows, num_columns, shmem_limit_per_tile, gpu_tile_infos, dev_input_data.data(), dev_col_sizes.data(), dev_col_starts.data(), offset_functor, @@ -1670,6 +1721,76 @@ std::vector> convert_to_rows(table_view const &tbl, }); return ret; +} +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +} // namespace detail + +std::vector> convert_to_rows(table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + auto const num_columns = tbl.num_columns(); + auto const num_rows = tbl.num_rows(); + + auto const fixed_width_only = std::all_of( + tbl.begin(), tbl.end(), [](column_view const &c) { return is_fixed_width(c.type()); }); + + // break up the work into tiles, which are a starting and ending row/col #. + // this tile size is calculated based on the shared memory size available + // we want a single tile to fill up the entire shared memory space available + // for the transpose-like conversion. + + // There are two different processes going on here. The GPU conversion of the data + // and the writing of the data into the list of byte columns that are a maximum of + // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand + // this limitation because the column must own the data inside and as a result it must be + // a distinct allocation for that column. Copying the data into these final buffers would + // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. + // The tiles are broken at the boundaries of specific rows based on the row sizes up + // to that point. These are row batches and they are decided first before building the + // tiles so the tiles can be properly cut around them. + + std::vector column_sizes; // byte size of each column + std::vector column_starts; // offset of column inside a row including alignment + column_sizes.reserve(num_columns); + column_starts.reserve(num_columns + 1); // we add a final offset for validity data start + + auto schema_column_iter = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [&tbl](auto i) -> std::tuple { + return {tbl.column(i).type(), tbl.column(i)}; + }); + + auto const fixed_width_size_per_row = detail::compute_column_information( + schema_column_iter, schema_column_iter + num_columns, column_starts, column_sizes); + if (fixed_width_only) { + // total encoded row size. This includes fixed-width data and validity only. It does not include + // variable-width data since it isn't copied with the fixed-width and validity kernel. 
+ auto row_size_iter = thrust::make_constant_iterator( + util::round_up_unsafe(fixed_width_size_per_row, JCUDF_ROW_ALIGNMENT)); + + auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); + + detail::fixed_width_row_offset_functor offset_functor( + util::round_up_unsafe(fixed_width_size_per_row, JCUDF_ROW_ALIGNMENT)); + + return detail::convert_to_rows(tbl, batch_info, offset_functor, column_starts, column_sizes, + fixed_width_size_per_row, stream, mr); + } else { + auto row_sizes = detail::build_string_row_sizes(tbl, fixed_width_size_per_row, stream); + + auto row_size_iter = cudf::detail::make_counting_transform_iterator( + 0, detail::row_size_functor(num_rows, row_sizes.data(), 0)); + + auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); + + detail::string_row_offset_functor offset_functor(batch_info.batch_row_offsets); + + return detail::convert_to_rows(tbl, batch_info, offset_functor, column_starts, column_sizes, + fixed_width_size_per_row, stream, mr); + } + #else CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); return {}; @@ -1768,8 +1889,9 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { return std::make_tuple(schema[i], nullptr); }); - auto const fixed_width_size_per_row = - detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); + auto const fixed_width_size_per_row = util::round_up_unsafe( + detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes), + JCUDF_ROW_ALIGNMENT); // Ideally we would check that the offsets are all the same, etc. but for now // this is probably fine @@ -1842,7 +1964,7 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, dim3 validity_threads(std::min(validity_tile_infos.size() * 32, 128lu)); - detail::row_offset_functor offset_functor(fixed_width_size_per_row); + detail::fixed_width_row_offset_functor offset_functor(fixed_width_size_per_row); detail::copy_from_rows<<>>( num_rows, num_columns, shmem_limit_per_tile, offset_functor, gpu_batch_row_boundaries.data(), From 8cc84c69ef4ecb0e098d794a34c4e9268a06b49a Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 11 Feb 2022 08:06:44 -0500 Subject: [PATCH 13/47] Add libcudf strings split API that accepts regex pattern (#10128) Reference #3584 This PR adds 4 new libcudf strings APIs for split. - `cudf::strings::split_re` - split using regex to locate delimiters with table output like `cudf::strings::split`. - `cudf::strings::rsplit_re` - same as `split_re` but delimiter search starts from the end of each string - `cudf::strings::split_record_re` - same as `split_re` but returns a list column like `split_record` does - `cudf::strings::rsplit_record_re` - same as `split_record_re` but delimiter search starts from the end of each string Like `split/rsplit` the results try to match Pandas behavior for these. The `record` results are similar to specifying `expand=False` in the Pandas `split/rsplit` APIs. Python/Cython updates for cuDF will be in a follow-on PR. Currently, Pandas does not support regex for its `rsplit` even though it has been documented and there is an issue [here](https://github.com/pandas-dev/pandas/issues/29633). New gtests have been added for these along with some additional tests that were missing for the non-regex versions of these APIs. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Robert Maynard (https://github.com/robertmaynard) - AJ Schmidt (https://github.com/ajschmidt8) - https://github.com/nvdbaranec - Andy Grove (https://github.com/andygrove) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/10128 --- conda/recipes/libcudf/meta.yaml | 1 + cpp/CMakeLists.txt | 1 + cpp/include/cudf/strings/split/split_re.hpp | 232 ++++++++++++ cpp/src/strings/split/split_re.cu | 375 ++++++++++++++++++++ cpp/tests/strings/split_tests.cpp | 270 +++++++++++--- 5 files changed, 834 insertions(+), 45 deletions(-) create mode 100644 cpp/include/cudf/strings/split/split_re.hpp create mode 100644 cpp/src/strings/split/split_re.cu diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 57205f22e57..4e20c979f6c 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -216,6 +216,7 @@ test: - test -f $PREFIX/include/cudf/strings/replace_re.hpp - test -f $PREFIX/include/cudf/strings/split/partition.hpp - test -f $PREFIX/include/cudf/strings/split/split.hpp + - test -f $PREFIX/include/cudf/strings/split/split_re.hpp - test -f $PREFIX/include/cudf/strings/string_view.hpp - test -f $PREFIX/include/cudf/strings/strings_column_view.hpp - test -f $PREFIX/include/cudf/strings/strip.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 90e94ffcc7b..407e1f9a858 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -445,6 +445,7 @@ add_library( src/strings/search/find_multiple.cu src/strings/split/partition.cu src/strings/split/split.cu + src/strings/split/split_re.cu src/strings/split/split_record.cu src/strings/strings_column_factories.cu src/strings/strings_column_view.cpp diff --git 
a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp new file mode 100644 index 00000000000..320d1bdc9b4 --- /dev/null +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace cudf { +namespace strings { +/** + * @addtogroup strings_split + * @{ + * @file + */ + +/** + * @brief Splits strings elements into a table of strings columns + * using a regex pattern to delimit each string. + * + * Each element generates a vector of strings that are stored in corresponding + * rows in the output table -- `table[col,row] = token[col] of strings[row]` + * where `token` is a substring between delimiters. + * + * The number of rows in the output table will be the same as the number of + * elements in the input column. The resulting number of columns will be the + * maximum number of tokens found in any input row. + * + * The `pattern` is used to identify the delimiters within a string + * and splitting stops when either `maxsplit` or the end of the string is reached. + * + * An empty input string will produce a corresponding empty string in the + * corresponding row of the first column. + * A null row will produce corresponding null rows in the output table. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * s1 = split_re(s, "[_ ]") + * s1 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc", "", "ab", "cd"], + * ["def", "bc", "cd", ""], + * ["g", null, null, null] ] + * s2 = split_re(s, "[ _]", 1) + * s2 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc def_g", "_bc", "ab cd", "cd "] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory. + * @return A table of columns of strings. + */ +std::unique_ptr
split_re( + strings_column_view const& input, + std::string const& pattern, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Splits strings elements into a table of strings columns + * using a regex pattern to delimit each string starting from the end of the string. + * + * Each element generates a vector of strings that are stored in corresponding + * rows in the output table -- `table[col,row] = token[col] of string[row]` + * where `token` is the substring between each delimiter. + * + * The number of rows in the output table will be the same as the number of + * elements in the input column. The resulting number of columns will be the + * maximum number of tokens found in any input row. + * + * Splitting occurs by traversing starting from the end of the input string. + * The `pattern` is used to identify the delimiters within a string + * and splitting stops when either `maxsplit` or the beginning of the string + * is reached. + * + * An empty input string will produce a corresponding empty string in the + * corresponding row of the first column. + * A null row will produce corresponding null rows in the output table. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * s1 = rsplit_re(s, "[_ ]") + * s1 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc", "", "ab", "cd"], + * ["def", "bc", "cd", ""], + * ["g", null, null, null] ] + * s2 = rsplit_re(s, "[ _]", 1) + * s2 is a table of strings columns: + * [ ["a_bc def", "a_", "_ab", "ab"], + * ["g", "bc", "cd", "cd "] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory. + * @return A table of columns of strings. + */ +std::unique_ptr
rsplit_re( + strings_column_view const& input, + std::string const& pattern, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Splits strings elements into a list column of strings + * using the given regex pattern to delimit each string. + * + * Each element generates an array of strings that are stored in an output + * lists column -- `list[row] = [token1, token2, ...] found in input[row]` + * where `token` is a substring between delimiters. + * + * The number of elements in the output column will be the same as the number of + * elements in the input column. Each individual list item will contain the + * new strings for that row. The resulting number of strings in each row can vary + * from 0 to `maxsplit + 1`. + * + * The `pattern` is used to identify the delimiters within a string + * and splitting stops when either `maxsplit` or the end of the string is reached. + * + * An empty input string will produce a corresponding empty list item output row. + * A null row will produce a corresponding null output row. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * s1 = split_record_re(s, "[_ ]") + * s1 is a lists column of strings: + * [ ["a", "bc", "def", "g"], + * ["a", "", "bc"], + * ["", "ab", "cd"], + * ["ab", "cd", ""] ] + * s2 = split_record_re(s, "[ _]", 1) + * s2 is a lists column of strings: + * [ ["a", "bc def_g"], + * ["a", "_bc"], + * ["", "ab cd"], + * ["ab", "cd "] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory. + * @return Lists column of strings. + */ +std::unique_ptr split_record_re( + strings_column_view const& input, + std::string const& pattern, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Splits strings elements into a list column of strings + * using the given regex pattern to delimit each string starting from the end of the string. + * + * Each element generates a vector of strings that are stored in an output + * lists column -- `list[row] = [token1, token2, ...] found in input[row]` + * where `token` is a substring between delimiters. + * + * The number of elements in the output column will be the same as the number of + * elements in the input column. Each individual list item will contain the + * new strings for that row. The resulting number of strings in each row can vary + * from 0 to `maxsplit + 1`. + * + * Splitting occurs by traversing starting from the end of the input string. + * The `pattern` is used to identify the separation points within a string + * and splitting stops when either `maxsplit` or the beginning of the string + * is reached. + * + * An empty input string will produce a corresponding empty list item output row. + * A null row will produce a corresponding null output row. 
+ * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * s1 = rsplit_record_re(s, "[_ ]") + * s1 is a lists column of strings: + * [ ["a", "bc", "def", "g"], + * ["a", "", "bc"], + * ["", "ab", "cd"], + * ["ab", "cd", ""] ] + * s2 = rsplit_record_re(s, "[ _]", 1) + * s2 is a lists column of strings: + * [ ["a_bc def", "g"], + * ["a_", "bc"], + * ["_ab", "cd"], + * ["ab_cd", ""] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory. + * @return Lists column of strings. + */ +std::unique_ptr rsplit_record_re( + strings_column_view const& input, + std::string const& pattern, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of doxygen group +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu new file mode 100644 index 00000000000..d80148f2fe6 --- /dev/null +++ b/cpp/src/strings/split/split_re.cu @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace cudf { +namespace strings { +namespace detail { +namespace { + +using string_index_pair = thrust::pair; + +enum class split_direction { + FORWARD, ///< for split logic + BACKWARD ///< for rsplit logic +}; + +/** + * @brief Identify the tokens from the `idx'th` string element of `d_strings`. + * + * Each string's tokens are stored in the `d_tokens` vector. + * The `d_token_offsets` specifies the output position within `d_tokens` + * for each string. 
+ */ +template +struct token_reader_fn { + column_device_view const d_strings; + reprog_device prog; + split_direction const direction; + offset_type const* d_token_offsets; + string_index_pair* d_tokens; + + __device__ void operator()(size_type idx) + { + if (d_strings.is_null(idx)) { return; } + auto const d_str = d_strings.element(idx); + + auto const token_offset = d_token_offsets[idx]; + auto const token_count = d_token_offsets[idx + 1] - token_offset; + auto const d_result = d_tokens + token_offset; // store tokens here + + size_type token_idx = 0; + size_type begin = 0; // characters + size_type end = d_str.length(); + size_type last_pos = 0; // bytes + while (prog.find(idx, d_str, begin, end) > 0) { + // get the token (characters just before this match) + auto const token = + string_index_pair{d_str.data() + last_pos, d_str.byte_offset(begin) - last_pos}; + // store it if we have space + if (token_idx < token_count - 1) { + d_result[token_idx++] = token; + } else { + if (direction == split_direction::FORWARD) { break; } // we are done + for (auto l = 0; l < token_idx - 1; ++l) { + d_result[l] = d_result[l + 1]; // shift left + } + d_result[token_idx - 1] = token; + } + // setup for next match + last_pos = d_str.byte_offset(end); + begin = end + (begin == end); + end = d_str.length(); + } + + // set the last token to the remainder of the string + d_result[token_idx] = string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos}; + + if (direction == split_direction::BACKWARD) { + // update first entry -- this happens when max_tokens is hit before the end of the string + auto const first_offset = + d_result[0].first + ? static_cast(thrust::distance(d_str.data(), d_result[0].first)) + : 0; + if (first_offset) { + d_result[0] = string_index_pair{d_str.data(), first_offset + d_result[0].second}; + } + } + } +}; + +/** + * @brief Call regex to split each input string into tokens. + * + * This will also convert the `offsets` values from counts to offsets. + * + * @param d_strings Strings to split + * @param d_prog Regex to evaluate against each string + * @param direction Whether tokens are generated forwards or backwards. + * @param max_tokens The maximum number of tokens for each split. + * @param offsets The number of matches on input. + * The offsets for each token in each string on output. + * @param stream CUDA stream used for kernel launches. + */ +rmm::device_uvector generate_tokens(column_device_view const& d_strings, + reprog_device& d_prog, + split_direction direction, + size_type maxsplit, + mutable_column_view& offsets, + rmm::cuda_stream_view stream) +{ + auto const strings_count = d_strings.size(); + + auto const max_tokens = maxsplit > 0 ? maxsplit : std::numeric_limits::max(); + + auto const begin = thrust::make_counting_iterator(0); + auto const end = thrust::make_counting_iterator(strings_count); + auto const d_offsets = offsets.data(); + + // convert match counts to token offsets + auto map_fn = [d_strings, d_offsets, max_tokens] __device__(auto idx) { + return d_strings.is_null(idx) ? 
0 : std::min(d_offsets[idx], max_tokens) + 1; + }; + thrust::transform_exclusive_scan( + rmm::exec_policy(stream), begin, end + 1, d_offsets, map_fn, 0, thrust::plus{}); + + // the last offset entry is the total number of tokens to be generated + auto const total_tokens = cudf::detail::get_value(offsets, strings_count, stream); + + // generate tokens for each string + rmm::device_uvector tokens(total_tokens, stream); + auto const regex_insts = d_prog.insts_counts(); + if (regex_insts <= RX_SMALL_INSTS) { + token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); + } else if (regex_insts <= RX_MEDIUM_INSTS) { + token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); + } else if (regex_insts <= RX_LARGE_INSTS) { + token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); + } else { + token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); + } + + return tokens; +} + +/** + * @brief Returns string pair for the specified column for each string in `d_strings` + * + * This is used to build the table result of a split. + * Null is returned if the row is null or if the `column_index` is larger + * than the token count for that string. + */ +struct tokens_transform_fn { + column_device_view const d_strings; + string_index_pair const* d_tokens; + offset_type const* d_token_offsets; + size_type const column_index; + + __device__ string_index_pair operator()(size_type idx) const + { + auto const offset = d_token_offsets[idx]; + auto const token_count = d_token_offsets[idx + 1] - offset; + return (column_index >= token_count) || d_strings.is_null(idx) + ? string_index_pair{nullptr, 0} + : d_tokens[offset + column_index]; + } +}; + +std::unique_ptr
split_re(strings_column_view const& input, + std::string const& pattern, + split_direction direction, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); + + auto const strings_count = input.size(); + + std::vector> results; + if (strings_count == 0) { + results.push_back(make_empty_column(type_id::STRING)); + return std::make_unique
(std::move(results)); + } + + // create the regex device prog from the given pattern + auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto d_strings = column_device_view::create(input.parent(), stream); + + // count the number of delimiters matched in each string + auto offsets = count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource()); + auto offsets_view = offsets->mutable_view(); + auto d_offsets = offsets_view.data(); + + // get the split tokens from the input column; this also converts the counts into offsets + auto tokens = generate_tokens(*d_strings, *d_prog, direction, maxsplit, offsets_view, stream); + + // the output column count is the maximum number of tokens generated for any input string + auto const columns_count = thrust::transform_reduce( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + [d_offsets] __device__(auto const idx) -> size_type { + return d_offsets[idx + 1] - d_offsets[idx]; + }, + 0, + thrust::maximum{}); + + // boundary case: if no columns, return one all-null column (custrings issue #119) + if (columns_count == 0) { + results.push_back(std::make_unique( + data_type{type_id::STRING}, + strings_count, + rmm::device_buffer{0, stream, mr}, // no data + cudf::detail::create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr), + strings_count)); + return std::make_unique
(std::move(results)); + } + + // convert the tokens into multiple strings columns + auto make_strings_lambda = [&](size_type column_index) { + // returns appropriate token for each row/column + auto indices_itr = cudf::detail::make_counting_transform_iterator( + 0, tokens_transform_fn{*d_strings, tokens.data(), d_offsets, column_index}); + return make_strings_column(indices_itr, indices_itr + strings_count, stream, mr); + }; + // build a vector of columns + results.resize(columns_count); + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(columns_count), + results.begin(), + make_strings_lambda); + + return std::make_unique
(std::move(results)); +} + +std::unique_ptr split_record_re(strings_column_view const& input, + std::string const& pattern, + split_direction direction, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); + + auto const strings_count = input.size(); + + // create the regex device prog from the given pattern + auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto d_strings = column_device_view::create(input.parent(), stream); + + // count the number of delimiters matched in each string + auto offsets = count_matches(*d_strings, *d_prog, stream, mr); + auto offsets_view = offsets->mutable_view(); + + // get the split tokens from the input column; this also converts the counts into offsets + auto tokens = generate_tokens(*d_strings, *d_prog, direction, maxsplit, offsets_view, stream); + + // convert the tokens into one big strings column + auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr); + + // create a lists column using the offsets and the strings columns + return make_lists_column(strings_count, + std::move(offsets), + std::move(strings_output), + input.null_count(), + copy_bitmask(input.parent(), stream, mr), + stream, + mr); +} + +} // namespace + +std::unique_ptr
split_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return split_re(input, pattern, split_direction::FORWARD, maxsplit, stream, mr); +} + +std::unique_ptr split_record_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return split_record_re(input, pattern, split_direction::FORWARD, maxsplit, stream, mr); +} + +std::unique_ptr
rsplit_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return split_re(input, pattern, split_direction::BACKWARD, maxsplit, stream, mr); +} + +std::unique_ptr rsplit_record_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return split_record_re(input, pattern, split_direction::BACKWARD, maxsplit, stream, mr); +} + +} // namespace detail + +// external APIs + +std::unique_ptr
split_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::split_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr); +} + +std::unique_ptr split_record_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::split_record_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr); +} + +std::unique_ptr
rsplit_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::rsplit_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr); +} + +std::unique_ptr rsplit_record_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::rsplit_record_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr); +} +} // namespace strings +} // namespace cudf diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index de4e48fd70a..f0d7315929b 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,14 +18,15 @@ #include #include #include +#include #include #include #include #include #include +#include #include -#include #include @@ -240,41 +241,6 @@ TEST_F(StringsSplitTest, RSplitWhitespaceWithMax) CUDF_TEST_EXPECT_TABLES_EQUAL(*results, *expected); } -TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns) -{ - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto results = cudf::strings::split(zero_size_strings_column); - EXPECT_TRUE(results->num_columns() == 1); - cudf::test::expect_strings_empty(results->get_column(0)); - results = cudf::strings::rsplit(zero_size_strings_column); - EXPECT_TRUE(results->num_columns() == 1); - cudf::test::expect_strings_empty(results->get_column(0)); -} - -// This test specifically for https://github.com/rapidsai/custrings/issues/119 -TEST_F(StringsSplitTest, AllNullsCase) -{ - std::vector h_strings{nullptr, nullptr, nullptr}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - - auto results = cudf::strings::split(cudf::strings_column_view(strings)); - EXPECT_TRUE(results->num_columns() == 1); - auto column = results->get_column(0).view(); - EXPECT_TRUE(column.size() == 3); - EXPECT_TRUE(column.has_nulls()); - EXPECT_TRUE(column.null_count() == column.size()); - results = cudf::strings::split(cudf::strings_column_view(strings), cudf::string_scalar("-")); - EXPECT_TRUE(results->num_columns() == 1); - column = results->get_column(0); - EXPECT_TRUE(column.size() == 3); - EXPECT_TRUE(column.has_nulls()); - EXPECT_TRUE(column.null_count() == column.size()); -} - TEST_F(StringsSplitTest, SplitRecord) { std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; @@ -339,6 +305,127 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } +TEST_F(StringsSplitTest, SplitRegex) +{ + std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); + auto sv = cudf::strings_column_view(input); + + { + auto result = cudf::strings::split_re(sv, "\\s+"); + + cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, validity); + 
cudf::test::strings_column_wrapper col1({"Héllo", "", "some", "String", ""}, {1, 0, 1, 1, 0}); + cudf::test::strings_column_wrapper col2({"thesé", "", "", "", ""}, {1, 0, 1, 0, 0}); + auto expected = cudf::table_view({col0, col1, col2}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + + // rsplit == split when using default parameters + result = cudf::strings::rsplit_re(sv, "\\s+"); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + } + + { + auto result = cudf::strings::split_re(sv, "[eé]"); + + cudf::test::strings_column_wrapper col0({" H", "", "ar", "t", ""}, validity); + cudf::test::strings_column_wrapper col1({"llo th", "", " som", "st String", ""}, + {1, 0, 1, 1, 0}); + cudf::test::strings_column_wrapper col2({"s", "", " ", "", ""}, {1, 0, 1, 0, 0}); + cudf::test::strings_column_wrapper col3({"", "", "", "", ""}, {1, 0, 0, 0, 0}); + auto expected = cudf::table_view({col0, col1, col2, col3}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + + // rsplit == split when using default parameters + result = cudf::strings::rsplit_re(sv, "[eé]"); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + } +} + +TEST_F(StringsSplitTest, SplitRecordRegex) +{ + std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); + auto sv = cudf::strings_column_view(input); + + using LCW = cudf::test::lists_column_wrapper; + { + auto result = cudf::strings::split_record_re(sv, "\\s+"); + + LCW expected( + {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", ""}, LCW{"tést", "String"}, LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + + // rsplit == split when using default parameters + result = cudf::strings::rsplit_record_re(sv, "\\s+"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + } + + { + auto result = cudf::strings::split_record_re(sv, "[eé]"); + + LCW expected({LCW{" H", "llo th", "s", ""}, + LCW{}, + LCW{"ar", " som", " "}, + LCW{"t", "st String"}, + LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + + // rsplit == split when using default parameters + result = cudf::strings::rsplit_record_re(sv, "[eé]"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + } +} + +TEST_F(StringsSplitTest, SplitRegexWithMaxSplit) +{ + std::vector h_strings{" Héllo\tthesé", nullptr, "are\nsome ", "tést\rString", ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); + auto sv = cudf::strings_column_view(input); + { + auto result = cudf::strings::split_re(sv, "\\s+", 1); + + cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1}); + cudf::test::strings_column_wrapper col1({"Héllo\tthesé", "", "some ", "String", ""}, + {1, 0, 1, 1, 0}); + auto expected = cudf::table_view({col0, col1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + + // split everything is the same output as maxsplit==2 for the test input column here + result = cudf::strings::split_re(sv, "\\s+", 2); + auto expected2 = cudf::strings::split_re(sv, "\\s+"); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected2->view()); + } + { + auto result = 
cudf::strings::split_record_re(sv, "\\s", 1); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected1( + {LCW{"", "Héllo\tthesé"}, LCW{}, LCW{"are", "some "}, LCW{"tést", "String"}, LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected1); + + result = cudf::strings::split_record_re(sv, "\\s", 2); + LCW expected2( + {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", " "}, LCW{"tést", "String"}, LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected2); + + // split everything is the same output as maxsplit==3 for the test input column here + result = cudf::strings::split_record_re(sv, "\\s", 3); + auto expected0 = cudf::strings::split_record_re(sv, "\\s"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view()); + } +} + TEST_F(StringsSplitTest, RSplitRecord) { std::vector h_strings{ @@ -430,14 +517,100 @@ TEST_F(StringsSplitTest, RSplitRecordWhitespaceWithMaxSplit) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } -TEST_F(StringsSplitTest, SplitRecordZeroSizeStringsColumns) +TEST_F(StringsSplitTest, RSplitRegexWithMaxSplit) +{ + std::vector h_strings{" Héllo\tthesé", nullptr, "are some\n ", "tést\rString", ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); + auto sv = cudf::strings_column_view(input); + + { + auto result = cudf::strings::rsplit_re(sv, "\\s+", 1); + + cudf::test::strings_column_wrapper col0({" Héllo", "", "are some", "tést", ""}, validity); + cudf::test::strings_column_wrapper col1({"thesé", "", "", "String", ""}, {1, 0, 1, 1, 0}); + auto expected = cudf::table_view({col0, col1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + } + { + auto result = cudf::strings::rsplit_record_re(sv, "\\s+", 1); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected( + {LCW{" Héllo", "thesé"}, LCW{}, LCW{"are some", ""}, LCW{"tést", "String"}, LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + + // split everything is the same output as any maxsplit > 2 for the test input column here + result = cudf::strings::rsplit_record_re(sv, "\\s+", 3); + auto expected0 = cudf::strings::rsplit_record_re(sv, "\\s+"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view()); + } +} + +TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns) { cudf::column_view zero_size_strings_column( cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto split_record_result = cudf::strings::split_record(zero_size_strings_column); - EXPECT_TRUE(split_record_result->size() == 0); - auto rsplit_record_result = cudf::strings::rsplit_record(zero_size_strings_column); - EXPECT_TRUE(rsplit_record_result->size() == 0); + auto results = cudf::strings::split(zero_size_strings_column); + EXPECT_TRUE(results->num_columns() == 1); + EXPECT_TRUE(results->num_rows() == 0); + results = cudf::strings::rsplit(zero_size_strings_column); + EXPECT_TRUE(results->num_columns() == 1); + EXPECT_TRUE(results->num_rows() == 0); + results = cudf::strings::split_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(results->num_columns() == 1); + EXPECT_TRUE(results->num_rows() == 0); + results = cudf::strings::rsplit_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(results->num_columns() == 1); + EXPECT_TRUE(results->num_rows() == 0); + + auto list_result = 
cudf::strings::split_record(zero_size_strings_column); + EXPECT_TRUE(list_result->size() == 0); + list_result = cudf::strings::rsplit_record(zero_size_strings_column); + EXPECT_TRUE(list_result->size() == 0); + list_result = cudf::strings::split_record_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(list_result->size() == 0); + list_result = cudf::strings::rsplit_record_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(list_result->size() == 0); +} + +// This test specifically for https://github.com/rapidsai/custrings/issues/119 +TEST_F(StringsSplitTest, AllNullsCase) +{ + cudf::test::strings_column_wrapper input({"", "", ""}, {0, 0, 0}); + auto sv = cudf::strings_column_view(input); + + auto results = cudf::strings::split(sv); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::split(sv, cudf::string_scalar("-")); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::rsplit(sv); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::rsplit(sv, cudf::string_scalar("-")); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::split_re(sv, "-"); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::rsplit_re(sv, "-"); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + + auto list_result = cudf::strings::split_record(sv); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{}, LCW{}, LCW{}}, cudf::test::iterators::all_nulls()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); + list_result = cudf::strings::rsplit_record(sv); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); + list_result = cudf::strings::split_record_re(sv, "-"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); + list_result = cudf::strings::rsplit_record_re(sv, "-"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); } TEST_F(StringsSplitTest, Partition) @@ -612,13 +785,20 @@ TEST_F(StringsSplitTest, PartitionZeroSizeStringsColumns) TEST_F(StringsSplitTest, InvalidParameter) { - std::vector h_strings{"string left intentionally blank"}; - cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); - auto strings_view = cudf::strings_column_view(strings); + cudf::test::strings_column_wrapper input({"string left intentionally blank"}); + auto strings_view = cudf::strings_column_view(input); EXPECT_THROW(cudf::strings::split(strings_view, cudf::string_scalar("", false)), cudf::logic_error); EXPECT_THROW(cudf::strings::rsplit(strings_view, cudf::string_scalar("", false)), cudf::logic_error); + EXPECT_THROW(cudf::strings::split_record(strings_view, cudf::string_scalar("", false)), + cudf::logic_error); + EXPECT_THROW(cudf::strings::rsplit_record(strings_view, cudf::string_scalar("", false)), + cudf::logic_error); + EXPECT_THROW(cudf::strings::split_re(strings_view, ""), cudf::logic_error); + EXPECT_THROW(cudf::strings::split_record_re(strings_view, ""), cudf::logic_error); + EXPECT_THROW(cudf::strings::rsplit_re(strings_view, ""), cudf::logic_error); + 
EXPECT_THROW(cudf::strings::rsplit_record_re(strings_view, ""), cudf::logic_error); EXPECT_THROW(cudf::strings::partition(strings_view, cudf::string_scalar("", false)), cudf::logic_error); EXPECT_THROW(cudf::strings::rpartition(strings_view, cudf::string_scalar("", false)), From 4c92f86cc6ffd190cbab68d394ba4d820502cf1d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 11 Feb 2022 09:45:26 -0800 Subject: [PATCH 14/47] Add tests of reflected ufuncs and fix behavior of logical reflected ufuncs (#10261) This PR fixes cases like `cupy.array([1, 2, 3]) < cudf.Series([1, 2, 3])`. The code would previously attempt to find a reflected operator `Series.__rle__`, but no such operator exists because of the inherent symmetry in comparison operators. These changes fix the detection of such operators to just use operator `__ge__` in this case. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Michael Wang (https://github.com/isVoid) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/10261 --- python/cudf/cudf/core/series.py | 24 +++++++++++++-- python/cudf/cudf/tests/test_array_ufunc.py | 35 +++++++++++++++++----- 2 files changed, 49 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 41bf75fb48f..ddaa358e9d9 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -998,9 +998,27 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # First look for methods of the class. fname = ufunc.__name__ if fname in binary_operations: - not_reflect = self is inputs[0] - other = inputs[not_reflect] - op = f"__{'' if not_reflect else 'r'}{binary_operations[fname]}__" + reflect = self is not inputs[0] + other = inputs[0] if reflect else inputs[1] + + # These operators need to be mapped to their inverses when + # performing a reflected operation because no reflected version of + # the operators themselves exist. + ops_without_reflection = { + "gt": "lt", + "ge": "le", + "lt": "gt", + "le": "ge", + # ne and eq are symmetric, so they are their own inverse op + "ne": "ne", + "eq": "eq", + } + + op = binary_operations[fname] + if reflect and op in ops_without_reflection: + op = ops_without_reflection[op] + reflect = False + op = f"__{'r' if reflect else ''}{op}__" # pandas bitwise operations return bools if indexes are misaligned. # TODO: Generalize for other types of Frames diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index 2db0e8d29b8..a384ddecca6 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -1,3 +1,5 @@ +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
+ import operator from functools import reduce @@ -98,17 +100,26 @@ def test_ufunc_series(ufunc, has_nulls, indexed): raise -@pytest.mark.parametrize("ufunc", [np.add, np.greater, np.logical_and]) +@pytest.mark.parametrize( + "ufunc", [np.add, np.greater, np.greater_equal, np.logical_and] +) @pytest.mark.parametrize("has_nulls", [True, False]) @pytest.mark.parametrize("indexed", [True, False]) @pytest.mark.parametrize("type_", ["cupy", "numpy", "list"]) -def test_binary_ufunc_series_array(ufunc, has_nulls, indexed, type_): +@pytest.mark.parametrize("reflect", [True, False]) +def test_binary_ufunc_series_array(ufunc, has_nulls, indexed, type_, reflect): fname = ufunc.__name__ - if fname == "greater" and has_nulls: + if fname in ("greater", "greater_equal") and has_nulls: pytest.xfail( "The way cudf casts nans in arrays to nulls during binops with " "cudf objects is currently incompatible with pandas." ) + if reflect and has_nulls and type_ == "cupy": + pytest.skip( + "When cupy is the left operand there is no way for us to avoid " + "calling its binary operators, which cannot handle cudf objects " + "that contain nulls." + ) N = 100 # Avoid zeros in either array to skip division by 0 errors. Also limit the # scale to avoid issues with overflow, etc. We use ints because some @@ -136,18 +147,28 @@ def test_binary_ufunc_series_array(ufunc, has_nulls, indexed, type_): if type_ == "list": arg1 = arg1.tolist() - got = ufunc(args[0], arg1) - expect = ufunc(args[0].to_pandas(), args[1].to_numpy()) + if reflect: + got = ufunc(arg1, args[0]) + expect = ufunc(args[1].to_numpy(), args[0].to_pandas()) + else: + got = ufunc(args[0], arg1) + expect = ufunc(args[0].to_pandas(), args[1].to_numpy()) if ufunc.nout > 1: for g, e in zip(got, expect): if has_nulls: e[mask] = np.nan - assert_eq(g, e) + if type_ == "cupy" and reflect: + assert (cp.asnumpy(g) == e).all() + else: + assert_eq(g, e) else: if has_nulls: expect[mask] = np.nan - assert_eq(got, expect) + if type_ == "cupy" and reflect: + assert (cp.asnumpy(got) == expect).all() + else: + assert_eq(got, expect) @pytest.mark.parametrize( From 48c4dc316a344cfce724e505dd6c81735972cc9e Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 11 Feb 2022 12:59:54 -0600 Subject: [PATCH 15/47] Add more `nvtx` annotations (#10256) This PR adds more `nvtx` annotations for `cudf` & `dask_cudf` APIs. 
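For reference, the annotation pattern applied throughout this PR is roughly the following (a minimal sketch assuming the `nvtx` Python package; the function names here are illustrative, not part of cudf):

    from nvtx import annotate

    # Decorator form: the wrapped call appears as a named NVTX range,
    # grouped under the "cudf_python" domain in profiler timelines.
    @annotate("MY_OPERATION", color="green", domain="cudf_python")
    def my_operation(df):
        return df + 1

    # Context-manager form, for spots where a decorator is unsuitable
    # (e.g. when only part of a function body should be annotated).
    def my_other_operation(df):
        with annotate("MY_OTHER_OPERATION", color="purple", domain="cudf_python"):
            return df * 2

The resulting ranges can then be inspected with an NVTX-aware profiler such as Nsight Systems (for example, `nsys profile --trace=nvtx,cuda python script.py`).
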
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/10256 --- python/cudf/cudf/core/dataframe.py | 100 ++++++++++++++++++++- python/cudf/cudf/core/frame.py | 117 ++++++++++++++++++++++++- python/cudf/cudf/io/parquet.py | 17 ++++ python/dask_cudf/dask_cudf/backends.py | 25 +++++- python/dask_cudf/dask_cudf/core.py | 43 ++++++++- python/dask_cudf/dask_cudf/groupby.py | 47 ++++++++++ python/dask_cudf/dask_cudf/sorting.py | 11 ++- 7 files changed, 350 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8a49efabcde..c3c79135c34 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -112,6 +112,7 @@ def __setitem__(self, key, value): key = (key, slice(None)) return self._setitem_tuple_arg(key, value) + @annotate("_CAN_DOWNCAST_TO_SERIES", color="green", domain="cudf_python") def _can_downcast_to_series(self, df, arg): """ This method encapsulates the logic used @@ -152,6 +153,7 @@ def _can_downcast_to_series(self, df, arg): return True return False + @annotate("_DOWNCAST_TO_SERIES", color="green", domain="cudf_python") def _downcast_to_series(self, df, arg): """ "Downcast" from a DataFrame to a Series @@ -193,6 +195,7 @@ class _DataFrameLocIndexer(_DataFrameIndexer): For selection by label. """ + @annotate("_GETITEM_SCALAR", color="green", domain="cudf_python") def _getitem_scalar(self, arg): return self._frame[arg[1]].loc[arg[0]] @@ -634,6 +637,9 @@ def __init__( if dtype: self._data = self.astype(dtype)._data + @annotate( + "DATAFRAME_INIT_FROM_SERIES_LIST", color="blue", domain="cudf_python" + ) def _init_from_series_list(self, data, columns, index): if index is None: # When `index` is `None`, the final index of @@ -732,6 +738,9 @@ def _init_from_series_list(self, data, columns, index): ) self._data = self._data.select_by_label(columns) + @annotate( + "DATAFRAME_INIT_FROM_LIST_LIKE", color="blue", domain="cudf_python" + ) def _init_from_list_like(self, data, index=None, columns=None): if index is None: index = RangeIndex(start=0, stop=len(data)) @@ -768,6 +777,9 @@ def _init_from_list_like(self, data, index=None, columns=None): self.columns = columns + @annotate( + "DATAFRAME_INIT_FROM_DICT_LIKE", color="blue", domain="cudf_python" + ) def _init_from_dict_like( self, data, index=None, columns=None, nan_as_null=None ): @@ -841,6 +853,11 @@ def _from_data( return out @staticmethod + @annotate( + "DATAFRAME_ALIGN_INPUT_SERIES_INDICES", + color="blue", + domain="cudf_python", + ) def _align_input_series_indices(data, index): data = data.copy() @@ -1186,6 +1203,7 @@ def __delitem__(self, name): """ self._drop_column(name) + @annotate("DATAFRAME_SLICE", color="blue", domain="cudf_python") def _slice(self: T, arg: slice) -> T: """ _slice : slice the frame as per the arg @@ -1247,11 +1265,13 @@ def _slice(self: T, arg: slice) -> T: result.columns = self.columns return result + @annotate("DATAFRAME_MEMORY_USAGE", color="blue", domain="cudf_python") def memory_usage(self, index=True, deep=False): return Series( {str(k): v for k, v in super().memory_usage(index, deep).items()} ) + @annotate("DATAFRAME_ARRAY_UFUNC", color="blue", domain="cudf_python") def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): if method == "__call__" and hasattr(cudf, ufunc.__name__): func = getattr(cudf, ufunc.__name__) @@ -1259,6 +1279,7 @@ def __array_ufunc__(self, ufunc, method, 
*inputs, **kwargs): else: return NotImplemented + @annotate("DATAFRAME_ARRAY_FUNCTION", color="blue", domain="cudf_python") def __array_function__(self, func, types, args, kwargs): cudf_df_module = DataFrame @@ -1297,6 +1318,7 @@ def __array_function__(self, func, types, args, kwargs): return NotImplemented # The _get_numeric_data method is necessary for dask compatibility. + @annotate("DATAFRAME_GET_NUMERIC_DATA", color="blue", domain="cudf_python") def _get_numeric_data(self): """Return a dataframe with only numeric data types""" columns = [ @@ -1306,6 +1328,7 @@ def _get_numeric_data(self): ] return self[columns] + @annotate("DATAFRAME_ASSIGN", color="blue", domain="cudf_python") def assign(self, **kwargs): """ Assign columns to DataFrame from keyword arguments. @@ -1784,10 +1807,12 @@ def _get_renderable_dataframe(self): return output + @annotate("DATAFRAME_REPR", color="blue", domain="cudf_python") def __repr__(self): output = self._get_renderable_dataframe() return self._clean_renderable_dataframe(output) + @annotate("DATAFRAME_REPR_HTML", color="blue", domain="cudf_python") def _repr_html_(self): lines = ( self._get_renderable_dataframe() @@ -1804,9 +1829,13 @@ def _repr_html_(self): lines.append("") return "\n".join(lines) + @annotate("DATAFRAME_REPR_LATEX", color="blue", domain="cudf_python") def _repr_latex_(self): return self._get_renderable_dataframe().to_pandas()._repr_latex_() + @annotate( + "DATAFRAME_GET_COLUMNS_BY_LABEL", color="blue", domain="cudf_python" + ) def _get_columns_by_label(self, labels, downcast=False): """ Return columns of dataframe by `labels` @@ -1829,6 +1858,7 @@ def _get_columns_by_label(self, labels, downcast=False): ) return out + @annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python") def _binaryop( self, other: Any, @@ -1919,6 +1949,7 @@ def _binaryop( index=lhs._index, ) + @annotate("DATAFRAME_UPDATE", color="blue", domain="cudf_python") def update( self, other, @@ -2012,14 +2043,17 @@ def update( self._mimic_inplace(source_df, inplace=True) + @annotate("DATAFRAME_ITER", color="blue", domain="cudf_python") def __iter__(self): return iter(self.columns) + @annotate("DATAFRAME_ITERITEMS", color="blue", domain="cudf_python") def iteritems(self): """Iterate over column names and series pairs""" for k in self: yield (k, self[k]) + @annotate("DATAFRAME_EQUALS", color="blue", domain="cudf_python") def equals(self, other, **kwargs): ret = super().equals(other) # If all other checks matched, validate names. @@ -2082,6 +2116,7 @@ def columns(self, columns): data, multiindex=is_multiindex, level_names=columns.names, ) + @annotate("DATAFRAME_REINDEX_INTERNAL", color="blue", domain="cudf_python") def _reindex( self, columns, dtypes=None, deep=False, index=None, inplace=False ): @@ -2158,6 +2193,7 @@ def _reindex( return self._mimic_inplace(result, inplace=inplace) + @annotate("DATAFRAME_REINDEX", color="blue", domain="cudf_python") def reindex( self, labels=None, axis=None, index=None, columns=None, copy=True ): @@ -2236,6 +2272,7 @@ def reindex( inplace=False, ) + @annotate("DATAFRAME_SET_INDEX", color="blue", domain="cudf_python") def set_index( self, keys, @@ -2496,7 +2533,7 @@ def reset_index( inplace=inplace, ) - @annotate("INSERT", color="green", domain="cudf_python") + @annotate("DATAFRAME_INSERT", color="green", domain="cudf_python") def insert(self, loc, name, value, nan_as_null=None): """Add a column to DataFrame at the index specified by loc. 
@@ -2622,6 +2659,7 @@ def diff(self, periods=1, axis=0): return self - self.shift(periods=periods) + @annotate("DATAFRAME_DROP", color="green", domain="cudf_python") def drop( self, labels=None, @@ -2795,12 +2833,14 @@ def drop( if not inplace: return out + @annotate("DATAFRAME_DROP_COLUMN", color="green", domain="cudf_python") def _drop_column(self, name): """Drop a column by *name*""" if name not in self._data: raise KeyError(f"column '{name}' does not exist") del self._data[name] + @annotate("DATAFRAME_DROP_DUPLICATES", color="green", domain="cudf_python") def drop_duplicates( self, subset=None, keep="first", inplace=False, ignore_index=False ): @@ -2878,12 +2918,14 @@ def drop_duplicates( return self._mimic_inplace(outdf, inplace=inplace) + @annotate("DATAFRAME_POP", color="green", domain="cudf_python") def pop(self, item): """Return a column and drop it from the DataFrame.""" popped = self[item] del self[item] return popped + @annotate("DATAFRAME_RENAME", color="green", domain="cudf_python") def rename( self, mapper=None, @@ -3027,6 +3069,7 @@ def rename( else: return out.copy(deep=copy) + @annotate("DATAFRAME_ADD_PREFIX", color="green", domain="cudf_python") def add_prefix(self, prefix): out = self.copy(deep=True) out.columns = [ @@ -3034,6 +3077,7 @@ def add_prefix(self, prefix): ] return out + @annotate("DATAFRAME_ADD_SUFFIX", color="green", domain="cudf_python") def add_suffix(self, suffix): out = self.copy(deep=True) out.columns = [ @@ -3041,6 +3085,7 @@ def add_suffix(self, suffix): ] return out + @annotate("DATAFRAME_AGG", color="green", domain="cudf_python") def agg(self, aggs, axis=None): """ Aggregate using one or more operations over the specified axis. @@ -3172,6 +3217,7 @@ def agg(self, aggs, axis=None): else: raise ValueError("argument must be a string, list or dict") + @annotate("DATAFRAME_NLARGEST", color="green", domain="cudf_python") def nlargest(self, n, columns, keep="first"): """Get the rows of the DataFrame sorted by the n largest value of *columns* @@ -3303,6 +3349,7 @@ def nsmallest(self, n, columns, keep="first"): """ return self._n_largest_or_smallest(False, n, columns, keep) + @annotate("DATAFRAME_TRANSPOSE", color="green", domain="cudf_python") def transpose(self): """Transpose index and columns. @@ -3333,6 +3380,7 @@ def transpose(self): T = property(transpose, doc=transpose.__doc__) + @annotate("DATAFRAME_MELT", color="green", domain="cudf_python") def melt(self, **kwargs): """Unpivots a DataFrame from wide format to long format, optionally leaving identifier variables set. @@ -3362,7 +3410,7 @@ def melt(self, **kwargs): return melt(self, **kwargs) - @annotate("JOIN", color="blue", domain="cudf_python") + @annotate("DATAFRAME_JOIN", color="blue", domain="cudf_python") def merge( self, right, @@ -3544,6 +3592,7 @@ def join( ) return df + @annotate("DATAFRAME_GROUPBY", color="green", domain="cudf_python") @copy_docstring(DataFrameGroupBy) def groupby( self, @@ -3659,7 +3708,7 @@ def query(self, expr, local_dict=None): """ # can't use `annotate` decorator here as we inspect the calling # environment. 
- with annotate("QUERY", color="purple", domain="cudf_python"): + with annotate("DATAFRAME_QUERY", color="purple", domain="cudf_python"): if local_dict is None: local_dict = {} @@ -3683,6 +3732,7 @@ def query(self, expr, local_dict=None): boolmask = queryutils.query_execute(self, expr, callenv) return self._apply_boolean_mask(boolmask) + @annotate("DATAFRAME_APPLY", color="green", domain="cudf_python") def apply( self, func, axis=1, raw=False, result_type=None, args=(), **kwargs ): @@ -3831,6 +3881,7 @@ def apply( return self._apply(func, _get_row_kernel, *args, **kwargs) + @annotate("DATAFRAME_APPLY_ROWS", color="green", domain="cudf_python") @applyutils.doc_apply() def apply_rows( self, @@ -3909,6 +3960,7 @@ def apply_rows( cache_key=cache_key, ) + @annotate("DATAFRAME_APPLY_CHUNKS", color="green", domain="cudf_python") @applyutils.doc_applychunks() def apply_chunks( self, @@ -3976,6 +4028,9 @@ def apply_chunks( tpb=tpb, ) + @annotate( + "DATAFRAME_PARTITION_BY_HASH", color="green", domain="cudf_python" + ) def partition_by_hash(self, columns, nparts, keep_index=True): """Partition the dataframe by the hashed value of data in *columns*. @@ -4313,6 +4368,7 @@ def _sizeof_fmt(num, size_qualifier): cudf.utils.ioutils.buffer_write_lines(buf, lines) + @annotate("DATAFRAME_DESCRIBE", color="green", domain="cudf_python") @docutils.doc_describe() def describe( self, @@ -4372,6 +4428,7 @@ def describe( sort=False, ) + @annotate("DATAFRAME_TO_PANDAS", color="green", domain="cudf_python") def to_pandas(self, nullable=False, **kwargs): """ Convert to a Pandas DataFrame. @@ -4458,6 +4515,7 @@ def to_pandas(self, nullable=False, **kwargs): return out_df @classmethod + @annotate("DATAFRAME_FROM_PANDAS", color="green", domain="cudf_python") def from_pandas(cls, dataframe, nan_as_null=None): """ Convert from a Pandas DataFrame. @@ -4527,6 +4585,7 @@ def from_pandas(cls, dataframe, nan_as_null=None): return result @classmethod + @annotate("DATAFRAME_FROM_ARROW", color="green", domain="cudf_python") def from_arrow(cls, table): """ Convert from PyArrow Table to DataFrame. @@ -4582,6 +4641,7 @@ def from_arrow(cls, table): return out + @annotate("DATAFRAME_TO_ARROW", color="green", domain="cudf_python") def to_arrow(self, preserve_index=True): """ Convert to a PyArrow Table. @@ -4663,6 +4723,7 @@ def to_arrow(self, preserve_index=True): return out.replace_schema_metadata(metadata) + @annotate("DATAFRAME_TO_RECORDS", color="green", domain="cudf_python") def to_records(self, index=True): """Convert to a numpy recarray @@ -4686,6 +4747,7 @@ def to_records(self, index=True): return ret @classmethod + @annotate("DATAFRAME_FROM_RECORDS", color="green", domain="cudf_python") def from_records(cls, data, index=None, columns=None, nan_as_null=False): """ Convert structured or record ndarray to DataFrame. @@ -4747,6 +4809,9 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): return df @classmethod + @annotate( + "DATAFRAME_FROM_ARRAYS_INTERNAL", color="green", domain="cudf_python" + ) def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): """Convert a numpy/cupy array to DataFrame. 
@@ -4806,6 +4871,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): df._index = as_index(index) return df + @annotate("DATAFRAME_INTERPOLATE", color="green", domain="cudf_python") def interpolate( self, method="linear", @@ -4836,6 +4902,7 @@ def interpolate( **kwargs, ) + @annotate("DATAFRAME_QUANTILE", color="green", domain="cudf_python") def quantile( self, q=0.5, @@ -4951,6 +5018,7 @@ def quantile( result.index = q return result + @annotate("DATAFRAME_QUANTILES", color="green", domain="cudf_python") def quantiles(self, q=0.5, interpolation="nearest"): """ Return values at the given quantile. @@ -4990,6 +5058,7 @@ def quantiles(self, q=0.5, interpolation="nearest"): result.index = as_index(q) return result + @annotate("DATAFRAME_ISIN", color="green", domain="cudf_python") def isin(self, values): """ Whether each element in the DataFrame is contained in values. @@ -5127,6 +5196,9 @@ def isin(self, values): # # Stats # + @annotate( + "DATAFRAME_PREPARE_FOR_ROWWISE_OP", color="green", domain="cudf_python" + ) def _prepare_for_rowwise_op(self, method, skipna): """Prepare a DataFrame for CuPy-based row-wise operations.""" @@ -5176,6 +5248,7 @@ def _prepare_for_rowwise_op(self, method, skipna): coerced = coerced.astype("int64", copy=False) return coerced, mask, common_dtype + @annotate("DATAFRAME_COUNT", color="green", domain="cudf_python") def count(self, axis=0, level=None, numeric_only=False, **kwargs): """ Count ``non-NA`` cells for each column or row. @@ -5222,6 +5295,7 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): "columns": 1, } + @annotate("DATAFRAME_REDUCE", color="green", domain="cudf_python") def _reduce( self, op, axis=None, level=None, numeric_only=None, **kwargs, ): @@ -5246,6 +5320,7 @@ def _reduce( elif axis == 1: return self._apply_cupy_method_axis_1(op, **kwargs) + @annotate("DATAFRAME_SCAN", color="green", domain="cudf_python") def _scan( self, op, axis=None, *args, **kwargs, ): @@ -5256,6 +5331,7 @@ def _scan( elif axis == 1: return self._apply_cupy_method_axis_1(f"cum{op}", **kwargs) + @annotate("DATAFRAME_MODE", color="green", domain="cudf_python") def mode(self, axis=0, numeric_only=False, dropna=True): """ Get the mode(s) of each element along the selected axis. 
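
To show what these annotations buy in practice, here is a minimal sketch of a workload (hypothetical script; assumes cudf is installed and the script is run under an NVTX-aware profiler):

    import cudf

    df = cudf.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
    # With this patch, each call below opens a named NVTX range in the
    # "cudf_python" domain (e.g. "DATAFRAME_DESCRIBE" for describe() and
    # "DATAFRAME_JOIN" for merge(), as annotated above), so profiler time
    # can be attributed to individual cudf APIs.
    df.describe()
    df.merge(df, on="a")
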
@@ -5355,6 +5431,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True): return df + @annotate("DATAFRAME_KURTOSIS", color="green", domain="cudf_python") def kurtosis( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -5363,6 +5440,7 @@ def kurtosis( axis, skipna, level, numeric_only, **kwargs ) + @annotate("DATAFRAME_SKEW", color="green", domain="cudf_python") def skew( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -5371,14 +5449,17 @@ def skew( axis, skipna, level, numeric_only, **kwargs ) + @annotate("DATAFRAME_ALL", color="green", domain="cudf_python") def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): obj = self.select_dtypes(include="bool") if bool_only else self return super(DataFrame, obj).all(axis, skipna, level, **kwargs) + @annotate("DATAFRAME_ANY", color="green", domain="cudf_python") def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): obj = self.select_dtypes(include="bool") if bool_only else self return super(DataFrame, obj).any(axis, skipna, level, **kwargs) + @annotate("DATAFRAME_APPLY_CUPY", color="green", domain="cudf_python") def _apply_cupy_method_axis_1(self, method, *args, **kwargs): # This method uses cupy to perform scans and reductions along rows of a # DataFrame. Since cuDF is designed around columnar storage and @@ -5479,6 +5560,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): result_df.columns = prepared.columns return result_df + @annotate("DATAFRAME_COLUMNS_VIEW", color="green", domain="cudf_python") def _columns_view(self, columns): """ Return a subset of the DataFrame's columns as a view. @@ -5487,6 +5569,7 @@ def _columns_view(self, columns): {col: self._data[col] for col in columns}, index=self.index ) + @annotate("DATAFRAME_SELECT_DTYPES", color="green", domain="cudf_python") def select_dtypes(self, include=None, exclude=None): """Return a subset of the DataFrame’s columns based on the column dtypes. @@ -5673,6 +5756,7 @@ def to_orc(self, fname, compression=None, *args, **kwargs): orc.to_orc(self, fname, compression, *args, **kwargs) + @annotate("DATAFRAME_STACK", color="green", domain="cudf_python") def stack(self, level=-1, dropna=True): """Stack the prescribed level(s) from columns to index @@ -5734,6 +5818,7 @@ def stack(self, level=-1, dropna=True): else: return result + @annotate("DATAFRAME_COV", color="green", domain="cudf_python") def cov(self, **kwargs): """Compute the covariance matrix of a DataFrame. @@ -5751,6 +5836,7 @@ def cov(self, **kwargs): df.columns = self.columns return df + @annotate("DATAFRAME_CORR", color="green", domain="cudf_python") def corr(self): """Compute the correlation matrix of a DataFrame.""" corr = cupy.corrcoef(self.values, rowvar=False) @@ -5758,6 +5844,7 @@ def corr(self): df.columns = self.columns return df + @annotate("DATAFRAME_TO_STRUCT", color="green", domain="cudf_python") def to_struct(self, name=None): """ Return a struct Series composed of the columns of the DataFrame. @@ -5782,6 +5869,7 @@ def to_struct(self, name=None): name=name, ) + @annotate("DATAFRAME_KEYS", color="green", domain="cudf_python") def keys(self): """ Get the columns. @@ -5829,6 +5917,7 @@ def iterrows(self): "if you wish to iterate over each row." 
) + @annotate("DATAFRAME_APPEND", color="green", domain="cudf_python") def append( self, other, ignore_index=False, verify_integrity=False, sort=False ): @@ -5981,6 +6070,7 @@ def append( return cudf.concat(to_concat, ignore_index=ignore_index, sort=sort) + @annotate("DATAFRAME_PIVOT", color="green", domain="cudf_python") @copy_docstring(reshape.pivot) def pivot(self, index, columns, values=None): @@ -5988,12 +6078,14 @@ def pivot(self, index, columns, values=None): self, index=index, columns=columns, values=values ) + @annotate("DATAFRAME_UNSTACK", color="green", domain="cudf_python") @copy_docstring(reshape.unstack) def unstack(self, level=-1, fill_value=None): return cudf.core.reshape.unstack( self, level=level, fill_value=fill_value ) + @annotate("DATAFRAME_EXPLODE", color="green", domain="cudf_python") def explode(self, column, ignore_index=False): """ Transform each element of a list-like to a row, replicating index @@ -6199,6 +6291,7 @@ def func(left, right, output): ) +@annotate("CUDF_FROM_PANDAS", color="green", domain="cudf_python") def from_pandas(obj, nan_as_null=None): """ Convert certain Pandas objects into the cudf equivalent. @@ -6319,6 +6412,7 @@ def from_pandas(obj, nan_as_null=None): ) +@annotate("CUDF_MERGE", color="green", domain="cudf_python") def merge(left, right, *args, **kwargs): return left.merge(right, *args, **kwargs) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 8d8dd2a1bc0..2d073d46891 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -123,6 +123,7 @@ def deserialize(cls, header, frames): return cls_deserialize._from_data(dict(zip(column_names, columns))) @classmethod + @annotate("FRAME_FROM_DATA", color="green", domain="cudf_python") def _from_data( cls, data: MutableMapping, @@ -133,6 +134,7 @@ def _from_data( return obj @classmethod + @annotate("FRAME_FROM_COLUMNS", color="green", domain="cudf_python") def _from_columns( cls, columns: List[ColumnBase], @@ -163,6 +165,9 @@ def _from_columns( return cls._from_data(data, index) + @annotate( + "FRAME_FROM_COLUMNS_LIKE_SELF", color="green", domain="cudf_python" + ) def _from_columns_like_self( self, columns: List[ColumnBase], @@ -360,6 +365,7 @@ def memory_usage(self, deep=False): def __len__(self): return self._num_rows + @annotate("FRAME_COPY", color="green", domain="cudf_python") def copy(self: T, deep: bool = True) -> T: """ Make a copy of this object's indices and data. @@ -445,6 +451,7 @@ def copy(self: T, deep: bool = True) -> T: return new_frame + @annotate("FRAME_EQUALS", color="green", domain="cudf_python") def equals(self, other, **kwargs): """ Test whether two objects contain the same elements. @@ -527,6 +534,7 @@ def equals(self, other, **kwargs): else: return self._index.equals(other._index) + @annotate("FRAME_EXPLODE", color="green", domain="cudf_python") def _explode(self, explode_column: Any, ignore_index: bool): """Helper function for `explode` in `Series` and `Dataframe`, explodes a specified nested column. 
Other columns' corresponding rows are @@ -550,6 +558,9 @@ def _explode(self, explode_column: Any, ignore_index: bool): res.index.names = self._index.names return res + @annotate( + "FRAME_GET_COLUMNS_BY_LABEL", color="green", domain="cudf_python" + ) def _get_columns_by_label(self, labels, downcast=False): """ Returns columns of the Frame specified by `labels` @@ -557,6 +568,9 @@ def _get_columns_by_label(self, labels, downcast=False): """ return self._data.select_by_label(labels) + @annotate( + "FRAME_GET_COLUMNS_BY_INDEX", color="green", domain="cudf_python" + ) def _get_columns_by_index(self, indices): """ Returns columns of the Frame specified by `labels` @@ -580,6 +594,7 @@ def _as_column(self): return self._data[None].copy(deep=False) + @annotate("FRAME_EMPTY_LIKE", color="green", domain="cudf_python") def _empty_like(self, keep_index=True): result = self.__class__._from_data( *libcudf.copying.table_empty_like(self, keep_index) @@ -675,6 +690,7 @@ def get_column_values_na(col): # particular, we need to benchmark how much of the overhead is coming from # (potentially unavoidable) local copies in to_cupy and how much comes from # inefficiencies in the implementation. + @annotate("FRAME_TO_CUPY", color="green", domain="cudf_python") def to_cupy( self, dtype: Union[Dtype, None] = None, @@ -709,6 +725,7 @@ def to_cupy( na_value, ) + @annotate("FRAME_TO_NUMPY", color="green", domain="cudf_python") def to_numpy( self, dtype: Union[Dtype, None] = None, @@ -743,6 +760,7 @@ def to_numpy( (lambda col: col.values_host), np.empty, dtype, na_value ) + @annotate("FRAME_CLIP", color="green", domain="cudf_python") def clip(self, lower=None, upper=None, inplace=False, axis=1): """ Trim values at input threshold(s). @@ -870,6 +888,7 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): return self._mimic_inplace(output, inplace=inplace) + @annotate("FRAME_WHERE", color="green", domain="cudf_python") def where(self, cond, other=None, inplace=False): """ Replace values where the condition is False. @@ -928,6 +947,7 @@ def where(self, cond, other=None, inplace=False): frame=self, cond=cond, other=other, inplace=inplace ) + @annotate("FRAME_MASK", color="green", domain="cudf_python") def mask(self, cond, other=None, inplace=False): """ Replace values where the condition is True. @@ -989,6 +1009,7 @@ def mask(self, cond, other=None, inplace=False): return self.where(cond=~cond, other=other, inplace=inplace) + @annotate("FRAME_PIPE", color="green", domain="cudf_python") def pipe(self, func, *args, **kwargs): """ Apply ``func(self, *args, **kwargs)``. 
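Note how the classmethod hunks earlier in this file (`_from_data`, `_from_columns`, `from_arrow`) keep `@classmethod` outermost and apply `@annotate` directly to the underlying function. A small sketch of that ordering; the `MiniFrame` class is hypothetical:

```python
from nvtx import annotate


class MiniFrame:
    def __init__(self, data):
        self._data = data

    @classmethod
    @annotate("FRAME_FROM_DATA", color="green", domain="cudf_python")
    def _from_data(cls, data):
        # annotate wraps the plain function first; classmethod then wraps
        # the annotated function, preserving normal descriptor behavior.
        return cls(data)


print(MiniFrame._from_data({"a": [1, 2]})._data)  # {'a': [1, 2]}
```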
@@ -1119,6 +1140,7 @@ def scatter_by_map( return result + @annotate("FRAME_FILLNA", color="green", domain="cudf_python") def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None ): @@ -1273,6 +1295,7 @@ def fillna( inplace=inplace, ) + @annotate("FRAME_DROPNA_COLUMNS", color="green", domain="cudf_python") def _drop_na_columns(self, how="any", subset=None, thresh=None): """ Drop columns containing nulls @@ -1300,6 +1323,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): return self[out_cols] + @annotate("FRAME_INTERPOLATE", color="green", domain="cudf_python") def interpolate( self, method="linear", @@ -1369,6 +1393,7 @@ def interpolate( else result._gather(perm_sort.argsort()) ) + @annotate("FRAME_QUANTILES", color="green", domain="cudf_python") def _quantiles( self, q, @@ -1401,6 +1426,7 @@ def _quantiles( result._copy_type_metadata(self) return result + @annotate("FRAME_RANK", color="green", domain="cudf_python") def rank( self, axis=0, @@ -1477,6 +1503,7 @@ def rank( return self._from_data(data, index).astype(np.float64) + @annotate("FRAME_REPEAT", color="green", domain="cudf_python") def repeat(self, repeats, axis=None): """Repeats elements consecutively. @@ -1566,6 +1593,7 @@ def repeat(self, repeats, axis=None): result._copy_type_metadata(self) return result + @annotate("FRAME_SHIFT", color="green", domain="cudf_python") def shift(self, periods=1, freq=None, axis=0, fill_value=None): """Shift values by `periods` positions.""" axis = self._get_axis_from_axis_arg(axis) @@ -1581,7 +1609,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): zip(self._column_names, data_columns), self._index ) - @annotate("SAMPLE", color="orange", domain="cudf_python") + @annotate("FRAME_SAMPLE", color="orange", domain="cudf_python") def sample( self, n=None, @@ -1775,7 +1803,7 @@ def sample( return result @classmethod - @annotate("FROM_ARROW", color="orange", domain="cudf_python") + @annotate("FRAME_FROM_ARROW", color="orange", domain="cudf_python") def from_arrow(cls, data): """Convert from PyArrow Table to Frame @@ -1915,7 +1943,7 @@ def from_arrow(cls, data): return cls._from_data({name: result[name] for name in column_names}) - @annotate("TO_ARROW", color="orange", domain="cudf_python") + @annotate("FRAME_TO_ARROW", color="orange", domain="cudf_python") def to_arrow(self): """ Convert to arrow Table @@ -1951,6 +1979,7 @@ def _positions_from_column_names(self, column_names): if name in set(column_names) ] + @annotate("FRAME_REPLACE", color="green", domain="cudf_python") def replace( self, to_replace=None, @@ -2237,6 +2266,7 @@ def _copy_type_metadata( return self + @annotate("FRAME_ISNULL", color="green", domain="cudf_python") def isnull(self): """ Identify missing values. @@ -2318,6 +2348,7 @@ def isnull(self): # Alias for isnull isna = isnull + @annotate("FRAME_NOTNULL", color="green", domain="cudf_python") def notnull(self): """ Identify non-missing values. @@ -2399,6 +2430,7 @@ def notnull(self): # Alias for notnull notna = notnull + @annotate("FRAME_INTERLEAVE_COLUMNS", color="green", domain="cudf_python") def interleave_columns(self): """ Interleave Series columns of a table into a single column. 
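To make the `interleave_columns` docstring above concrete: given columns `a` and `b`, the result is `[a[0], b[0], a[1], b[1], ...]`. A NumPy stand-in for the semantics (cuDF performs the equivalent on device columns):

```python
import numpy as np

a = np.array([1, 2, 3])
b = np.array([10, 20, 30])

# Stack the columns side by side, then flatten row-major to interleave.
interleaved = np.stack([a, b], axis=1).ravel()
print(interleaved.tolist())  # [1, 10, 2, 20, 3, 30]
```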
@@ -2438,6 +2470,7 @@ def interleave_columns(self): return result + @annotate("FRAME_TILE", color="green", domain="cudf_python") def tile(self, count): """ Repeats the rows from `self` DataFrame `count` times to form a @@ -2467,6 +2500,7 @@ def tile(self, count): result._copy_type_metadata(self) return result + @annotate("FRAME_SEARCHSORTED", color="green", domain="cudf_python") def searchsorted( self, values, side="left", ascending=True, na_position="last" ): @@ -2551,7 +2585,7 @@ def searchsorted( else: return result - @annotate("ARGSORT", color="yellow", domain="cudf_python") + @annotate("FRAME_ARGSORT", color="yellow", domain="cudf_python") def argsort( self, by=None, @@ -2654,6 +2688,7 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): return libcudf.sort.order_by(to_sort, ascending, na_position) + @annotate("FRAME_SIN", color="green", domain="cudf_python") def sin(self): """ Get Trigonometric sine, element-wise. @@ -2715,6 +2750,7 @@ def sin(self): """ return self._unaryop("sin") + @annotate("FRAME_COS", color="green", domain="cudf_python") def cos(self): """ Get Trigonometric cosine, element-wise. @@ -2776,6 +2812,7 @@ def cos(self): """ return self._unaryop("cos") + @annotate("FRAME_TAN", color="green", domain="cudf_python") def tan(self): """ Get Trigonometric tangent, element-wise. @@ -2837,6 +2874,7 @@ def tan(self): """ return self._unaryop("tan") + @annotate("FRAME_ASIN", color="green", domain="cudf_python") def asin(self): """ Get Trigonometric inverse sine, element-wise. @@ -2887,6 +2925,7 @@ def asin(self): """ return self._unaryop("asin") + @annotate("FRAME_ACOS", color="green", domain="cudf_python") def acos(self): """ Get Trigonometric inverse cosine, element-wise. @@ -2945,6 +2984,7 @@ def acos(self): result = result.mask((result < 0) | (result > np.pi + 1)) return result + @annotate("FRAME_ATAN", color="green", domain="cudf_python") def atan(self): """ Get Trigonometric inverse tangent, element-wise. @@ -3005,6 +3045,7 @@ def atan(self): """ return self._unaryop("atan") + @annotate("FRAME_EXP", color="green", domain="cudf_python") def exp(self): """ Get the exponential of all elements, element-wise. @@ -3067,6 +3108,7 @@ def exp(self): """ return self._unaryop("exp") + @annotate("FRAME_LOG", color="green", domain="cudf_python") def log(self): """ Get the natural logarithm of all elements, element-wise. @@ -3128,6 +3170,7 @@ def log(self): """ return self._unaryop("log") + @annotate("FRAME_SQRT", color="green", domain="cudf_python") def sqrt(self): """ Get the non-negative square-root of all elements, element-wise. @@ -3183,6 +3226,7 @@ def sqrt(self): """ return self._unaryop("sqrt") + @annotate("FRAME_ABS", color="green", domain="cudf_python") def abs(self): """ Return a Series/DataFrame with absolute numeric value of each element. @@ -3209,6 +3253,7 @@ def abs(self): return self._unaryop("abs") # Rounding + @annotate("FRAME_CEIL", color="green", domain="cudf_python") def ceil(self): """ Rounds each value upward to the smallest integral value not less @@ -3245,6 +3290,7 @@ def ceil(self): return self._unaryop("ceil") + @annotate("FRAME_FLOOR", color="green", domain="cudf_python") def floor(self): """Rounds each value downward to the largest integral value not greater than the original. 
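The trigonometric and transcendental methods annotated above (`sin`, `cos`, `tan`, `exp`, `log`, `sqrt`, ...) are all one-line wrappers over the same `_unaryop` dispatch, whose implementation appears later in this diff. A toy sketch of that shape, with `math` standing in for the device-side `unary_operator`; `MiniFrame` is hypothetical:

```python
import math


class MiniFrame:
    def __init__(self, columns):
        self._columns = columns  # column name -> list of floats

    def _unaryop(self, op):
        # Apply the named unary op to every column; cuDF performs the same
        # per-column dispatch, but on GPU columns.
        fn = getattr(math, op)
        return MiniFrame(
            {k: [fn(v) for v in col] for k, col in self._columns.items()}
        )

    def sin(self):
        return self._unaryop("sin")

    def sqrt(self):
        return self._unaryop("sqrt")


mf = MiniFrame({"a": [0.0, 1.0, 4.0]})
print(mf.sqrt()._columns)  # {'a': [0.0, 1.0, 2.0]}
```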
@@ -3284,6 +3330,7 @@ def floor(self): return self._unaryop("floor") + @annotate("FRAME_SCALE", color="green", domain="cudf_python") def scale(self): """ Scale values to [0, 1] in float64 @@ -3318,6 +3365,7 @@ def scale(self): scaled._index = self._index.copy(deep=False) return scaled + @annotate("FRAME_INTERNAL_MERGE", color="green", domain="cudf_python") def _merge( self, right, @@ -3361,6 +3409,7 @@ def _merge( suffixes=suffixes, ).perform_merge() + @annotate("FRAME_IS_SORTED", color="green", domain="cudf_python") def _is_sorted(self, ascending=None, null_position=None): """ Returns a boolean indicating whether the data of the Frame are sorted @@ -3391,12 +3440,14 @@ def _is_sorted(self, ascending=None, null_position=None): self, ascending=ascending, null_position=null_position ) + @annotate("FRAME_SPLIT", color="green", domain="cudf_python") def _split(self, splits, keep_index=True): results = libcudf.copying.table_split( self, splits, keep_index=keep_index ) return [self.__class__._from_data(*result) for result in results] + @annotate("FRAME_ENCODE", color="green", domain="cudf_python") def _encode(self): data, index, indices = libcudf.transform.table_encode(self) for name, col in data.items(): @@ -3404,6 +3455,7 @@ def _encode(self): keys = self.__class__._from_data(data, index) return keys, indices + @annotate("FRAME_UNARYOP", color="green", domain="cudf_python") def _unaryop(self, op): data_columns = (col.unary_operator(op) for col in self._columns) return self.__class__._from_data( @@ -3443,6 +3495,7 @@ def _binaryop( raise NotImplementedError @classmethod + @annotate("FRAME_COLWISE_BINOP", color="green", domain="cudf_python") def _colwise_binop( cls, operands: Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], @@ -3601,6 +3654,7 @@ def _colwise_binop( return output + @annotate("FRAME_DOT", color="green", domain="cudf_python") def dot(self, other, reflect=False): """ Get dot product of frame and other, (binary operator `dot`). @@ -3773,6 +3827,7 @@ def _reduce(self, *args, **kwargs): f"Reductions are not supported for objects of type {type(self)}." ) + @annotate("FRAME_MIN", color="green", domain="cudf_python") def min( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, ): @@ -3818,6 +3873,7 @@ def min( **kwargs, ) + @annotate("FRAME_MAX", color="green", domain="cudf_python") def max( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, ): @@ -3863,6 +3919,7 @@ def max( **kwargs, ) + @annotate("FRAME_SUM", color="green", domain="cudf_python") def sum( self, axis=None, @@ -3921,6 +3978,7 @@ def sum( **kwargs, ) + @annotate("FRAME_PRODUCT", color="green", domain="cudf_python") def product( self, axis=None, @@ -3985,6 +4043,7 @@ def product( # Alias for pandas compatibility. 
prod = product + @annotate("FRAME_MEAN", color="green", domain="cudf_python") def mean( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4029,6 +4088,7 @@ def mean( **kwargs, ) + @annotate("FRAME_STD", color="green", domain="cudf_python") def std( self, axis=None, @@ -4085,6 +4145,7 @@ def std( **kwargs, ) + @annotate("FRAME_VAR", color="green", domain="cudf_python") def var( self, axis=None, @@ -4140,6 +4201,7 @@ def var( **kwargs, ) + @annotate("FRAME_KURTOSIS", color="green", domain="cudf_python") def kurtosis( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4208,6 +4270,7 @@ def kurt( **kwargs, ) + @annotate("FRAME_SKEW", color="green", domain="cudf_python") def skew( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4265,6 +4328,7 @@ def skew( **kwargs, ) + @annotate("FRAME_ALL", color="green", domain="cudf_python") def all(self, axis=0, skipna=True, level=None, **kwargs): """ Return whether all elements are True in DataFrame. @@ -4300,6 +4364,7 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): "all", axis=axis, skipna=skipna, level=level, **kwargs, ) + @annotate("FRAME_ANY", color="green", domain="cudf_python") def any(self, axis=0, skipna=True, level=None, **kwargs): """ Return whether any elements is True in DataFrame. @@ -4335,6 +4400,7 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): "any", axis=axis, skipna=skipna, level=level, **kwargs, ) + @annotate("FRAME_SUM_OF_SQUARES", color="green", domain="cudf_python") def sum_of_squares(self, dtype=None): """Return the sum of squares of values. @@ -4358,6 +4424,7 @@ def sum_of_squares(self, dtype=None): """ return self._reduce("sum_of_squares", dtype=dtype) + @annotate("FRAME_MEDIAN", color="green", domain="cudf_python") def median( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4403,6 +4470,7 @@ def median( ) # Scans + @annotate("FRAME_SCAN", color="green", domain="cudf_python") def _scan(self, op, axis=None, skipna=True, cast_to_int=False): skipna = True if skipna is None else skipna @@ -4437,6 +4505,7 @@ def _scan(self, op, axis=None, skipna=True, cast_to_int=False): # for Index._from_data and simplify. return self._from_data(results, index=self._index) + @annotate("FRAME_CUMMIN", color="green", domain="cudf_python") def cummin(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative minimum of the Series or DataFrame. @@ -4480,6 +4549,7 @@ def cummin(self, axis=None, skipna=True, *args, **kwargs): """ return self._scan("min", axis=axis, skipna=skipna, *args, **kwargs) + @annotate("FRAME_CUMMAX", color="green", domain="cudf_python") def cummax(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative maximum of the Series or DataFrame. @@ -4523,6 +4593,7 @@ def cummax(self, axis=None, skipna=True, *args, **kwargs): """ return self._scan("max", axis=axis, skipna=skipna, *args, **kwargs) + @annotate("FRAME_CUMSUM", color="green", domain="cudf_python") def cumsum(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative sum of the Series or DataFrame. @@ -4569,6 +4640,7 @@ def cumsum(self, axis=None, skipna=True, *args, **kwargs): "sum", axis=axis, skipna=skipna, cast_to_int=True, *args, **kwargs ) + @annotate("FRAME_CUMPROD", color="green", domain="cudf_python") def cumprod(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative product of the Series or DataFrame. 
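Note that `cumsum` and `cumprod` above pass `cast_to_int=True` into `_scan` while `cummin`/`cummax` do not. The likely motivation is the pandas dtype rule illustrated below: accumulating scans promote bool and small-integer inputs to `int64`, while min/max scans keep the input dtype.

```python
import pandas as pd

print(pd.Series([True, False, True]).cumsum().dtype)  # int64, not bool

t = pd.Series([1, 2, 3], dtype="int8")
print(t.cumsum().dtype)  # int64 (accumulating scan promotes)
print(t.cummax().dtype)  # int8  (min/max scan keeps the dtype)
```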
@@ -4614,6 +4686,7 @@ def cumprod(self, axis=None, skipna=True, *args, **kwargs): "prod", axis=axis, skipna=skipna, cast_to_int=True, *args, **kwargs ) + @annotate("FRAME_TO_JSON", color="green", domain="cudf_python") @ioutils.doc_to_json() def to_json(self, path_or_buf=None, *args, **kwargs): """{docstring}""" @@ -4622,18 +4695,21 @@ def to_json(self, path_or_buf=None, *args, **kwargs): self, path_or_buf=path_or_buf, *args, **kwargs ) + @annotate("FRAME_TO_HDF", color="green", domain="cudf_python") @ioutils.doc_to_hdf() def to_hdf(self, path_or_buf, key, *args, **kwargs): """{docstring}""" cudf.io.hdf.to_hdf(path_or_buf, key, self, *args, **kwargs) + @annotate("FRAME_TO_DLPACK", color="green", domain="cudf_python") @ioutils.doc_to_dlpack() def to_dlpack(self): """{docstring}""" return cudf.io.dlpack.to_dlpack(self) + @annotate("FRAME_TO_STRING", color="green", domain="cudf_python") def to_string(self): """ Convert to string @@ -4659,12 +4735,15 @@ def to_string(self): def __str__(self): return self.to_string() + @annotate("FRAME_DEEP_COPY", color="green", domain="cudf_python") def __deepcopy__(self, memo): return self.copy(deep=True) + @annotate("FRAME_COPY", color="green", domain="cudf_python") def __copy__(self): return self.copy(deep=False) + @annotate("FRAME_HEAD", color="green", domain="cudf_python") def head(self, n=5): """ Return the first `n` rows. @@ -4748,6 +4827,7 @@ def head(self, n=5): """ return self.iloc[:n] + @annotate("FRAME_TAIL", color="green", domain="cudf_python") def tail(self, n=5): """ Returns the last n rows as a new DataFrame or Series @@ -4779,6 +4859,7 @@ def tail(self, n=5): return self.iloc[-n:] + @annotate("FRAME_ROLLING", color="green", domain="cudf_python") @copy_docstring(Rolling) def rolling( self, window, min_periods=None, center=False, axis=0, win_type=None @@ -4792,6 +4873,7 @@ def rolling( win_type=win_type, ) + @annotate("FRAME_NANS_TO_NULLS", color="green", domain="cudf_python") def nans_to_nulls(self): """ Convert nans (if any) to nulls @@ -4846,6 +4928,7 @@ def nans_to_nulls(self): self._index, ) + @annotate("FRAME_INVERT", color="green", domain="cudf_python") def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" return self._from_data( @@ -4856,6 +4939,7 @@ def __invert__(self): self._index, ) + @annotate("FRAME_ADD", color="green", domain="cudf_python") def add(self, other, axis, level=None, fill_value=None): """ Get Addition of dataframe or series and other, element-wise (binary @@ -4926,6 +5010,7 @@ def add(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "add", fill_value) + @annotate("FRAME_RADD", color="green", domain="cudf_python") def radd(self, other, axis, level=None, fill_value=None): """ Get Addition of dataframe or series and other, element-wise (binary @@ -5005,6 +5090,7 @@ def radd(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "add", fill_value, reflect=True) + @annotate("FRAME_SUBTRACT", color="green", domain="cudf_python") def subtract(self, other, axis, level=None, fill_value=None): """ Get Subtraction of dataframe or series and other, element-wise (binary @@ -5087,6 +5173,7 @@ def subtract(self, other, axis, level=None, fill_value=None): sub = subtract + @annotate("FRAME_RSUB", color="green", domain="cudf_python") def rsub(self, other, axis, level=None, fill_value=None): """ Get Subtraction of dataframe or series and other, element-wise (binary @@ -5170,6 +5257,7 @@ def rsub(self, other, axis, level=None, fill_value=None): return 
self._binaryop(other, "sub", fill_value, reflect=True) + @annotate("FRAME_MULTIPLY", color="green", domain="cudf_python") def multiply(self, other, axis, level=None, fill_value=None): """ Get Multiplication of dataframe or series and other, element-wise @@ -5254,6 +5342,7 @@ def multiply(self, other, axis, level=None, fill_value=None): mul = multiply + @annotate("FRAME_RMUL", color="green", domain="cudf_python") def rmul(self, other, axis, level=None, fill_value=None): """ Get Multiplication of dataframe or series and other, element-wise @@ -5338,6 +5427,7 @@ def rmul(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "mul", fill_value, reflect=True) + @annotate("FRAME_MOD", color="green", domain="cudf_python") def mod(self, other, axis, level=None, fill_value=None): """ Get Modulo division of dataframe or series and other, element-wise @@ -5408,6 +5498,7 @@ def mod(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "mod", fill_value) + @annotate("FRAME_RMOD", color="green", domain="cudf_python") def rmod(self, other, axis, level=None, fill_value=None): """ Get Modulo division of dataframe or series and other, element-wise @@ -5490,6 +5581,7 @@ def rmod(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "mod", fill_value, reflect=True) + @annotate("FRAME_POW", color="green", domain="cudf_python") def pow(self, other, axis, level=None, fill_value=None): """ Get Exponential power of dataframe series and other, element-wise @@ -5569,6 +5661,7 @@ def pow(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "pow", fill_value) + @annotate("FRAME_RPOW", color="green", domain="cudf_python") def rpow(self, other, axis, level=None, fill_value=None): """ Get Exponential power of dataframe or series and other, element-wise @@ -5648,6 +5741,7 @@ def rpow(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "pow", fill_value, reflect=True) + @annotate("FRAME_FLOORDIV", color="green", domain="cudf_python") def floordiv(self, other, axis, level=None, fill_value=None): """ Get Integer division of dataframe or series and other, element-wise @@ -5727,6 +5821,7 @@ def floordiv(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "floordiv", fill_value) + @annotate("FRAME_RFLOORDIV", color="green", domain="cudf_python") def rfloordiv(self, other, axis, level=None, fill_value=None): """ Get Integer division of dataframe or series and other, element-wise @@ -5823,6 +5918,7 @@ def rfloordiv(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "floordiv", fill_value, reflect=True) + @annotate("FRAME_TRUEDIV", color="green", domain="cudf_python") def truediv(self, other, axis, level=None, fill_value=None): """ Get Floating division of dataframe or series and other, element-wise @@ -5911,6 +6007,7 @@ def truediv(self, other, axis, level=None, fill_value=None): div = truediv divide = truediv + @annotate("FRAME_RTRUEDIV", color="green", domain="cudf_python") def rtruediv(self, other, axis, level=None, fill_value=None): """ Get Floating division of dataframe or series and other, element-wise @@ -6003,6 +6100,7 @@ def rtruediv(self, other, axis, level=None, fill_value=None): # Alias for rtruediv rdiv = rtruediv + @annotate("FRAME_EQ", color="green", domain="cudf_python") def eq(self, other, axis="columns", level=None, fill_value=None): """Equal to, element-wise (binary operator eq). 
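All of the reflected variants above (`radd`, `rsub`, `rmul`, `rmod`, `rpow`, `rfloordiv`, `rtruediv`) funnel into the same `_binaryop` call with `reflect=True`. A stand-in sketch of what the flag does for a scalar operand; `MiniSeries` is hypothetical, not cuDF's implementation:

```python
import operator


class MiniSeries:
    def __init__(self, values):
        self.values = values

    def _binaryop(self, other, op, reflect=False):
        # reflect=True swaps the operand order, nothing more.
        fn = getattr(operator, op)
        return [fn(other, v) if reflect else fn(v, other) for v in self.values]

    def sub(self, other):
        return self._binaryop(other, "sub")

    def rsub(self, other):
        return self._binaryop(other, "sub", reflect=True)


s = MiniSeries([1, 2, 3])
print(s.sub(10))   # [-9, -8, -7]
print(s.rsub(10))  # [9, 8, 7]
```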
@@ -6078,6 +6176,7 @@ def eq(self, other, axis="columns", level=None, fill_value=None): other=other, fn="eq", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_NE", color="green", domain="cudf_python") def ne(self, other, axis="columns", level=None, fill_value=None): """Not equal to, element-wise (binary operator ne). @@ -6153,6 +6252,7 @@ def ne(self, other, axis="columns", level=None, fill_value=None): other=other, fn="ne", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_LT", color="green", domain="cudf_python") def lt(self, other, axis="columns", level=None, fill_value=None): """Less than, element-wise (binary operator lt). @@ -6228,6 +6328,7 @@ def lt(self, other, axis="columns", level=None, fill_value=None): other=other, fn="lt", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_LE", color="green", domain="cudf_python") def le(self, other, axis="columns", level=None, fill_value=None): """Less than or equal, element-wise (binary operator le). @@ -6303,6 +6404,7 @@ def le(self, other, axis="columns", level=None, fill_value=None): other=other, fn="le", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_GT", color="green", domain="cudf_python") def gt(self, other, axis="columns", level=None, fill_value=None): """Greater than, element-wise (binary operator gt). @@ -6378,6 +6480,7 @@ def gt(self, other, axis="columns", level=None, fill_value=None): other=other, fn="gt", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_GE", color="green", domain="cudf_python") def ge(self, other, axis="columns", level=None, fill_value=None): """Greater than or equal, element-wise (binary operator ge). @@ -6476,6 +6579,11 @@ def nunique(self, method: builtins.str = "sort", dropna: bool = True): } +@annotate( + "FRAME_GET_REPLACEMENT_VALUES_FOR_COLUMNS", + color="green", + domain="cudf_python", +) def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] ) -> Tuple[Dict[Any, bool], Dict[Any, Any], Dict[Any, Any]]: @@ -6640,6 +6748,7 @@ def _is_series(obj): return isinstance(obj, Frame) and obj.ndim == 1 and obj._index is not None +@annotate("FRAME_DROP_ROWS_BY_LABELS", color="green", domain="cudf_python") def _drop_rows_by_labels( obj: DataFrameOrSeries, labels: Union[ColumnLike, abc.Iterable, str], diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index a919b00692d..948428de4f0 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -7,6 +7,7 @@ from uuid import uuid4 import numpy as np +from nvtx import annotate from pyarrow import dataset as ds, parquet as pq import cudf @@ -16,6 +17,7 @@ from cudf.utils import ioutils +@annotate("_WRITE_PARQUET", color="green", domain="cudf_python") def _write_parquet( df, paths, @@ -73,6 +75,7 @@ def _write_parquet( # Logic chosen to match: https://arrow.apache.org/ # docs/_modules/pyarrow/parquet.html#write_to_dataset +@annotate("WRITE_TO_DATASET", color="green", domain="cudf_python") def write_to_dataset( df, root_path, @@ -161,6 +164,7 @@ def write_to_dataset( @ioutils.doc_read_parquet_metadata() +@annotate("READ_PARQUET_METADATA", color="green", domain="cudf_python") def read_parquet_metadata(path): """{docstring}""" @@ -173,6 +177,7 @@ def read_parquet_metadata(path): return num_rows, num_row_groups, col_names +@annotate("_PROCESS_DATASET", color="green", domain="cudf_python") def _process_dataset( paths, fs, filters=None, row_groups=None, categorical_partitions=True, ): @@ -308,6 +313,7 @@ def _process_dataset( 
@ioutils.doc_read_parquet() +@annotate("READ_PARQUET", color="green", domain="cudf_python") def read_parquet( filepath_or_buffer, engine="cudf", @@ -435,6 +441,7 @@ def read_parquet( ) +@annotate("_PARQUET_TO_FRAME", color="green", domain="cudf_python") def _parquet_to_frame( paths_or_buffers, *args, @@ -502,6 +509,7 @@ def _parquet_to_frame( ) +@annotate("_READ_PARQUET", color="green", domain="cudf_python") def _read_parquet( filepaths_or_buffers, engine, @@ -535,6 +543,7 @@ def _read_parquet( @ioutils.doc_to_parquet() +@annotate("TO_PARQUET", color="green", domain="cudf_python") def to_parquet( df, path, @@ -646,6 +655,7 @@ def _generate_filename(): return uuid4().hex + ".parquet" +@annotate("_GET_PARTITIONED", color="green", domain="cudf_python") def _get_partitioned( df, root_path, @@ -689,6 +699,7 @@ def _get_partitioned( class ParquetDatasetWriter: + @annotate("ParquetDatasetWriter_INIT", color="green", domain="cudf_python") def __init__( self, path, @@ -765,6 +776,9 @@ def __init__( self.path_cw_map: Dict[str, int] = {} self.filename = None + @annotate( + "ParquetDatasetWriter_WRITE_TABLE", color="green", domain="cudf_python" + ) def write_table(self, df): """ Write a dataframe to the file/dataset @@ -821,6 +835,9 @@ def write_table(self, df): self.path_cw_map.update({k: new_cw_idx for k in new_paths}) self._chunked_writers[-1][0].write_table(grouped_df, part_info) + @annotate( + "ParquetDatasetWriter_CLOSE", color="green", domain="cudf_python" + ) def close(self, return_metadata=False): """ Close all open files and optionally return footer metadata as a binary diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 89b5301ee83..1b1f3e29ab2 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from collections.abc import Iterator @@ -6,6 +6,7 @@ import numpy as np import pandas as pd import pyarrow as pa +from nvtx import annotate from dask.dataframe.core import get_parallel_type, meta_nonempty from dask.dataframe.dispatch import ( @@ -39,6 +40,7 @@ @meta_nonempty.register(cudf.BaseIndex) +@annotate("_nonempty_index", color="green", domain="dask_cudf_python") def _nonempty_index(idx): if isinstance(idx, cudf.core.index.RangeIndex): return cudf.core.index.RangeIndex(2, name=idx.name) @@ -73,6 +75,7 @@ def _nonempty_index(idx): raise TypeError(f"Don't know how to handle index of type {type(idx)}") +@annotate("_get_non_empty_data", color="green", domain="dask_cudf_python") def _get_non_empty_data(s): if isinstance(s._column, cudf.core.column.CategoricalColumn): categories = ( @@ -100,6 +103,7 @@ def _get_non_empty_data(s): @meta_nonempty.register(cudf.Series) +@annotate("_nonempty_series", color="green", domain="dask_cudf_python") def _nonempty_series(s, idx=None): if idx is None: idx = _nonempty_index(s.index) @@ -109,6 +113,7 @@ def _nonempty_series(s, idx=None): @meta_nonempty.register(cudf.DataFrame) +@annotate("meta_nonempty_cudf", color="green", domain="dask_cudf_python") def meta_nonempty_cudf(x): idx = meta_nonempty(x.index) columns_with_dtype = dict() @@ -124,15 +129,18 @@ def meta_nonempty_cudf(x): @make_meta_dispatch.register((cudf.Series, cudf.DataFrame)) +@annotate("make_meta_cudf", color="green", domain="dask_cudf_python") def make_meta_cudf(x, index=None): return x.head(0) @make_meta_dispatch.register(cudf.BaseIndex) +@annotate("make_meta_cudf_index", color="green", domain="dask_cudf_python") def make_meta_cudf_index(x, index=None): return x[:0] +@annotate("_empty_series", color="green", domain="dask_cudf_python") def _empty_series(name, dtype, index=None): if isinstance(dtype, str) and dtype == "category": return cudf.Series( @@ -142,6 +150,7 @@ def _empty_series(name, dtype, index=None): @make_meta_obj.register(object) +@annotate("make_meta_object_cudf", color="green", domain="dask_cudf_python") def make_meta_object_cudf(x, index=None): """Create an empty cudf object containing the desired metadata. 
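The functions annotated in this file follow dask's type-dispatch pattern: a registry object maps concrete types to implementations, and `@annotate` sits below `.register(...)` so the registered function itself carries the NVTX range. A toy registry capturing the shape (dask's real `Dispatch` also walks the MRO, which this sketch omits):

```python
class Dispatch:
    def __init__(self):
        self._lookup = {}

    def register(self, types):
        types = types if isinstance(types, tuple) else (types,)

        def wrapper(func):
            for t in types:
                self._lookup[t] = func
            return func

        return wrapper

    def __call__(self, arg, *args, **kwargs):
        return self._lookup[type(arg)](arg, *args, **kwargs)


make_meta = Dispatch()


@make_meta.register((list, tuple))
def make_meta_sequence(x):
    # Return an empty object of the same type, mirroring "metadata only".
    return type(x)()


print(make_meta([1, 2, 3]))  # []
```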
@@ -212,6 +221,7 @@ def make_meta_object_cudf(x, index=None): @concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.BaseIndex)) +@annotate("concat_cudf", color="green", domain="dask_cudf_python") def concat_cudf( dfs, axis=0, @@ -236,11 +246,13 @@ def concat_cudf( @categorical_dtype_dispatch.register( (cudf.DataFrame, cudf.Series, cudf.BaseIndex) ) +@annotate("categorical_dtype_cudf", color="green", domain="dask_cudf_python") def categorical_dtype_cudf(categories=None, ordered=None): return cudf.CategoricalDtype(categories=categories, ordered=ordered) @tolist_dispatch.register((cudf.Series, cudf.BaseIndex)) +@annotate("tolist_cudf", color="green", domain="dask_cudf_python") def tolist_cudf(obj): return obj.to_arrow().to_pylist() @@ -248,6 +260,9 @@ def tolist_cudf(obj): @is_categorical_dtype_dispatch.register( (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype, Series) ) +@annotate( + "is_categorical_dtype_cudf", color="green", domain="dask_cudf_python" +) def is_categorical_dtype_cudf(obj): return cudf.api.types.is_categorical_dtype(obj) @@ -261,6 +276,7 @@ def is_categorical_dtype_cudf(obj): ) @percentile_lookup.register((cudf.Series, cp.ndarray, cudf.BaseIndex)) +@annotate("percentile_cudf", color="green", domain="dask_cudf_python") def percentile_cudf(a, q, interpolation="linear"): # Cudf dispatch to the equivalent of `np.percentile`: # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html @@ -305,6 +321,7 @@ def percentile_cudf(a, q, interpolation="linear"): @union_categoricals_dispatch.register((cudf.Series, cudf.BaseIndex)) +@annotate("union_categoricals_cudf", color="green", domain="dask_cudf_python") def union_categoricals_cudf( to_union, sort_categories=False, ignore_order=False ): @@ -313,11 +330,13 @@ def union_categoricals_cudf( ) +@annotate("safe_hash", color="green", domain="dask_cudf_python") def safe_hash(frame): return cudf.Series(frame.hash_values(), index=frame.index) @hash_object_dispatch.register((cudf.DataFrame, cudf.Series)) +@annotate("hash_object_cudf", color="green", domain="dask_cudf_python") def hash_object_cudf(frame, index=True): if index: return safe_hash(frame.reset_index()) @@ -325,6 +344,7 @@ def hash_object_cudf(frame, index=True): @hash_object_dispatch.register(cudf.BaseIndex) +@annotate("hash_object_cudf_index", color="green", domain="dask_cudf_python") def hash_object_cudf_index(ind, index=None): if isinstance(ind, cudf.MultiIndex): @@ -335,6 +355,7 @@ def hash_object_cudf_index(ind, index=None): @group_split_dispatch.register((cudf.Series, cudf.DataFrame)) +@annotate("group_split_cudf", color="green", domain="dask_cudf_python") def group_split_cudf(df, c, k, ignore_index=False): return dict( zip( @@ -349,10 +370,12 @@ def group_split_cudf(df, c, k, ignore_index=False): @sizeof_dispatch.register(cudf.DataFrame) +@annotate("sizeof_cudf_dataframe", color="green", domain="dask_cudf_python") def sizeof_cudf_dataframe(df): return int(df.memory_usage().sum()) @sizeof_dispatch.register((cudf.Series, cudf.BaseIndex)) +@annotate("sizeof_cudf_series_index", color="green", domain="dask_cudf_python") def sizeof_cudf_series_index(obj): return obj.memory_usage() diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 729db6c232d..d8802f33941 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. 
import math import warnings @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +from nvtx import annotate from tlz import partition_all import dask @@ -57,6 +58,7 @@ def __dask_postcompute__(self): def __dask_postpersist__(self): return type(self), (self._name, self._meta, self.divisions) + @annotate("_FRAME_INIT", color="green", domain="dask_cudf_python") def __init__(self, dsk, name, meta, divisions): if not isinstance(dsk, HighLevelGraph): dsk = HighLevelGraph.from_collections(name, dsk, dependencies=[]) @@ -82,6 +84,9 @@ def __repr__(self): s = "<dask_cudf.%s | %d tasks | %d npartitions>" return s % (type(self).__name__, len(self.dask), self.npartitions) + @annotate( + "_FRAME_to_dask_dataframe", color="green", domain="dask_cudf_python" + ) def to_dask_dataframe(self, **kwargs): """Create a dask.dataframe object from a dask_cudf object""" nullable_pd_dtype = kwargs.get("nullable_pd_dtype", False) @@ -99,6 +104,9 @@ def to_dask_dataframe(self, **kwargs): class DataFrame(_Frame, dd.core.DataFrame): _partition_type = cudf.DataFrame + @annotate( + "DATAFRAME_assign_column", color="green", domain="dask_cudf_python" + ) def _assign_column(self, k, v): def assigner(df, k, v): out = df.copy() @@ -108,6 +116,7 @@ def assigner(df, k, v): meta = assigner(self._meta, k, dask_make_meta(v)) return self.map_partitions(assigner, k, v, meta=meta) + @annotate("DATAFRAME_apply_rows", color="green", domain="dask_cudf_python") def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None): import uuid @@ -127,6 +136,7 @@ def do_apply_rows(df, func, incols, outcols, kwargs): do_apply_rows, func, incols, outcols, kwargs, meta=meta ) + @annotate("DATAFRAME_merge", color="green", domain="dask_cudf_python") def merge(self, other, **kwargs): if kwargs.pop("shuffle", "tasks") != "tasks": raise ValueError( @@ -138,6 +148,7 @@ def merge(self, other, **kwargs): on = list(on) return super().merge(other, on=on, shuffle="tasks", **kwargs) + @annotate("DATAFRAME_join", color="green", domain="dask_cudf_python") def join(self, other, **kwargs): if kwargs.pop("shuffle", "tasks") != "tasks": raise ValueError( @@ -155,6 +166,7 @@ def join(self, other, **kwargs): on = list(on) return super().join(other, how=how, on=on, shuffle="tasks", **kwargs) + @annotate("DATAFRAME_set_index", color="green", domain="dask_cudf_python") def set_index(self, other, sorted=False, divisions=None, **kwargs): if kwargs.pop("shuffle", "tasks") != "tasks": raise ValueError( @@ -226,6 +238,9 @@ def set_index(self, other, sorted=False, divisions=None, **kwargs): **kwargs, ) + @annotate( + "DATAFRAME_sort_values", color="green", domain="dask_cudf_python" + ) def sort_values( self, by, @@ -261,12 +276,14 @@ def sort_values( return df.reset_index(drop=True) return df + @annotate("DATAFRAME_to_parquet", color="green", domain="dask_cudf_python") def to_parquet(self, path, *args, **kwargs): """Calls dask.dataframe.io.to_parquet with CudfEngine backend""" from dask_cudf.io import to_parquet return to_parquet(self, path, *args, **kwargs) + @annotate("DATAFRAME_to_orc", color="green", domain="dask_cudf_python") def to_orc(self, path, **kwargs): """Calls dask_cudf.io.to_orc""" from dask_cudf.io import to_orc @@ -274,6 +291,7 @@ def to_orc(self, path, **kwargs): return to_orc(self, path, **kwargs) @derived_from(pd.DataFrame) + @annotate("DATAFRAME_var", color="green", domain="dask_cudf_python") def var( self, axis=None, @@ -302,6 +320,9 @@ def var( else: return _parallel_var(self, meta, skipna, split_every, out) + @annotate( + "DATAFRAME_repartition", color="green", 
domain="dask_cudf_python" + ) def repartition(self, *args, **kwargs): """Wraps dask.dataframe DataFrame.repartition method. Uses DataFrame.shuffle if `columns=` is specified. @@ -324,6 +345,7 @@ def repartition(self, *args, **kwargs): ) return super().repartition(*args, **kwargs) + @annotate("DATAFRAME_shuffle", color="green", domain="dask_cudf_python") def shuffle(self, *args, **kwargs): """Wraps dask.dataframe DataFrame.shuffle method""" shuffle_arg = kwargs.pop("shuffle", None) @@ -331,18 +353,21 @@ def shuffle(self, *args, **kwargs): raise ValueError("dask_cudf does not support disk-based shuffle.") return super().shuffle(*args, shuffle="tasks", **kwargs) + @annotate("DATAFRAME_groupby", color="green", domain="dask_cudf_python") def groupby(self, by=None, **kwargs): from .groupby import CudfDataFrameGroupBy return CudfDataFrameGroupBy(self, by=by, **kwargs) +@annotate("DATAFRAME_sum_of_squares", color="green", domain="dask_cudf_python") def sum_of_squares(x): x = x.astype("f8")._column outcol = libcudf.reduce.reduce("sum_of_squares", x) return cudf.Series(outcol) +@annotate("DATAFRAME_var_aggregate", color="green", domain="dask_cudf_python") def var_aggregate(x2, x, n, ddof): try: with warnings.catch_warnings(record=True): @@ -355,10 +380,12 @@ def var_aggregate(x2, x, n, ddof): return np.float64(np.nan) +@annotate("DATAFRAME_nlargest_agg", color="green", domain="dask_cudf_python") def nlargest_agg(x, **kwargs): return cudf.concat(x).nlargest(**kwargs) +@annotate("DATAFRAME_nsmallest_agg", color="green", domain="dask_cudf_python") def nsmallest_agg(x, **kwargs): return cudf.concat(x).nsmallest(**kwargs) @@ -366,6 +393,7 @@ def nsmallest_agg(x, **kwargs): class Series(_Frame, dd.core.Series): _partition_type = cudf.Series + @annotate("Series_count", color="green", domain="dask_cudf_python") def count(self, split_every=False): return reduction( [self], @@ -375,12 +403,14 @@ def count(self, split_every=False): meta="i8", ) + @annotate("Series_mean", color="green", domain="dask_cudf_python") def mean(self, split_every=False): sum = self.sum(split_every=split_every) n = self.count(split_every=split_every) return sum / n @derived_from(pd.DataFrame) + @annotate("Series_var", color="green", domain="dask_cudf_python") def var( self, axis=None, @@ -409,16 +439,19 @@ def var( else: return _parallel_var(self, meta, skipna, split_every, out) + @annotate("Series_groupby", color="green", domain="dask_cudf_python") def groupby(self, *args, **kwargs): from .groupby import CudfSeriesGroupBy return CudfSeriesGroupBy(self, *args, **kwargs) @property + @annotate("Series_list", color="green", domain="dask_cudf_python") def list(self): return ListMethods(self) @property + @annotate("Series_struct", color="green", domain="dask_cudf_python") def struct(self): return StructMethods(self) @@ -427,6 +460,7 @@ class Index(Series, dd.core.Index): _partition_type = cudf.Index # type: ignore +@annotate("_naive_var", color="green", domain="dask_cudf_python") def _naive_var(ddf, meta, skipna, ddof, split_every, out): num = ddf._get_numeric_data() x = 1.0 * num.sum(skipna=skipna, split_every=split_every) @@ -441,6 +475,7 @@ def _naive_var(ddf, meta, skipna, ddof, split_every, out): return handle_out(out, result) +@annotate("_parallel_var", color="green", domain="dask_cudf_python") def _parallel_var(ddf, meta, skipna, split_every, out): def _local_var(x, skipna): if skipna: @@ -507,6 +542,7 @@ def _finalize_var(vals): return handle_out(out, result) +@annotate("_extract_meta", color="green", domain="dask_cudf_python") def 
_extract_meta(x): """ Extract internal cache data (``_meta``) from dask_cudf objects @@ -522,6 +558,7 @@ def _extract_meta(x): return x +@annotate("_emulate", color="green", domain="dask_cudf_python") def _emulate(func, *args, **kwargs): """ Apply a function using args / kwargs. If arguments contain dd.DataFrame / @@ -531,6 +568,7 @@ def _emulate(func, *args, **kwargs): return func(*_extract_meta(args), **_extract_meta(kwargs)) +@annotate("align_partitions", color="green", domain="dask_cudf_python") def align_partitions(args): """Align partitions between dask_cudf objects. @@ -546,6 +584,7 @@ def align_partitions(args): return args +@annotate("reduction", color="green", domain="dask_cudf_python") def reduction( args, chunk=None, @@ -684,6 +723,7 @@ def reduction( return dd.core.new_dd_object(graph, b, meta, (None, None)) +@annotate("from_cudf", color="green", domain="dask_cudf_python") def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): if isinstance(getattr(data, "index", None), cudf.MultiIndex): raise NotImplementedError( @@ -705,6 +745,7 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): ) +@annotate("from_dask_dataframe", color="green", domain="dask_cudf_python") def from_dask_dataframe(df): return df.map_partitions(cudf.from_pandas) diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 1bc270a5b9f..658e63ea923 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +from nvtx import annotate from dask.base import tokenize from dask.dataframe.core import ( @@ -35,11 +36,19 @@ class CudfDataFrameGroupBy(DataFrameGroupBy): + @annotate( + "CudfDataFrameGroupBy_INIT", color="green", domain="dask_cudf_python" + ) def __init__(self, *args, **kwargs): self.sep = kwargs.pop("sep", "___") self.as_index = kwargs.pop("as_index", True) super().__init__(*args, **kwargs) + @annotate( + "CudfDataFrameGroupBy_GETITEM", + color="green", + domain="dask_cudf_python", + ) def __getitem__(self, key): if isinstance(key, list): g = CudfDataFrameGroupBy( @@ -53,6 +62,9 @@ def __getitem__(self, key): g._meta = g._meta[key] return g + @annotate( + "CudfDataFrameGroupBy_MEAN", color="green", domain="dask_cudf_python" + ) def mean(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -66,6 +78,11 @@ def mean(self, split_every=None, split_out=1): as_index=self.as_index, ) + @annotate( + "CudfDataFrameGroupBy_COLLECT", + color="green", + domain="dask_cudf_python", + ) def collect(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -79,6 +96,11 @@ def collect(self, split_every=None, split_out=1): as_index=self.as_index, ) + @annotate( + "CudfDataFrameGroupBy_AGGREGATE", + color="green", + domain="dask_cudf_python", + ) def aggregate(self, arg, split_every=None, split_out=1): if arg == "size": return self.size() @@ -118,11 +140,17 @@ def aggregate(self, arg, split_every=None, split_out=1): class CudfSeriesGroupBy(SeriesGroupBy): + @annotate( + "CudfSeriesGroupBy_INIT", color="green", domain="dask_cudf_python" + ) def __init__(self, *args, **kwargs): self.sep = kwargs.pop("sep", "___") self.as_index = kwargs.pop("as_index", True) super().__init__(*args, **kwargs) + @annotate( + "CudfSeriesGroupBy_MEAN", color="green", domain="dask_cudf_python" + ) def mean(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -136,6 +164,9 @@ def mean(self, split_every=None, split_out=1): 
as_index=self.as_index, )[self._slice] + @annotate( + "CudfSeriesGroupBy_STD", color="green", domain="dask_cudf_python" + ) def std(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -149,6 +180,9 @@ def std(self, split_every=None, split_out=1): as_index=self.as_index, )[self._slice] + @annotate( + "CudfSeriesGroupBy_VAR", color="green", domain="dask_cudf_python" + ) def var(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -162,6 +196,9 @@ def var(self, split_every=None, split_out=1): as_index=self.as_index, )[self._slice] + @annotate( + "CudfSeriesGroupBy_COLLECT", color="green", domain="dask_cudf_python" + ) def collect(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -175,6 +212,9 @@ def collect(self, split_every=None, split_out=1): as_index=self.as_index, )[self._slice] + @annotate( + "CudfSeriesGroupBy_AGGREGATE", color="green", domain="dask_cudf_python" + ) def aggregate(self, arg, split_every=None, split_out=1): if arg == "size": return self.size() @@ -205,6 +245,7 @@ def aggregate(self, arg, split_every=None, split_out=1): ) +@annotate("groupby_agg", color="green", domain="dask_cudf_python") def groupby_agg( ddf, gb_cols, @@ -371,6 +412,7 @@ def groupby_agg( return new_dd_object(graph, gb_agg_name, _meta, divisions) +@annotate("_redirect_aggs", color="green", domain="dask_cudf_python") def _redirect_aggs(arg): """Redirect aggregations to their corresponding name in cuDF""" redirects = { @@ -397,6 +439,7 @@ def _redirect_aggs(arg): return redirects.get(arg, arg) +@annotate("_is_supported", color="green", domain="dask_cudf_python") def _is_supported(arg, supported: set): """Check that aggregations in `arg` are a subset of `supported`""" if isinstance(arg, (list, dict)): @@ -422,6 +465,7 @@ def _make_name(*args, sep="_"): return sep.join(_args) +@annotate("_groupby_partition_agg", color="green", domain="dask_cudf_python") def _groupby_partition_agg( df, gb_cols, aggs, columns, split_out, dropna, sort, sep ): @@ -479,6 +523,7 @@ def _groupby_partition_agg( return output +@annotate("_tree_node_agg", color="green", domain="dask_cudf_python") def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep): """Node in groupby-aggregation reduction tree. @@ -513,6 +558,7 @@ def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep): return gb +@annotate("_var_agg", color="green", domain="dask_cudf_python") def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): """Calculate variance (given count, sum, and sum-squared columns).""" @@ -534,6 +580,7 @@ def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): return var +@annotate("_finalize_gb_agg", color="green", domain="dask_cudf_python") def _finalize_gb_agg( gb, gb_cols, diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index af40d9ca41b..ada738c5a9b 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -1,9 +1,11 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
+ from collections.abc import Iterator import cupy import numpy as np import tlz as toolz +from nvtx import annotate from dask.base import tokenize from dask.dataframe import methods @@ -16,12 +18,14 @@ from cudf.api.types import is_categorical_dtype +@annotate("set_index_post", color="green", domain="dask_cudf_python") def set_index_post(df, index_name, drop, column_dtype): df2 = df.set_index(index_name, drop=drop) df2.columns = df2.columns.astype(column_dtype) return df2 +@annotate("_set_partitions_pre", color="green", domain="dask_cudf_python") def _set_partitions_pre(s, divisions, ascending=True, na_position="last"): if ascending: partitions = divisions.searchsorted(s, side="right") - 1 @@ -38,6 +42,7 @@ def _set_partitions_pre(s, divisions, ascending=True, na_position="last"): return partitions +@annotate("_quantile", color="green", domain="dask_cudf_python") def _quantile(a, q): n = len(a) if not len(a): @@ -45,6 +50,7 @@ def _quantile(a, q): return (a.quantiles(q=q.tolist(), interpolation="nearest"), n) +@annotate("merge_quantiles", color="green", domain="dask_cudf_python") def merge_quantiles(finalq, qs, vals): """Combine several quantile calculations of different data. [NOTE: Same logic as dask.array merge_percentiles] @@ -107,6 +113,7 @@ def _append_counts(val, count): return rv.reset_index(drop=True) +@annotate("_approximate_quantile", color="green", domain="dask_cudf_python") def _approximate_quantile(df, q): """Approximate quantiles of DataFrame or Series. [NOTE: Same logic as dask.dataframe Series quantile] @@ -180,6 +187,7 @@ def set_quantile_index(df): return df +@annotate("quantile_divisions", color="green", domain="dask_cudf_python") def quantile_divisions(df, by, npartitions): qn = np.linspace(0.0, 1.0, npartitions + 1).tolist() divisions = _approximate_quantile(df[by], qn).compute() @@ -213,6 +221,7 @@ def quantile_divisions(df, by, npartitions): return divisions +@annotate("sort_values", color="green", domain="dask_cudf_python") def sort_values( df, by, From 317553f40522cb3e457a37682ecbc890a910298b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 11 Feb 2022 19:04:03 -0600 Subject: [PATCH 16/47] Remove making redundant `copy` across code-base (#10257) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR removes some of the existing `copy` overheads in the code-base. All scan operations with `skipna=False`: ```python # branch-22.04: In [3]: df = cudf.DataFrame( ...: { ...: "a": [1, 2, 3, np.nan] * 10000000, ...: "c": [0.0, 0.12, np.nan, 10.12] * 10000000, ...: }, ...: nan_as_null=False, ...: ) In [3]: %timeit df.cummin(skipna=False) 27.6 ms ± 587 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) # THIS PR: In [3]: %timeit df.cummin(skipna=False) 14.3 ms ± 43 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) ``` `assign`: ```python # branch-22.04: In [3]: %timeit df.assign(b=10, c=11, d=12) 36.5 ms ± 1.16 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # THIS PR: In [3]: %timeit df.assign(b=10, c=11, d=12) 27.6 ms ± 8.73 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) ``` `replace`: ```python In [10]: df Out[10]: a c 0 1 0.00 1 2 0.12 2 3 10.12 3 1 0.00 4 2 0.12 ... .. ... 29999995 2 0.12 29999996 3 10.12 29999997 1 0.00 29999998 2 0.12 29999999 3 10.12 [30000000 rows x 2 columns] # branch-22.04 In [11]: %timeit df.replace([1, 2, 3], [4, 5, 6]) 25.7 ms ± 157 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each) # THIS PR: In [3]: %timeit df.replace([1, 2, 3], [4, 5, 6]) 14.5 ms ± 52.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) ``` `nans_to_nulls`: ```python # branch-22.04: In [3]: df = cudf.DataFrame( ...: { ...: "a": [1, 2, 3, np.nan] * 10000000, ...: "b": ["a", "b", "c", "d"] * 10000000, ...: "c": [0.0, 0.12, np.nan, 10.12] * 10000000, ...: }, ...: nan_as_null=False, ...: ) In [4]: %timeit df.nans_to_nulls() 32.1 ms ± 269 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) # THIS PR: In [5]: %timeit df.nans_to_nulls() 10.3 ms ± 28.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) ``` `Series.astype`: ```python # branch-22.04: In [11]: s.nans_to_nulls() Out[11]: 0 1.0 1 2.0 2 3.0 3 <NA> 4 1.0 ... 39999995 <NA> 39999996 1.0 39999997 2.0 39999998 3.0 39999999 <NA> Name: a, Length: 40000000, dtype: float64 In [12]: s = s.nans_to_nulls() In [11]: %timeit s.astype('int64', copy=True) 18.2 ms ± 20.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) # THIS PR: In [8]: %timeit s.astype('int64', copy=True) 9.19 ms ± 21.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) ``` `Series.add_prefix/add_suffix`: ```python In [8]: s Out[8]: 0 a 1 b 2 c 3 d 4 a .. 39999995 d 39999996 a 39999997 b 39999998 c 39999999 d Name: b, Length: 40000000, dtype: object In [6]: s.index Out[6]: StringIndex(['0' '1' '2' ... '39999997' '39999998' '39999999'], dtype='object') # branch-22.04: In [4]: %timeit s.add_prefix("abcdef") 63.5 ms ± 281 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) # THIS PR: In [7]: %timeit s.add_prefix("abcdef") 60.9 ms ± 64.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) ``` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10257 --- python/cudf/cudf/core/_base_index.py | 6 +- python/cudf/cudf/core/column/categorical.py | 63 +++++++++++++-------- python/cudf/cudf/core/column/column.py | 34 ++++------- python/cudf/cudf/core/column/numerical.py | 43 +++++++++++--- python/cudf/cudf/core/column/string.py | 6 +- python/cudf/cudf/core/dataframe.py | 16 ++++-- python/cudf/cudf/core/frame.py | 43 ++++++++------ python/cudf/cudf/core/indexed_frame.py | 26 +++++---- python/cudf/cudf/core/series.py | 19 +++---- python/cudf/cudf/io/orc.py | 4 +- 10 files changed, 155 insertions(+), 105 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 2e6f138d2e3..cdb1e9fc86f 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. from __future__ import annotations @@ -1201,9 +1201,9 @@ def rename(self, name, inplace=False): self.name = name return None else: - out = self.copy(deep=False) + out = self.copy(deep=True) out.name = name - return out.copy(deep=True) + return out def astype(self, dtype, copy=False): """ diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 24f9dc83ca9..f57d88fb39d 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. 
from __future__ import annotations @@ -1048,7 +1048,9 @@ def find_and_replace( f"got to_replace dtype: {to_replace_col.dtype} and " f"value dtype: {replacement_col.dtype}" ) - df = cudf.DataFrame({"old": to_replace_col, "new": replacement_col}) + df = cudf.DataFrame._from_data( + {"old": to_replace_col, "new": replacement_col} + ) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: fill_value = df._data["new"][df._data["old"].isnull()][0] @@ -1058,8 +1060,7 @@ def find_and_replace( new_categories = self.categories.append( column.as_column([fill_value]) ) - replaced = self.copy() - replaced = replaced._set_categories(new_categories) + replaced = self._set_categories(new_categories) replaced = replaced.fillna(fill_value) df = df.dropna(subset=["old"]) to_replace_col = df._data["old"] @@ -1078,35 +1079,41 @@ def find_and_replace( replacement_col = df._data["new"] # create a dataframe containing the pre-replacement categories - # and a copy of them to work with. The index of this dataframe - # represents the original ints that map to the categories - old_cats = cudf.DataFrame() - old_cats["cats"] = column.as_column(replaced.dtype.categories) - new_cats = old_cats.copy(deep=True) - - # Create a column with the appropriate labels replaced - old_cats["cats_replace"] = old_cats["cats"].replace( - to_replace_col, replacement_col + # and a column with the appropriate labels replaced. + # The index of this dataframe represents the original + # ints that map to the categories + cats_col = column.as_column(replaced.dtype.categories) + old_cats = cudf.DataFrame._from_data( + { + "cats": cats_col, + "cats_replace": cats_col.find_and_replace( + to_replace_col, replacement_col + ), + } ) # Construct the new categorical labels # If a category is being replaced by an existing one, we # want to map it to None. 
If it's totally new, we want to # map it to the new label it is to be replaced by - dtype_replace = cudf.Series(replacement_col) - dtype_replace[dtype_replace.isin(old_cats["cats"])] = None - new_cats["cats"] = new_cats["cats"].replace( - to_replace_col, dtype_replace + dtype_replace = cudf.Series._from_data({None: replacement_col}) + dtype_replace[dtype_replace.isin(cats_col)] = None + new_cats_col = cats_col.find_and_replace( + to_replace_col, dtype_replace._column ) # anything we mapped to None, we want to now filter out since # those categories don't exist anymore # Resetting the index creates a column 'index' that associates # the original integers to the new labels - bmask = new_cats._data["cats"].notnull() - new_cats = cudf.DataFrame( - {"cats": new_cats._data["cats"].apply_boolean_mask(bmask)} - ).reset_index() + bmask = new_cats_col.notnull() + new_cats_col = new_cats_col.apply_boolean_mask(bmask) + new_cats = cudf.DataFrame._from_data( + { + "index": cudf.core.column.arange(len(new_cats_col)), + "cats": new_cats_col, + } + ) # old_cats contains replaced categories and the ints that # previously mapped to those categories and the index of @@ -1507,9 +1514,15 @@ def _set_categories( old_codes = column.arange(len(cur_cats), dtype=out_code_dtype) new_codes = column.arange(len(new_cats), dtype=out_code_dtype) - new_df = cudf.DataFrame({"new_codes": new_codes, "cats": new_cats}) - old_df = cudf.DataFrame({"old_codes": old_codes, "cats": cur_cats}) - cur_df = cudf.DataFrame({"old_codes": cur_codes, "order": cur_order}) + new_df = cudf.DataFrame._from_data( + data={"new_codes": new_codes, "cats": new_cats} + ) + old_df = cudf.DataFrame._from_data( + data={"old_codes": old_codes, "cats": cur_cats} + ) + cur_df = cudf.DataFrame._from_data( + data={"old_codes": cur_codes, "order": cur_order} + ) # Join the old and new categories and line up their codes df = old_df.merge(new_df, on="cats", how="left") @@ -1519,7 +1532,7 @@ def _set_categories( df.reset_index(drop=True, inplace=True) ordered = ordered if ordered is not None else self.ordered - new_codes = df["new_codes"]._column + new_codes = df._data["new_codes"] # codes can't have masks, so take mask out before moving in return column.build_categorical_column( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e5669fd44aa..2535ba5ab8d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -177,34 +177,28 @@ def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool: return self.binary_operator("NULL_EQUALS", other).all() def all(self, skipna: bool = True) -> bool: + # The skipna argument is only used for numerical columns. # If all entries are null the result is True, including when the column # is empty. - result_col = self.nans_to_nulls() if skipna else self - if result_col.null_count == result_col.size: + if self.null_count == self.size: return True - if isinstance(result_col, ColumnBase): - return libcudf.reduce.reduce("all", result_col, dtype=np.bool_) - - return result_col + return libcudf.reduce.reduce("all", self, dtype=np.bool_) def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. 
- result_col = self.nans_to_nulls() if skipna else self - if not skipna and result_col.has_nulls(): + + if not skipna and self.has_nulls(): return True - elif skipna and result_col.null_count == result_col.size: + elif skipna and self.null_count == self.size: return False - if isinstance(result_col, ColumnBase): - return libcudf.reduce.reduce("any", result_col, dtype=np.bool_) - - return result_col + return libcudf.reduce.reduce("any", self, dtype=np.bool_) def dropna(self, drop_nan: bool = False) -> ColumnBase: - col = self.nans_to_nulls() if drop_nan else self - return drop_nulls([col])[0] + # The drop_nan argument is only used for numerical columns. + return drop_nulls([self])[0] def to_arrow(self) -> pa.Array: """Convert to PyArrow Array @@ -1164,9 +1158,6 @@ def corr(self, other: ColumnBase): f"cannot perform corr with types {self.dtype}, {other.dtype}" ) - def nans_to_nulls(self: T) -> T: - return self - @property def contains_na_entries(self) -> bool: return self.null_count != 0 @@ -1177,14 +1168,13 @@ def _process_for_reduction( skipna = True if skipna is None else skipna if skipna: - result_col = self.nans_to_nulls() - if result_col.has_nulls(): - result_col = result_col.dropna() + if self.has_nulls(): + result_col = self.dropna() else: if self.has_nulls(): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - result_col = self + result_col = self if min_count > 0: valid_count = len(result_col) - result_col.null_count diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 0e32c530e69..2c483cbd00b 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -20,6 +20,7 @@ import cudf from cudf import _lib as libcudf +from cudf._lib.stream_compaction import drop_nulls from cudf._typing import BinaryOperand, ColumnLike, Dtype, DtypeObj, ScalarLike from cudf.api.types import is_integer_dtype, is_number from cudf.core.buffer import Buffer @@ -316,6 +317,27 @@ def as_numerical_column(self, dtype: Dtype, **kwargs) -> NumericalColumn: return self return libcudf.unary.cast(self, dtype) + def all(self, skipna: bool = True) -> bool: + # If all entries are null the result is True, including when the column + # is empty. + result_col = self.nans_to_nulls() if skipna else self + + if result_col.null_count == result_col.size: + return True + + return libcudf.reduce.reduce("all", result_col, dtype=np.bool_) + + def any(self, skipna: bool = True) -> bool: + # Early exit for fast cases. 
+ result_col = self.nans_to_nulls() if skipna else self + + if not skipna and result_col.has_nulls(): + return True + elif skipna and result_col.null_count == result_col.size: + return False + + return libcudf.reduce.reduce("any", result_col, dtype=np.bool_) + @property def nan_count(self) -> int: if self.dtype.kind != "f": @@ -325,6 +347,10 @@ def nan_count(self) -> int: self._nan_count = nan_col.sum() return self._nan_count + def dropna(self, drop_nan: bool = False) -> NumericalColumn: + col = self.nans_to_nulls() if drop_nan else self + return drop_nulls([col])[0] + @property def contains_na_entries(self) -> bool: return (self.nan_count != 0) or (self.null_count != 0) @@ -350,12 +376,14 @@ def _can_return_nan(self, skipna: bool = None) -> bool: def _process_for_reduction( self, skipna: bool = None, min_count: int = 0 - ) -> Union[ColumnBase, ScalarLike]: + ) -> Union[NumericalColumn, ScalarLike]: skipna = True if skipna is None else skipna if self._can_return_nan(skipna=skipna): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - return super()._process_for_reduction( + + col = self.nans_to_nulls() if skipna else self + return super(NumericalColumn, col)._process_for_reduction( skipna=skipna, min_count=min_count ) @@ -403,7 +431,6 @@ def find_and_replace( replacement_col = _normalize_find_and_replace_input( self.dtype, replacement ) - replaced = self.copy() if len(replacement_col) == 1 and len(to_replace_col) > 1: replacement_col = column.as_column( utils.scalar_broadcast_to( @@ -411,11 +438,13 @@ def find_and_replace( ) ) elif len(replacement_col) == 1 and len(to_replace_col) == 0: - return replaced + return self.copy() to_replace_col, replacement_col, replaced = numeric_normalize_types( - to_replace_col, replacement_col, replaced + to_replace_col, replacement_col, self + ) + df = cudf.DataFrame._from_data( + {"old": to_replace_col, "new": replacement_col} ) - df = cudf.DataFrame({"old": to_replace_col, "new": replacement_col}) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: replaced = replaced.fillna( @@ -424,7 +453,7 @@ def find_and_replace( df = df.dropna(subset=["old"]) return libcudf.replace.replace( - replaced, df["old"]._column, df["new"]._column + replaced, df._data["old"], df._data["new"] ) def fillna( diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 415200521ac..5e40356ed0f 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5339,7 +5339,9 @@ def find_and_replace( and replacement_col.dtype != self.dtype ): return self.copy() - df = cudf.DataFrame({"old": to_replace_col, "new": replacement_col}) + df = cudf.DataFrame._from_data( + {"old": to_replace_col, "new": replacement_col} + ) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: res = self.fillna(df._data["new"][df._data["old"].isnull()][0]) @@ -5411,7 +5413,7 @@ def binary_operator( return cast( "column.ColumnBase", libstrings.concatenate( - cudf.DataFrame({0: lhs, 1: rhs}), + cudf.DataFrame._from_data(data={0: lhs, 1: rhs}), sep=cudf.Scalar(""), na_rep=cudf.Scalar(None, "str"), ), diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c3c79135c34..371404ca477 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1344,10 +1344,16 @@ def assign(self, **kwargs): 1 1 4 2 2 5 """ - new = self.copy() + new_df = 
cudf.DataFrame(index=self.index.copy()) + for name, col in self._data.items(): + if name in kwargs: + new_df[name] = kwargs.pop(name) + else: + new_df._data[name] = col.copy() + for k, v in kwargs.items(): - new[k] = v - return new + new_df[k] = v + return new_df @classmethod @annotate("CONCAT", color="orange", domain="cudf_python") @@ -2315,7 +2321,7 @@ def set_index( 3 4 d 4.0 4 5 e 5.0 - Set the index to become the ‘b’ column: + Set the index to become the 'b' column: >>> df.set_index('b') a c @@ -2326,7 +2332,7 @@ def set_index( d 4 4.0 e 5 5.0 - Create a MultiIndex using columns ‘a’ and ‘b’: + Create a MultiIndex using columns 'a' and 'b': >>> df.set_index(["a", "b"]) c diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 2d073d46891..6e46c107d2e 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1313,13 +1313,17 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): else: thresh = len(df) - for col in self._data.names: + for name, col in df._data.items(): + try: + check_col = col.nans_to_nulls() + except AttributeError: + check_col = col no_threshold_valid_count = ( - len(df[col]) - df[col].nans_to_nulls().null_count + len(col) - check_col.null_count ) < thresh if no_threshold_valid_count: continue - out_cols.append(col) + out_cols.append(name) return self[out_cols] @@ -2193,7 +2197,7 @@ def replace( ) if not (to_replace is None and value is None): - copy_data = self._data.copy(deep=False) + copy_data = {} ( all_na_per_column, to_replace_per_column, @@ -2202,11 +2206,11 @@ def replace( to_replace=to_replace, value=value, columns_dtype_map={ - col: copy_data._data[col].dtype for col in copy_data._data + col: self._data[col].dtype for col in self._data }, ) - for name, col in copy_data.items(): + for name, col in self._data.items(): try: copy_data[name] = col.find_and_replace( to_replace_per_column[name], @@ -4477,16 +4481,21 @@ def _scan(self, op, axis=None, skipna=True, cast_to_int=False): results = {} for name, col in self._data.items(): if skipna: - result_col = self._data[name].nans_to_nulls() + try: + result_col = col.nans_to_nulls() + except AttributeError: + result_col = col else: - result_col = self._data[name].copy() - if result_col.has_nulls(include_nan=True): + if col.has_nulls(include_nan=True): # Workaround as find_first_value doesn't seem to work # incase of bools. first_index = int( - result_col.isnull().astype("int8").find_first_value(1) + col.isnull().astype("int8").find_first_value(1) ) + result_col = col.copy() result_col[first_index:] = None + else: + result_col = col if ( cast_to_int @@ -4920,13 +4929,13 @@ def nans_to_nulls(self): 1 3.14 2 """ - return self._from_data( - { - name: col.copy().nans_to_nulls() - for name, col in self._data.items() - }, - self._index, - ) + result_data = {} + for name, col in self._data.items(): + try: + result_data[name] = col.nans_to_nulls() + except AttributeError: + result_data[name] = col.copy() + return self._from_data(result_data, self._index) @annotate("FRAME_INVERT", color="green", domain="cudf_python") def __invert__(self): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index fab5d75f62b..10b9f2396bb 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
"""Base class for Frame types that have an index.""" from __future__ import annotations @@ -24,7 +24,6 @@ is_integer_dtype, is_list_like, ) -from cudf.core.column import arange, as_column from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame from cudf.core.index import Index, RangeIndex, _index_from_columns @@ -59,15 +58,14 @@ def _indices_from_labels(obj, labels): - from cudf.core.column import column if not isinstance(labels, cudf.MultiIndex): - labels = column.as_column(labels) + labels = cudf.core.column.as_column(labels) if is_categorical_dtype(obj.index): labels = labels.astype("category") codes = labels.codes.astype(obj.index._values.codes.dtype) - labels = column.build_categorical_column( + labels = cudf.core.column.build_categorical_column( categories=labels.dtype.categories, codes=codes, ordered=labels.dtype.ordered, @@ -78,8 +76,12 @@ def _indices_from_labels(obj, labels): # join is not guaranteed to maintain the index ordering # so we will sort it with its initial ordering which is stored # in column "__" - lhs = cudf.DataFrame({"__": arange(len(labels))}, index=labels) - rhs = cudf.DataFrame({"_": arange(len(obj))}, index=obj.index) + lhs = cudf.DataFrame( + {"__": cudf.core.column.arange(len(labels))}, index=labels + ) + rhs = cudf.DataFrame( + {"_": cudf.core.column.arange(len(obj))}, index=obj.index + ) return lhs.join(rhs).sort_values("__")["_"] @@ -334,7 +336,7 @@ def sort_index( Parameters ---------- - axis : {0 or ‘index’, 1 or ‘columns’}, default 0 + axis : {0 or 'index', 1 or 'columns'}, default 0 The axis along which to sort. The value 0 identifies the rows, and 1 identifies the columns. level : int or level name or list of ints or list of level names @@ -346,7 +348,7 @@ def sort_index( If True, perform operation in-place. kind : sorting method such as `quick sort` and others. Not yet supported. - na_position : {‘first’, ‘last’}, default ‘last’ + na_position : {'first', 'last'}, default 'last' Puts NaNs at the beginning if first; last puts NaNs at the end. sort_remaining : bool, default True Not yet supported @@ -858,7 +860,7 @@ def _apply(self, func, kernel_getter, *args, **kwargs): except Exception as e: raise RuntimeError("UDF kernel execution failed.") from e - col = as_column(ans_col) + col = cudf.core.column.as_column(ans_col) col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask)) result = cudf.Series._from_data({None: col}, self._index) @@ -1000,9 +1002,9 @@ def _align_to_index( # to recover ordering after index alignment. 
sort_col_id = str(uuid4()) if how == "left": - lhs[sort_col_id] = arange(len(lhs)) + lhs[sort_col_id] = cudf.core.column.arange(len(lhs)) elif how == "right": - rhs[sort_col_id] = arange(len(rhs)) + rhs[sort_col_id] = cudf.core.column.arange(len(rhs)) result = lhs.join(rhs, how=how, sort=sort) if how in ("left", "right"): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index ddaa358e9d9..90ebeba5087 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1926,10 +1926,7 @@ def astype(self, dtype, copy=False, errors="raise"): try: data = self._column.astype(dtype) - return self._from_data( - {self.name: (data.copy(deep=True) if copy else data)}, - index=self._index, - ) + return self._from_data({self.name: data}, index=self._index) except Exception as e: if errors == "raise": @@ -3430,14 +3427,16 @@ def merge( return result def add_prefix(self, prefix): - result = self.copy(deep=True) - result.index = prefix + self.index.astype(str) - return result + return Series._from_data( + data=self._data.copy(deep=True), + index=prefix + self.index.astype(str), + ) def add_suffix(self, suffix): - result = self.copy(deep=True) - result.index = self.index.astype(str) + suffix - return result + return Series._from_data( + data=self._data.copy(deep=True), + index=self.index.astype(str) + suffix, + ) def keys(self): """ diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index a09fb1f8e12..62260cbb822 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -20,8 +20,8 @@ def _make_empty_df(filepath_or_buffer, columns): orc_file = orc.ORCFile(filepath_or_buffer) schema = orc_file.schema col_names = schema.names if columns is None else columns - return cudf.DataFrame( - { + return cudf.DataFrame._from_data( + data={ col_name: cudf.core.column.column_empty( row_count=0, dtype=schema.field(col_name).type.to_pandas_dtype(), From 7f2a16f7aaacc343b4b14a2da9321713f6351860 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sat, 12 Feb 2022 00:47:21 -0500 Subject: [PATCH 17/47] Improve hash join detail functions (#10273) This PR includes several changes in the hash join `detail` functions: - Fixes a bug where public join APIs were invoked in `detail` join functions. External invocations are replaced with the corresponding `detail` ones. - Uses structured bindings to improve code readability - Add an early exit before the actual join operation to improve performance Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10273 --- cpp/src/join/join.cu | 65 +++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index ef9e7867a2d..7a478ca2eb3 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include -#include +#include "join/hash_join.cuh" +#include "join/join_common_utils.hpp" #include #include @@ -51,8 +51,8 @@ inner_join(table_view const& left_input, // build the hash map from the smaller table. 
if (right.num_rows() > left.num_rows()) { cudf::hash_join hj_obj(left, compare_nulls, stream); - auto result = hj_obj.inner_join(right, std::nullopt, stream, mr); - return std::make_pair(std::move(result.second), std::move(result.first)); + auto [right_result, left_result] = hj_obj.inner_join(right, std::nullopt, stream, mr); + return std::make_pair(std::move(left_result), std::move(right_result)); } else { cudf::hash_join hj_obj(right, compare_nulls, stream); return hj_obj.inner_join(left, std::nullopt, stream, mr); @@ -78,16 +78,17 @@ std::unique_ptr<table>
inner_join(table_view const& left_input, auto const left = scatter_columns(matched.second.front(), left_on, left_input); auto const right = scatter_columns(matched.second.back(), right_on, right_input); - auto join_indices = inner_join(left.select(left_on), right.select(right_on), compare_nulls, mr); + auto const [left_join_indices, right_join_indices] = cudf::detail::inner_join( left.select(left_on), right.select(right_on), compare_nulls, stream, mr); std::unique_ptr<table>
left_result = detail::gather(left, - join_indices.first->begin(), - join_indices.first->end(), + left_join_indices->begin(), + left_join_indices->end(), out_of_bounds_policy::DONT_CHECK, stream, mr); std::unique_ptr<table>
right_result = detail::gather(right, - join_indices.second->begin(), - join_indices.second->end(), + right_join_indices->begin(), + right_join_indices->end(), out_of_bounds_policy::DONT_CHECK, stream, mr); @@ -134,23 +135,24 @@ std::unique_ptr<table>
left_join(table_view const& left_input, table_view const left = scatter_columns(matched.second.front(), left_on, left_input); table_view const right = scatter_columns(matched.second.back(), right_on, right_input); - auto join_indices = left_join(left.select(left_on), right.select(right_on), compare_nulls); - - if ((left_on.empty() || right_on.empty()) || - is_trivial_join(left, right, cudf::detail::join_kind::LEFT_JOIN)) { - auto probe_build_pair = get_empty_joined_table(left, right); - return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), - std::move(probe_build_pair.second)); + if ((left_on.empty() or right_on.empty()) or + cudf::detail::is_trivial_join(left, right, cudf::detail::join_kind::LEFT_JOIN)) { + auto [left_empty_table, right_empty_table] = get_empty_joined_table(left, right); + return cudf::detail::combine_table_pair(std::move(left_empty_table), + std::move(right_empty_table)); } + + auto const [left_join_indices, right_join_indices] = cudf::detail::left_join( left.select(left_on), right.select(right_on), compare_nulls, stream, mr); std::unique_ptr<table>
left_result = detail::gather(left, - join_indices.first->begin(), - join_indices.first->end(), + left_join_indices->begin(), + left_join_indices->end(), out_of_bounds_policy::NULLIFY, stream, mr); std::unique_ptr<table>
right_result = detail::gather(right, - join_indices.second->begin(), - join_indices.second->end(), + right_join_indices->begin(), + right_join_indices->end(), out_of_bounds_policy::NULLIFY, stream, mr); @@ -197,23 +199,24 @@ std::unique_ptr<table>
full_join(table_view const& left_input, table_view const left = scatter_columns(matched.second.front(), left_on, left_input); table_view const right = scatter_columns(matched.second.back(), right_on, right_input); - auto join_indices = full_join(left.select(left_on), right.select(right_on), compare_nulls); - - if ((left_on.empty() || right_on.empty()) || - is_trivial_join(left, right, cudf::detail::join_kind::FULL_JOIN)) { - auto probe_build_pair = get_empty_joined_table(left, right); - return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), - std::move(probe_build_pair.second)); + if ((left_on.empty() or right_on.empty()) or + cudf::detail::is_trivial_join(left, right, cudf::detail::join_kind::FULL_JOIN)) { + auto [left_empty_table, right_empty_table] = get_empty_joined_table(left, right); + return cudf::detail::combine_table_pair(std::move(left_empty_table), + std::move(right_empty_table)); } + + auto const [left_join_indices, right_join_indices] = cudf::detail::full_join( left.select(left_on), right.select(right_on), compare_nulls, stream, mr); std::unique_ptr<table>
left_result = detail::gather(left, - join_indices.first->begin(), - join_indices.first->end(), + left_join_indices->begin(), + left_join_indices->end(), out_of_bounds_policy::NULLIFY, stream, mr); std::unique_ptr<table>
right_result = detail::gather(right, - join_indices.second->begin(), - join_indices.second->end(), + right_join_indices->begin(), + right_join_indices->end(), out_of_bounds_policy::NULLIFY, stream, mr); From b977d1ea7927822bdbf9482f3e1b6cd5ceec47d3 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 14 Feb 2022 10:00:16 -0500 Subject: [PATCH 18/47] Fix out-of-memory error in UrlDecode benchmark (#10258) Fixes out-of-memory error that occurs when running the `STRINGS_BENCH` `UrlDecode` benchmark combined with any other benchmark. The following minimal command shows the error: ``` benchmarks/STRINGS_BENCH '--benchmark_filter=UrlDecode|translate_large/16' ... ------------------------------------------------------------------------------------------------------------------ Benchmark Time CPU Iterations UserCounters... ------------------------------------------------------------------------------------------------------------------ StringTranslate/translate_large/16777216/32/manual_time 17.4 ms 17.5 ms 40 bytes_per_second=15.8359G/s terminate called after throwing an instance of 'rmm::out_of_memory' what(): std::bad_alloc: out_of_memory: RMM failure at:/cudf/cpp/build/_deps/rmm-src/include/rmm/mr/device/pool_memory_resource.hpp:192: Maximum pool size exceeded ``` The `UrlDecode` is the only benchmark in the `STRINGS_BENCH` using a `TEMPLATED_BENCHMARK_F` which causes a new separate memory pool to be created instead of reusing the one already created from the previous benchmark. This PR reworks the benchmark macros in `url_decode.cpp` to avoid using `TEMPLATED_BENCHMARK_F` which fixes the issue. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - https://github.com/nvdbaranec - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/10258 --- cpp/benchmarks/string/url_decode.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/cpp/benchmarks/string/url_decode.cpp b/cpp/benchmarks/string/url_decode.cpp index 4dc77cffa1a..6dc79c44437 100644 --- a/cpp/benchmarks/string/url_decode.cpp +++ b/cpp/benchmarks/string/url_decode.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -70,8 +70,7 @@ cudf::test::strings_column_wrapper generate_column(cudf::size_type num_rows, class UrlDecode : public cudf::benchmark { }; -template -void BM_url_decode(benchmark::State& state) +void BM_url_decode(benchmark::State& state, int esc_seq_pct) { cudf::size_type const num_rows = state.range(0); cudf::size_type const chars_per_row = state.range(1); @@ -88,12 +87,14 @@ void BM_url_decode(benchmark::State& state) (chars_per_row + sizeof(cudf::size_type))); } -#define URLD_BENCHMARK_DEFINE(esc_seq_pct) \ - TEMPLATED_BENCHMARK_F(UrlDecode, BM_url_decode, esc_seq_pct) \ - ->Args({100000000, 10}) \ - ->Args({10000000, 100}) \ - ->Args({1000000, 1000}) \ - ->Unit(benchmark::kMillisecond) \ +#define URLD_BENCHMARK_DEFINE(esc_seq_pct) \ + BENCHMARK_DEFINE_F(UrlDecode, esc_seq_pct) \ + (::benchmark::State & st) { BM_url_decode(st, esc_seq_pct); } \ + BENCHMARK_REGISTER_F(UrlDecode, esc_seq_pct) \ + ->Args({100000000, 10}) \ + ->Args({10000000, 100}) \ + ->Args({1000000, 1000}) \ + ->Unit(benchmark::kMillisecond) \ ->UseManualTime(); URLD_BENCHMARK_DEFINE(10) From 463266f1a9dfeda5c7095a20c239a817e3e93826 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 14 Feb 2022 10:00:47 -0500 Subject: [PATCH 19/47] Fix out-of-memory error in compiled-binaryop benchmark (#10269) Fixes out-of-memory error that occurs when running the `BINARYOP_BENCH` `COMPILED_BINARYOP` benchmark combined with the `BINARYOP` benchmark. The following minimal command shows the error: ``` benchmarks/BINARYOP_BENCH '--benchmark_filter=COMPILED_BINARYOP|100000000/10' ... BINARYOP/binaryop_double_imbalanced_unique/100000000/10/manual_time 40.4 ms 40.4 ms 17 bytes_per_second=202.953G/s terminate called after throwing an instance of 'rmm::out_of_memory' what(): std::bad_alloc: out_of_memory: RMM failure at:/conda/envs/rapids/include/rmm/mr/device/pool_memory_resource.hpp:192: Maximum pool size exceeded Aborted (core dumped) ``` The `COMPILED_BINARYOP` is using a `TEMPLATED_BENCHMARK_F` macro which causes a new separate memory pool to be created instead of reusing the one already created by `BINARYOP`. This PR reworks the benchmark macros in `compiled_binaryop.cpp` to avoid using `TEMPLATED_BENCHMARK_F` allowing it share the existing memory pool. Similar to #10258 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - MithunR (https://github.com/mythrocks) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/10269 --- cpp/benchmarks/binaryop/compiled_binaryop.cpp | 37 +++++++++++-------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp index 745d4e354e7..f8226c7387a 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,8 +27,8 @@ class COMPILED_BINARYOP : public cudf::benchmark { }; -template -void BM_compiled_binaryop(benchmark::State& state) +template +void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) { const cudf::size_type column_size{(cudf::size_type)state.range(0)}; @@ -50,21 +50,26 @@ void BM_compiled_binaryop(benchmark::State& state) } // TODO tparam boolean for null. -#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \ - TEMPLATED_BENCHMARK_F(COMPILED_BINARYOP, \ - BM_compiled_binaryop, \ - TypeLhs, \ - TypeRhs, \ - TypeOut, \ - cudf::binary_operator::binop) \ - ->Unit(benchmark::kMicrosecond) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ \ +#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout) \ + BENCHMARK_DEFINE_F(COMPILED_BINARYOP, name) \ + (::benchmark::State & st) \ + { \ + BM_compiled_binaryop(st, cudf::binary_operator::bop); \ + } \ + BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name) \ + ->Unit(benchmark::kMicrosecond) \ + ->UseManualTime() \ + ->Arg(10000) /* 10k */ \ + ->Arg(100000) /* 100k */ \ + ->Arg(1000000) /* 1M */ \ + ->Arg(10000000) /* 10M */ \ ->Arg(100000000); /* 100M */ +#define build_name(a, b, c, d) a##_##b##_##c##_##d + +#define BINARYOP_BENCHMARK_DEFINE(lhs, rhs, bop, tout) \ + BM_BINARYOP_BENCHMARK_DEFINE(build_name(bop, lhs, rhs, tout), lhs, rhs, bop, tout) + using namespace cudf; using namespace numeric; From 7025c40c16bfea9b37ac9110f62b7b23a2c51725 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 14 Feb 2022 09:01:45 -0700 Subject: [PATCH 20/47] Add JNI for `strings::split_re` and `strings::split_record_re` (#10139) This PR adds Java binding for the new strings API `strings::split_re` and `strings::split_record_re`, which allows splitting strings by regular expression delimiters. In addition, the Java string split overloads with default split pattern (an empty string) are removed in this PR. That is because with default empty pattern the Java's split API produces different results than cudf. Finally, some cleanup has been perform automatically thanks to IntelliJ IDE. Depends on https://github.com/rapidsai/cudf/pull/10128. This is breaking change which is fixed by https://github.com/NVIDIA/spark-rapids/pull/4714. Thus, it should be merged at the same time with https://github.com/NVIDIA/spark-rapids/pull/4714. Authors: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Andy Grove (https://github.com/andygrove) URL: https://github.com/rapidsai/cudf/pull/10139 --- .../main/java/ai/rapids/cudf/ColumnView.java | 215 +++++++++++------- java/src/main/native/src/ColumnViewJni.cpp | 81 +++++-- .../java/ai/rapids/cudf/ColumnVectorTest.java | 129 +++++++---- 3 files changed, 283 insertions(+), 142 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 3ff2a370e4f..f91ee5535b1 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -826,18 +826,18 @@ public final ColumnVector mergeAndSetValidity(BinaryOp mergeOp, ColumnView... co /** * Creates a deep copy of a column while replacing the validity mask. The validity mask is the * device_vector equivalent of the boolean column given as argument. 
- * + * * The boolColumn must have the same number of rows as the current column. - * The result column will have the same number of rows as the current column. + * The result column will have the same number of rows as the current column. * For all indices `i` where the boolColumn is `true`, the result column will have a valid value at index i. * For all other values (i.e. `false` or `null`), the result column will have nulls. - * + * * If the current column has a null at a given index `i`, and the new validity mask is `true` at index `i`, * then the row value is undefined. - * + * * @param boolColumn bool column whose value is to be used as the validity mask. * @return Deep copy of the column with replaced validity mask. - */ + */ public final ColumnVector copyWithBooleanColumnAsValidity(ColumnView boolColumn) { return new ColumnVector(copyWithBooleanColumnAsValidity(getNativeView(), boolColumn.getNativeView())); } @@ -2345,88 +2345,128 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) { } /** - * Returns a list of columns by splitting each string using the specified delimiter. - * The number of rows in the output columns will be the same as the input column. - * Null entries are added for a row where split results have been exhausted. - * Null string entries return corresponding null output columns. - * @param delimiter UTF-8 encoded string identifying the split points in each string. - * An empty string indicates split on whitespace. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. - * @return New table of strings columns. + * Returns a list of columns by splitting each string using the specified pattern. The number of + * rows in the output columns will be the same as the input column. Null entries are added for a + * row where split results have been exhausted. Null input entries result in all nulls in the + * corresponding rows of the output columns. + * + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + * @return list of strings columns as a table. */ - public final Table stringSplit(Scalar delimiter, int maxSplit) { + public final Table stringSplit(String pattern, int limit, boolean splitByRegex) { assert type.equals(DType.STRING) : "column type must be a String"; - assert delimiter != null : "delimiter may not be null"; - assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar"; - return new Table(stringSplit(this.getNativeView(), delimiter.getScalarHandle(), maxSplit)); + assert pattern != null : "pattern is null"; + assert pattern.length() > 0 : "empty pattern is not supported"; + assert limit != 0 && limit != 1 : "split limit == 0 and limit == 1 are not supported"; + return new Table(stringSplit(this.getNativeView(), pattern, limit, splitByRegex)); } - + /** - * Returns a list of columns by splitting each string using the specified delimiter. - * The number of rows in the output columns will be the same as the input column. - * Null entries are added for a row where split results have been exhausted. 
- * Null string entries return corresponding null output columns. - * @param delimiter UTF-8 encoded string identifying the split points in each string. - * An empty string indicates split on whitespace. - * @return New table of strings columns. + * Returns a list of columns by splitting each string using the specified pattern. The number of + * rows in the output columns will be the same as the input column. Null entries are added for a + * row where split results have been exhausted. Null input entries result in all nulls in the + * corresponding rows of the output columns. + * + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + * @return list of strings columns as a table. */ - public final Table stringSplit(Scalar delimiter) { - return stringSplit(delimiter, -1); + public final Table stringSplit(String pattern, boolean splitByRegex) { + return stringSplit(pattern, -1, splitByRegex); } /** - * Returns a list of columns by splitting each string using whitespace as the delimiter. - * The number of rows in the output columns will be the same as the input column. - * Null entries are added for a row where split results have been exhausted. - * Null string entries return corresponding null output columns. - * @return New table of strings columns. + * Returns a list of columns by splitting each string using the specified string literal + * delimiter. The number of rows in the output columns will be the same as the input column. + * Null entries are added for a row where split results have been exhausted. Null input entries + * result in all nulls in the corresponding rows of the output columns. + * + * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @return list of strings columns as a table. */ - public final Table stringSplit() { - try (Scalar emptyString = Scalar.fromString("")) { - return stringSplit(emptyString, -1); - } + public final Table stringSplit(String delimiter, int limit) { + return stringSplit(delimiter, limit, false); } /** - * Returns a column of lists of strings by splitting each string using whitespace as the delimiter. + * Returns a list of columns by splitting each string using the specified string literal + * delimiter. The number of rows in the output columns will be the same as the input column. + * Null entries are added for a row where split results have been exhausted. Null input entries + * result in all nulls in the corresponding rows of the output columns. + * + * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string. + * @return list of strings columns as a table. */ - public final ColumnVector stringSplitRecord() { - return stringSplitRecord(-1); + public final Table stringSplit(String delimiter) { + return stringSplit(delimiter, -1, false); } /** - * Returns a column of lists of strings by splitting each string using whitespace as the delimiter. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. 
+ * Returns a column that are lists of strings in which each list is made by splitting the + * corresponding input string using the specified pattern. + * + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + * @return a LIST column of string elements. */ - public final ColumnVector stringSplitRecord(int maxSplit) { - try (Scalar emptyString = Scalar.fromString("")) { - return stringSplitRecord(emptyString, maxSplit); - } + public final ColumnVector stringSplitRecord(String pattern, int limit, boolean splitByRegex) { + assert type.equals(DType.STRING) : "column type must be String"; + assert pattern != null : "pattern is null"; + assert pattern.length() > 0 : "empty pattern is not supported"; + assert limit != 0 && limit != 1 : "split limit == 0 and limit == 1 are not supported"; + return new ColumnVector( + stringSplitRecord(this.getNativeView(), pattern, limit, splitByRegex)); + } + + /** + * Returns a column that are lists of strings in which each list is made by splitting the + * corresponding input string using the specified pattern. + * + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + * @return a LIST column of string elements. + */ + public final ColumnVector stringSplitRecord(String pattern, boolean splitByRegex) { + return stringSplitRecord(pattern, -1, splitByRegex); } /** - * Returns a column of lists of strings by splitting each string using the specified delimiter. - * @param delimiter UTF-8 encoded string identifying the split points in each string. - * An empty string indicates split on whitespace. + * Returns a column that are lists of strings in which each list is made by splitting the + * corresponding input string using the specified string literal delimiter. + * + * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @return a LIST column of string elements. */ - public final ColumnVector stringSplitRecord(Scalar delimiter) { - return stringSplitRecord(delimiter, -1); + public final ColumnVector stringSplitRecord(String delimiter, int limit) { + return stringSplitRecord(delimiter, limit, false); } /** - * Returns a column that is a list of strings. Each string list is made by splitting each input - * string using the specified delimiter. - * @param delimiter UTF-8 encoded string identifying the split points in each string. - * An empty string indicates split on whitespace. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. - * @return New table of strings columns. 
+ * Returns a column that are lists of strings in which each list is made by splitting the + * corresponding input string using the specified string literal delimiter. + * + * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string. + * @return a LIST column of string elements. */ - public final ColumnVector stringSplitRecord(Scalar delimiter, int maxSplit) { - assert type.equals(DType.STRING) : "column type must be a String"; - assert delimiter != null : "delimiter may not be null"; - assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar"; - return new ColumnVector( - stringSplitRecord(this.getNativeView(), delimiter.getScalarHandle(), maxSplit)); + public final ColumnVector stringSplitRecord(String delimiter) { + return stringSplitRecord(delimiter, -1, false); } /** @@ -3248,7 +3288,7 @@ public enum FindOptions {FIND_FIRST, FIND_LAST}; * Create a column of int32 indices, indicating the position of the scalar search key * in each list row. * All indices are 0-based. If a search key is not found, the index is set to -1. - * The index is set to null if one of the following is true: + * The index is set to null if one of the following is true: * 1. The search key is null. * 2. The list row is null. * @param key The scalar search key @@ -3265,7 +3305,7 @@ public final ColumnVector listIndexOf(Scalar key, FindOptions findOption) { * Create a column of int32 indices, indicating the position of each row in the * search key column in the corresponding row of the lists column. * All indices are 0-based. If a search key is not found, the index is set to -1. - * The index is set to null if one of the following is true: + * The index is set to null if one of the following is true: * 1. The search key row is null. * 2. The list row is null. * @param keys ColumnView of search keys. @@ -3531,15 +3571,36 @@ private static native long repeatStringsWithColumnRepeatTimes(long stringsHandle private static native long substringLocate(long columnView, long substringScalar, int start, int end); /** - * Native method which returns array of columns by splitting each string using the specified - * delimiter. - * @param columnView native handle of the cudf::column_view being operated on. - * @param delimiter UTF-8 encoded string identifying the split points in each string. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. + * Returns a list of columns by splitting each string using the specified pattern. The number of + * rows in the output columns will be the same as the input column. Null entries are added for a + * row where split results have been exhausted. Null input entries result in all nulls in the + * corresponding rows of the output columns. + * + * @param nativeHandle native handle of the input strings column that being operated on. + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. 
*/ - private static native long[] stringSplit(long columnView, long delimiter, int maxSplit); + private static native long[] stringSplit(long nativeHandle, String pattern, int limit, + boolean splitByRegex); - private static native long stringSplitRecord(long nativeView, long scalarHandle, int maxSplit); + /** + * Returns a column that are lists of strings in which each list is made by splitting the + * corresponding input string using the specified string literal delimiter. + * + * @param nativeHandle native handle of the input strings column that being operated on. + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + */ + private static native long stringSplitRecord(long nativeHandle, String pattern, int limit, + boolean splitByRegex); /** * Native method to calculate substring from a given string column. 0 indexing. @@ -3714,7 +3775,7 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat /** * Native method to search list rows for null elements. * @param nativeView the column view handle of the list - * @return column handle of the resultant boolean column + * @return column handle of the resultant boolean column */ private static native long listContainsNulls(long nativeView); @@ -3896,20 +3957,20 @@ private static native long bitwiseMergeAndSetValidity(long baseHandle, long[] vi /** * Native method to deep copy a column while replacing the null mask. The null mask is the * device_vector equivalent of the boolean column given as argument. - * + * * The boolColumn must have the same number of rows as the exemplar column. * The result column will have the same number of rows as the exemplar. * For all indices `i` where the boolean column is `true`, the result column will have a valid value at index i. * For all other values (i.e. `false` or `null`), the result column will have nulls. - * + * * If the exemplar column has a null at a given index `i`, and the new validity mask is `true` at index `i`, * then the resultant row value is undefined. - * + * * @param exemplarViewHandle column view of the column that is deep copied. * @param boolColumnViewHandle bool column whose value is to be used as the null mask. * @return Deep copy of the column with replaced null mask. 
- */ - private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle, + */ + private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle, long boolColumnViewHandle) throws CudfException; //////// diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index eec4a78a457..548844aa0d3 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -561,34 +562,78 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env, } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass, - jlong column_view, - jlong delimiter_ptr, - jint max_split) { - JNI_NULL_CHECK(env, column_view, "column is null", 0); - JNI_NULL_CHECK(env, delimiter_ptr, "string scalar delimiter is null", 0); + jlong input_handle, + jstring pattern_obj, + jint limit, + jboolean split_by_regex) { + JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); + + if (limit == 0 || limit == 1) { + // Cannot achieve the results of splitting with limit == 0 or limit == 1. + // This is because cudf operates on a different parameter (`max_split`) which is converted from + // limit. When limit == 0 or limit == 1, max_split will be non-positive and will result in an + // unlimited split. + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", + "limit == 0 and limit == 1 are not supported", 0); + } + try { cudf::jni::auto_set_device(env); - cudf::strings_column_view const scv{*reinterpret_cast(column_view)}; - auto delimiter = reinterpret_cast(delimiter_ptr); + auto const input = reinterpret_cast(input_handle); + auto const strs_input = cudf::strings_column_view{*input}; - return cudf::jni::convert_table_for_return(env, - cudf::strings::split(scv, *delimiter, max_split)); + auto const pattern_jstr = cudf::jni::native_jstring(env, pattern_obj); + if (pattern_jstr.is_empty()) { + // Java's split API produces different behaviors than cudf when splitting with empty + // pattern. + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0); + } + + auto const pattern = std::string(pattern_jstr.get(), pattern_jstr.size_bytes()); + auto const max_split = limit > 1 ? limit - 1 : limit; + auto result = split_by_regex ? + cudf::strings::split_re(strs_input, pattern, max_split) : + cudf::strings::split(strs_input, cudf::string_scalar{pattern}, max_split); + return cudf::jni::convert_table_for_return(env, std::move(result)); } CATCH_STD(env, 0); } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv *env, jclass, - jlong column_view, - jlong delimiter, - jint max_split) { - JNI_NULL_CHECK(env, column_view, "column is null", 0); - JNI_NULL_CHECK(env, delimiter, "string scalar delimiter is null", 0); + jlong input_handle, + jstring pattern_obj, + jint limit, + jboolean split_by_regex) { + JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); + + if (limit == 0 || limit == 1) { + // Cannot achieve the results of splitting with limit == 0 or limit == 1. + // This is because cudf operates on a different parameter (`max_split`) which is converted from + // limit. When limit == 0 or limit == 1, max_split will be non-positive and will result in an + // unlimited split. 
+ JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", + "limit == 0 and limit == 1 are not supported", 0); + } + try { cudf::jni::auto_set_device(env); - cudf::column_view *cv = reinterpret_cast(column_view); - cudf::strings_column_view scv(*cv); - cudf::string_scalar *ss_scalar = reinterpret_cast(delimiter); - return release_as_jlong(cudf::strings::split_record(scv, *ss_scalar, max_split)); + auto const input = reinterpret_cast(input_handle); + auto const strs_input = cudf::strings_column_view{*input}; + + auto const pattern_jstr = cudf::jni::native_jstring(env, pattern_obj); + if (pattern_jstr.is_empty()) { + // Java's split API produces different behaviors than cudf when splitting with empty + // pattern. + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0); + } + + auto const pattern = std::string(pattern_jstr.get(), pattern_jstr.size_bytes()); + auto const max_split = limit > 1 ? limit - 1 : limit; + auto result = + split_by_regex ? + cudf::strings::split_record_re(strs_input, pattern, max_split) : + cudf::strings::split_record(strs_input, cudf::string_scalar{pattern}, max_split); + return release_as_jlong(result); } CATCH_STD(env, 0); } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 9c00cdbc084..b759c746735 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4364,10 +4364,10 @@ void testExtractListElements() { ColumnVector expected = ColumnVector.fromStrings("Héllo", "thésé", null, - null, + "", "ARé", "test"); - ColumnVector tmp = v.stringSplitRecord(); + ColumnVector tmp = v.stringSplitRecord(" "); ColumnVector result = tmp.extractListElement(0)) { assertColumnsAreEqual(expected, result); } @@ -4761,28 +4761,12 @@ void testListSortRowsWithStringChild() { } } - @Test - void testStringSplitRecord() { - try (ColumnVector v = ColumnVector.fromStrings("Héllo there", "thésé", "null", "", "ARé some", "test strings"); - ColumnVector expected = ColumnVector.fromLists( - new HostColumnVector.ListType(true, - new HostColumnVector.BasicType(true, DType.STRING)), - Arrays.asList("Héllo", "there"), - Arrays.asList("thésé"), - Arrays.asList("null"), - Arrays.asList(""), - Arrays.asList("ARé", "some"), - Arrays.asList("test", "strings")); - Scalar pattern = Scalar.fromString(" "); - ColumnVector result = v.stringSplitRecord(pattern, -1)) { - assertColumnsAreEqual(expected, result); - } - } - @Test void testStringSplit() { - try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", "ARé some things", "test strings here"); - Table expectedSplitOnce = new Table.TestBuilder() + String pattern = " "; + try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", + "ARé some things", "test strings here"); + Table expectedSplitLimit2 = new Table.TestBuilder() .column("Héllo", "thésé", null, "", "ARé", "test") .column("there all", null, null, null, "some things", "strings here") .build(); @@ -4791,41 +4775,92 @@ void testStringSplit() { .column("there", null, null, null, "some", "strings") .column("all", null, null, null, "things", "here") .build(); - Scalar pattern = Scalar.fromString(" "); - Table resultSplitOnce = v.stringSplit(pattern, 1); + Table resultSplitLimit2 = v.stringSplit(pattern, 2); Table resultSplitAll = v.stringSplit(pattern)) { - assertTablesAreEqual(expectedSplitOnce, resultSplitOnce); + 
assertTablesAreEqual(expectedSplitLimit2, resultSplitLimit2); assertTablesAreEqual(expectedSplitAll, resultSplitAll); } } @Test - void teststringSplitWhiteSpace() { - try (ColumnVector v = ColumnVector.fromStrings("Héllo thesé", null, "are\tsome", "tést\nString", " "); - Table expected = new Table.TestBuilder().column("Héllo", null, "are", "tést", null) - .column("thesé", null, "some", "String", null) - .build(); - Table result = v.stringSplit()) { - assertTablesAreEqual(expected, result); + void testStringSplitByRegularExpression() { + String pattern = "[_ ]"; + try (ColumnVector v = ColumnVector.fromStrings("Héllo_there all", "thésé", null, "", + "ARé some_things", "test_strings_here"); + Table expectedSplitLimit2 = new Table.TestBuilder() + .column("Héllo", "thésé", null, "", "ARé", "test") + .column("there all", null, null, null, "some_things", "strings_here") + .build(); + Table expectedSplitAll = new Table.TestBuilder() + .column("Héllo", "thésé", null, "", "ARé", "test") + .column("there", null, null, null, "some", "strings") + .column("all", null, null, null, "things", "here") + .build(); + Table resultSplitLimit2 = v.stringSplit(pattern, 2, true); + Table resultSplitAll = v.stringSplit(pattern, true)) { + assertTablesAreEqual(expectedSplitLimit2, resultSplitLimit2); + assertTablesAreEqual(expectedSplitAll, resultSplitAll); } } @Test - void teststringSplitThrowsException() { - assertThrows(CudfException.class, () -> { - try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings"); - Scalar delimiter = Scalar.fromString(null); - Table result = cv.stringSplit(delimiter)) {} - }); - assertThrows(AssertionError.class, () -> { - try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings"); - Scalar delimiter = Scalar.fromInt(1); - Table result = cv.stringSplit(delimiter)) {} - }); - assertThrows(AssertionError.class, () -> { - try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings"); - Table result = cv.stringSplit(null)) {} - }); + void testStringSplitRecord() { + String pattern = " "; + try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", + "ARé some things", "test strings here"); + ColumnVector expectedSplitLimit2 = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there all"), + Arrays.asList("thésé"), + null, + Arrays.asList(""), + Arrays.asList("ARé", "some things"), + Arrays.asList("test", "strings here")); + ColumnVector expectedSplitAll = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there", "all"), + Arrays.asList("thésé"), + null, + Arrays.asList(""), + Arrays.asList("ARé", "some", "things"), + Arrays.asList("test", "strings", "here")); + ColumnVector resultSplitLimit2 = v.stringSplitRecord(pattern, 2); + ColumnVector resultSplitAll = v.stringSplitRecord(pattern)) { + assertColumnsAreEqual(expectedSplitLimit2, resultSplitLimit2); + assertColumnsAreEqual(expectedSplitAll, resultSplitAll); + } + } + + @Test + void testStringSplitRecordByRegularExpression() { + String pattern = "[_ ]"; + try (ColumnVector v = ColumnVector.fromStrings("Héllo_there all", "thésé", null, "", + "ARé some_things", "test_strings_here"); + ColumnVector expectedSplitLimit2 = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, 
DType.STRING)), + Arrays.asList("Héllo", "there all"), + Arrays.asList("thésé"), + null, + Arrays.asList(""), + Arrays.asList("ARé", "some_things"), + Arrays.asList("test", "strings_here")); + ColumnVector expectedSplitAll = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there", "all"), + Arrays.asList("thésé"), + null, + Arrays.asList(""), + Arrays.asList("ARé", "some", "things"), + Arrays.asList("test", "strings", "here")); + ColumnVector resultSplitLimit2 = v.stringSplitRecord(pattern, 2, true); + ColumnVector resultSplitAll = v.stringSplitRecord(pattern, true)) { + assertColumnsAreEqual(expectedSplitLimit2, resultSplitLimit2); + assertColumnsAreEqual(expectedSplitAll, resultSplitAll); + } } @Test From c21cca944fd8d65aed0068538859141bb21f0049 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 14 Feb 2022 09:35:58 -0700 Subject: [PATCH 21/47] Fix groupby reductions that perform operations on source type instead of target type (#10250) In groupby reductions, some operations such as SUM on integers have the output type different from the source type. For example, target type is int64 while the source type is int32. This is to prevent overflow of the computation result. This fixes a bug in groupby reductions that perform computation using the data in source type instead of target type. Closes #10246. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Conor Hoekstra (https://github.com/codereport) URL: https://github.com/rapidsai/cudf/pull/10250 --- .../sort/group_single_pass_reduction_util.cuh | 39 +++++++++++++------ cpp/tests/groupby/sum_tests.cpp | 23 ++++++++++- 2 files changed, 49 insertions(+), 13 deletions(-) diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index dde4e00eb4a..8e1463f7964 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -74,12 +74,16 @@ struct element_arg_minmax_fn { /** * @brief Value accessor for column which supports dictionary column too. * + * This is similar to `value_accessor` in `column_device_view.cuh` but with support of dictionary + * type. + * * @tparam T Type of the underlying column. For dictionary column, type of the key column. */ template struct value_accessor { column_device_view const col; bool const is_dict; + value_accessor(column_device_view const& col) : col(col), is_dict(cudf::is_dictionary(col.type())) { } @@ -93,6 +97,7 @@ struct value_accessor { return col.element(i); } } + __device__ auto operator()(size_type i) const { return value(i); } }; @@ -100,20 +105,28 @@ struct value_accessor { * @brief Null replaced value accessor for column which supports dictionary column too. * For null value, returns null `init` value * - * @tparam T Type of the underlying column. For dictionary column, type of the key column. + * @tparam SourceType Type of the underlying column. For dictionary column, type of the key column. + * @tparam TargetType Type that is used for computation. 
 */
-template
-struct null_replaced_value_accessor : value_accessor {
-  using super_t = value_accessor;
+template
+struct null_replaced_value_accessor : value_accessor {
+  using super_t = value_accessor;
+
+  TargetType const init;
   bool const has_nulls;
-  T const init;
-  null_replaced_value_accessor(column_device_view const& col, T const& init, bool const has_nulls)
+
+  null_replaced_value_accessor(column_device_view const& col,
+                               TargetType const& init,
+                               bool const has_nulls)
     : super_t(col), init(init), has_nulls(has_nulls)
   {
   }
-  __device__ T operator()(size_type i) const
+
+  __device__ TargetType operator()(size_type i) const
   {
-    return has_nulls && super_t::col.is_null_nocheck(i) ? init : super_t::value(i);
+    return has_nulls && super_t::col.is_null_nocheck(i)
+             ? init
+             : static_cast(super_t::value(i));
   }
 };
@@ -168,7 +181,7 @@ struct group_reduction_functor;
+    using SourceDType = device_storage_type_t;
     using ResultType  = cudf::detail::target_type_t;
     using ResultDType = device_storage_type_t;
@@ -203,9 +216,11 @@ struct group_reduction_functor;
-    auto init = OpType::template identity();
+    auto init = OpType::template identity();
     auto inp_values = cudf::detail::make_counting_transform_iterator(
-      0, null_replaced_value_accessor{*d_values_ptr, init, values.has_nulls()});
+      0,
+      null_replaced_value_accessor{
+        *d_values_ptr, init, values.has_nulls()});

     do_reduction(inp_values, result_begin, OpType{});
   }
diff --git a/cpp/tests/groupby/sum_tests.cpp b/cpp/tests/groupby/sum_tests.cpp
index 5947e309bec..be7da4a784c 100644
--- a/cpp/tests/groupby/sum_tests.cpp
+++ b/cpp/tests/groupby/sum_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -156,6 +156,27 @@ TYPED_TEST(groupby_sum_test, dictionary)
     force_use_sort_impl::YES);
 }

+struct overflow_test : public cudf::test::BaseFixture {
+};
+TEST_F(overflow_test, overflow_integer)
+{
+  using int32_col = fixed_width_column_wrapper;
+  using int64_col = fixed_width_column_wrapper;
+
+  auto const keys        = int32_col{0, 0};
+  auto const vals        = int32_col{-2147483648, -2147483648};
+  auto const expect_keys = int32_col{0};
+  auto const expect_vals = int64_col{-4294967296L};
+
+  auto test_sum = [&](auto const use_sort) {
+    auto agg = make_sum_aggregation();
+    test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), use_sort);
+  };
+
+  test_sum(force_use_sort_impl::NO);
+  test_sum(force_use_sort_impl::YES);
+}
+
 template
 struct FixedPointTestAllReps : public cudf::test::BaseFixture {
 };

From c2846fb8580a2a6ddd29808bea470623b71feb19 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Mon, 14 Feb 2022 12:32:12 -0800
Subject: [PATCH 22/47] Fix incorrect slicing of GDS read/write calls (#10274)

The issue happens when the read/write size is a multiple of the maximum slice size. In this case, the size of the last slice is computed as `0` instead of `max_slice_size`:
`(t == n_slices - 1) ? size % max_slice_bytes : max_slice_bytes`

This PR reimplements this part of the code and adds unit tests.
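
For illustration, here is a minimal Python sketch of the slicing arithmetic described above (a hypothetical stand-in, not the C++ implementation; the names `make_slices_old` and `make_slices_fixed` are invented for this example). It shows why the modulo-based formula yields a zero-sized final slice exactly when `size` is a multiple of the slice size, and how clamping each slice to the bytes remaining avoids it:

```python
def make_slices_old(size, max_slice):
    # Old logic: the final slice is sized with `size % max_slice`,
    # which is 0 whenever size is an exact multiple of max_slice.
    n_slices = (size + max_slice - 1) // max_slice
    return [(t * max_slice,
             size % max_slice if t == n_slices - 1 else max_slice)
            for t in range(n_slices)]


def make_slices_fixed(size, max_slice):
    # Fixed logic: clamp every slice to the number of bytes remaining.
    n_slices = (size + max_slice - 1) // max_slice
    return [(offset, min(size - offset, max_slice))
            for offset in range(0, n_slices * max_slice, max_slice)]


assert make_slices_old(8, 4)[-1] == (4, 0)     # bug: empty final slice
assert make_slices_fixed(8, 4)[-1] == (4, 4)   # final slice gets its full size
assert make_slices_fixed(10, 4)[-1] == (8, 2)  # non-multiples still work
```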
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Devavret Makkar (https://github.com/devavret) URL: https://github.com/rapidsai/cudf/pull/10274 --- cpp/src/io/utilities/file_io_utilities.cpp | 36 ++++++++++------- cpp/src/io/utilities/file_io_utilities.hpp | 17 +++++++- cpp/tests/CMakeLists.txt | 1 + cpp/tests/io/file_io_test.cpp | 46 ++++++++++++++++++++++ 4 files changed, 85 insertions(+), 15 deletions(-) create mode 100644 cpp/tests/io/file_io_test.cpp diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index dabe992d959..e2893a2e881 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -194,20 +194,13 @@ template > make_sliced_tasks( F function, DataT* ptr, size_t offset, size_t size, cudf::detail::thread_pool& pool) { + constexpr size_t default_max_slice_size = 4 * 1024 * 1024; + static auto const max_slice_size = getenv_or("LIBCUDF_CUFILE_SLICE_SIZE", default_max_slice_size); + auto const slices = make_file_io_slices(size, max_slice_size); std::vector> slice_tasks; - constexpr size_t default_max_slice_bytes = 4 * 1024 * 1024; - static auto const max_slice_bytes = - getenv_or("LIBCUDF_CUFILE_SLICE_SIZE", default_max_slice_bytes); - size_t const n_slices = util::div_rounding_up_safe(size, max_slice_bytes); - size_t slice_offset = 0; - for (size_t t = 0; t < n_slices; ++t) { - DataT* ptr_slice = ptr + slice_offset; - - size_t const slice_size = (t == n_slices - 1) ? size % max_slice_bytes : max_slice_bytes; - slice_tasks.push_back(pool.submit(function, ptr_slice, slice_size, offset + slice_offset)); - - slice_offset += slice_size; - } + std::transform(slices.cbegin(), slices.cend(), std::back_inserter(slice_tasks), [&](auto& slice) { + return pool.submit(function, ptr + slice.offset, slice.size, offset + slice.offset); + }); return slice_tasks; } @@ -318,6 +311,21 @@ std::unique_ptr make_cufile_output(std::string const& filepa return nullptr; } +std::vector make_file_io_slices(size_t size, size_t max_slice_size) +{ + max_slice_size = std::max(1024ul, max_slice_size); + auto const n_slices = util::div_rounding_up_safe(size, max_slice_size); + std::vector slices; + slices.reserve(n_slices); + std::generate_n(std::back_inserter(slices), n_slices, [&, idx = 0]() mutable { + auto const slice_offset = idx++ * max_slice_size; + auto const slice_size = std::min(size - slice_offset, max_slice_size); + return file_io_slice{slice_offset, slice_size}; + }); + + return slices; +} + } // namespace detail } // namespace io } // namespace cudf diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index fcee4e43a20..be3ecc49ab0 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -291,6 +291,21 @@ std::unique_ptr make_cufile_input(std::string const& filepath */ std::unique_ptr make_cufile_output(std::string const& filepath); +/** + * @brief Byte range to be read/written in a single operation. + */ +struct file_io_slice { + size_t offset; + size_t size; +}; + +/** + * @brief Split the total number of bytes to read/write into slices to enable parallel IO. + * + * If `max_slice_size` is below 1024, 1024 will be used instead to prevent potential misuse. + */ +std::vector make_file_io_slices(size_t size, size_t max_slice_size); + } // namespace detail } // namespace io } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 913761ecd03..27dd472b3f5 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -199,6 +199,7 @@ ConfigureTest( ConfigureTest(DECOMPRESSION_TEST io/comp/decomp_test.cpp) ConfigureTest(CSV_TEST io/csv_test.cpp) +ConfigureTest(FILE_IO_TEST io/file_io_test.cpp) ConfigureTest(ORC_TEST io/orc_test.cpp) ConfigureTest(PARQUET_TEST io/parquet_test.cpp) ConfigureTest(JSON_TEST io/json_test.cpp) diff --git a/cpp/tests/io/file_io_test.cpp b/cpp/tests/io/file_io_test.cpp new file mode 100644 index 00000000000..b546239fdca --- /dev/null +++ b/cpp/tests/io/file_io_test.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include + +// Base test fixture for tests +struct CuFileIOTest : public cudf::test::BaseFixture { +}; + +TEST_F(CuFileIOTest, SliceSize) +{ + std::vector> test_cases{ + {1 << 20, 1 << 18}, {1 << 18, 1 << 20}, {1 << 20, 3333}, {0, 1 << 18}, {0, 0}, {1 << 20, 0}}; + for (auto const& test_case : test_cases) { + auto const slices = cudf::io::detail::make_file_io_slices(test_case.first, test_case.second); + if (slices.empty()) { + ASSERT_EQ(test_case.first, 0); + } else { + ASSERT_EQ(slices.front().offset, 0); + ASSERT_EQ(slices.back().offset + slices.back().size, test_case.first); + for (auto i = 1u; i < slices.size(); ++i) { + ASSERT_EQ(slices[i].offset, slices[i - 1].offset + slices[i - 1].size); + } + } + } +} + +CUDF_TEST_PROGRAM_MAIN() From 374b38745932616dbeca2e84e1e86d63c228a179 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Mon, 14 Feb 2022 15:59:06 -0500 Subject: [PATCH 23/47] Replace custom `cached_property` implementation with functools (#10272) Replaces our custom `cached_property` with that provided by `functools` since Python 3.8. This uncovered a couple of typing bugs that previously eluded us. 
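
As background for the swap, `functools.cached_property` (available since Python 3.8) provides the same semantics the removed helper did: the value is computed on first access, cached in the instance's `__dict__` under the attribute name, and re-evaluated after `del`. A toy class, assumed here purely for illustration (not cudf code), demonstrates this:

```python
from functools import cached_property


class Frame:
    def __init__(self, values):
        self._values = values

    @cached_property
    def total(self):
        # Runs only on the first access; the result then shadows the
        # descriptor via the instance __dict__.
        print("computing")
        return sum(self._values)


f = Frame([1, 2, 3])
assert f.total == 6  # prints "computing"
assert f.total == 6  # served from the cache, no recomputation
del f.total          # forces re-evaluation, just like the old helper
assert f.total == 6  # prints "computing" again
```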
Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - https://github.com/brandon-b-miller - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10272 --- python/cudf/cudf/core/_base_index.py | 2 +- python/cudf/cudf/core/column_accessor.py | 7 +++---- python/cudf/cudf/core/frame.py | 12 ++++++------ python/cudf/cudf/core/groupby/groupby.py | 3 ++- python/cudf/cudf/core/index.py | 5 +++-- python/cudf/cudf/core/indexed_frame.py | 2 +- python/cudf/cudf/core/join/join.py | 6 +++--- python/cudf/cudf/core/multiindex.py | 9 +++------ python/cudf/cudf/utils/utils.py | 22 ---------------------- 9 files changed, 22 insertions(+), 46 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index cdb1e9fc86f..c96d940c378 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -3,6 +3,7 @@ from __future__ import annotations import pickle +from functools import cached_property from typing import Any, Set import pandas as pd @@ -31,7 +32,6 @@ is_mixed_with_object_dtype, numeric_normalize_types, ) -from cudf.utils.utils import cached_property class BaseIndex(Serializable): diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 67976ac27d4..9cb86ca1cd2 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -4,7 +4,7 @@ import itertools from collections.abc import MutableMapping -from functools import reduce +from functools import cached_property, reduce from typing import ( TYPE_CHECKING, Any, @@ -20,7 +20,6 @@ import cudf from cudf.core import column -from cudf.utils.utils import cached_property if TYPE_CHECKING: from cudf.core.column import ColumnBase @@ -360,9 +359,9 @@ def select_by_index(self, index: Any) -> ColumnAccessor: start, stop, step = index.indices(len(self._data)) keys = self.names[start:stop:step] elif pd.api.types.is_integer(index): - keys = [self.names[index]] + keys = (self.names[index],) else: - keys = (self.names[i] for i in index) + keys = tuple(self.names[i] for i in index) data = {k: self._data[k] for k in keys} return self.__class__( data, multiindex=self.multiindex, level_names=self.level_names, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 6e46c107d2e..6038bb49bfb 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -90,22 +90,22 @@ def _num_rows(self) -> int: return len(self._data.columns[0]) @property - def _column_names(self) -> List[Any]: # TODO: List[str]? - return self._data.names + def _column_names(self) -> Tuple[Any, ...]: # TODO: Tuple[str]? + return tuple(self._data.names) @property - def _index_names(self) -> List[Any]: # TODO: List[str]? + def _index_names(self) -> Optional[Tuple[Any, ...]]: # TODO: Tuple[str]? # TODO: Temporarily suppressing mypy warnings to avoid introducing bugs # by returning an empty list where one is not expected. return ( None # type: ignore if self._index is None - else self._index._data.names + else tuple(self._index._data.names) ) @property - def _columns(self) -> List[Any]: # TODO: List[Column]? - return self._data.columns + def _columns(self) -> Tuple[Any, ...]: # TODO: Tuple[Column]? 
+ return tuple(self._data.columns) def serialize(self): header = { diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index ff700144bed..4bd14a2c47b 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -4,6 +4,7 @@ import itertools import pickle import warnings +from functools import cached_property import numpy as np import pandas as pd @@ -16,7 +17,7 @@ from cudf.core.abc import Serializable from cudf.core.column.column import arange, as_column from cudf.core.multiindex import MultiIndex -from cudf.utils.utils import GetAttrGetItemMixin, cached_property +from cudf.utils.utils import GetAttrGetItemMixin # The three functions below return the quantiles [25%, 50%, 75%] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index f71f930a21c..5b60e8dbd1c 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1,10 +1,11 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. from __future__ import annotations import math import pickle import warnings +from functools import cached_property from numbers import Number from typing import ( Any, @@ -54,7 +55,7 @@ from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import find_common_type -from cudf.utils.utils import cached_property, search_range +from cudf.utils.utils import search_range T = TypeVar("T", bound="Frame") diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 10b9f2396bb..7788a5346c8 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -6,6 +6,7 @@ import operator import warnings from collections import Counter, abc +from functools import cached_property from typing import Callable, Type, TypeVar from uuid import uuid4 @@ -29,7 +30,6 @@ from cudf.core.index import Index, RangeIndex, _index_from_columns from cudf.core.multiindex import MultiIndex from cudf.core.udf.utils import _compile_or_get, _supported_cols_from_frame -from cudf.utils.utils import cached_property doc_reset_index_template = """ Reset the index of the {klass}, or a level of it. diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 39ff4718550..c7e46cf0165 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -1,7 +1,7 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from __future__ import annotations -from typing import TYPE_CHECKING, Callable, cast +from typing import TYPE_CHECKING, Any, Callable, List, cast import cudf from cudf import _lib as libcudf @@ -320,7 +320,7 @@ def _sort_result(self, result: Frame) -> Frame: # same order as given in 'on'. If the indices are used as # keys, the index will be sorted. If one index is specified, # the key columns on the other side will be used to sort. - by = [] + by: List[Any] = [] if self._using_left_index and self._using_right_index: if result._index is not None: by.extend(result._index._data.columns) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 8581b97c217..5e0cd2ca8cb 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
from __future__ import annotations

@@ -6,6 +6,7 @@
 import numbers
 import pickle
 from collections.abc import Sequence
+from functools import cached_property
 from numbers import Integral
 from typing import Any, List, MutableMapping, Optional, Tuple, Union

@@ -22,11 +23,7 @@
 from cudf.core._compat import PANDAS_GE_120
 from cudf.core.frame import Frame
 from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index
-from cudf.utils.utils import (
-    NotIterable,
-    _maybe_indices_to_slice,
-    cached_property,
-)
+from cudf.utils.utils import NotIterable, _maybe_indices_to_slice


 class MultiIndex(Frame, BaseIndex, NotIterable):
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index 8571d9ffed5..4143cbd1d66 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -144,28 +144,6 @@ def set_allocator(
 IS_NEP18_ACTIVE = _is_nep18_active()


-class cached_property:
-    """
-    Like @property, but only evaluated upon first invocation.
-    To force re-evaluation of a cached_property, simply delete
-    it with `del`.
-    """
-
-    # TODO: Can be replaced with functools.cached_property when we drop support
-    # for Python 3.7.
-
-    def __init__(self, func):
-        self.func = func
-
-    def __get__(self, instance, cls):
-        if instance is None:
-            return self
-        else:
-            value = self.func(instance)
-            object.__setattr__(instance, self.func.__name__, value)
-            return value
-
-
 class GetAttrGetItemMixin:
     """This mixin changes `__getattr__` to attempt a `__getitem__` call.

From a443dd1ae6144d8afc9b9bcd5390d9165ce3017d Mon Sep 17 00:00:00 2001
From: Michael Wang
Date: Mon, 14 Feb 2022 14:20:46 -0800
Subject: [PATCH 24/47] Convert Column Name to String Before Using Struct Column Factory (#10156)

Closes #10155

`build_struct_column` requires the field names to be strings, but dataframe column names can be any hashable type. Passing column names directly as field names in `to_struct` is thus unsafe. This PR adds a check and raises a warning if a cast to string is required.

Authors:
  - Michael Wang (https://github.com/isVoid)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Sheilah Kirui (https://github.com/skirui-source)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/10156
---
 python/cudf/cudf/core/column/column.py |  4 ++--
 python/cudf/cudf/core/dataframe.py     | 10 +++++++++-
 python/cudf/cudf/tests/test_struct.py  |  8 ++++++++
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 2535ba5ab8d..393afe4a5b9 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1602,8 +1602,8 @@ def build_struct_column(

     Parameters
     ----------
-    names : list-like
-        Field names to map to children dtypes
+    names : sequence of strings
+        Field names to map to children dtypes, must be strings.
     children : tuple

     mask: Buffer
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 371404ca477..90119ba7b17 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -5864,8 +5864,16 @@ def to_struct(self, name=None):
         -----
         Note that a copy of the columns is made.
         """
+        if not all(isinstance(name, str) for name in self._data.names):
+            warnings.warn(
+                "DataFrame contains non-string column name(s). Struct column "
+                "requires field name to be string. 
Non-string column names "
+                "will be casted to string as the field name."
+            )
+        field_names = [str(name) for name in self._data.names]
+
         col = cudf.core.column.build_struct_column(
-            names=self._data.names, children=self._data.columns, size=len(self)
+            names=field_names, children=self._data.columns, size=len(self)
         )
         return cudf.Series._from_data(
             cudf.core.column_accessor.ColumnAccessor(
diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py
index dbff626c363..167f171fa26 100644
--- a/python/cudf/cudf/tests/test_struct.py
+++ b/python/cudf/cudf/tests/test_struct.py
@@ -205,6 +205,14 @@ def test_dataframe_to_struct():
     df["a"][0] = 5
     assert_eq(got, expect)

+    # check that a non-string (but convertible to string) named column can be
+    # converted to struct
+    df = cudf.DataFrame([[1, 2], [3, 4]], columns=[(1, "b"), 0])
+    expect = cudf.Series([{"(1, 'b')": 1, "0": 2}, {"(1, 'b')": 3, "0": 4}])
+    with pytest.warns(UserWarning, match="will be casted"):
+        got = df.to_struct()
+    assert_eq(got, expect)
+

 @pytest.mark.parametrize(
     "series, slce",

From f5ae74f720eb3fe5c82d96d306587a470144dc4f Mon Sep 17 00:00:00 2001
From: Alfred Xu
Date: Tue, 15 Feb 2022 09:30:18 +0800
Subject: [PATCH 25/47] Allow Java bindings to use default decimal precisions when writing columns (#10276)

Closes #9851

This PR fixes the bug raised in #9851: currently, the Java bindings set a decimal precision for every column when building OrcWriterOptions, filling in zero for non-decimal columns, which do not carry precision information. In principle, we should only set precision for decimal columns.

It is not easy to write a test for this change, since we won't try to read non-decimal data as a decimal type in either spark-rapids or cuDF Java.

Authors:
  - Alfred Xu (https://github.com/sperlingxx)

Approvers:
  - Jason Lowe (https://github.com/jlowe)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/10276
---
 .../java/ai/rapids/cudf/ColumnWriterOptions.java |  9 ++++++---
 java/src/main/native/src/TableJni.cpp            | 13 ++++++++-----
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java
index 0e49636fae6..78b3d5d52ec 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2021, NVIDIA CORPORATION.
+ *  Copyright (c) 2021-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -40,6 +40,9 @@ private ColumnWriterOptions(AbstractStructBuilder builder) { (ColumnWriterOptions[]) builder.children.toArray(new ColumnWriterOptions[0]); } + // The sentinel value of unknown precision (default value) + public static int UNKNOWN_PRECISION = -1; + /** * Constructor used for list */ @@ -103,7 +106,7 @@ protected ColumnWriterOptions withDecimal(String name, int precision, protected ColumnWriterOptions withTimestamp(String name, boolean isInt96, boolean isNullable) { - return new ColumnWriterOptions(name, isInt96, 0, isNullable); + return new ColumnWriterOptions(name, isInt96, UNKNOWN_PRECISION, isNullable); } /** @@ -243,7 +246,7 @@ public ColumnWriterOptions(String columnName, boolean isTimestampTypeInt96, public ColumnWriterOptions(String columnName, boolean isNullable) { this.isTimestampTypeInt96 = false; - this.precision = 0; + this.precision = UNKNOWN_PRECISION; this.isNullable = isNullable; this.columnName = columnName; } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index eac76222475..1cf56da35da 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -676,9 +676,10 @@ int set_column_metadata(cudf::io::column_in_metadata &column_metadata, int write_index = 0; for (int i = 0; i < num_children; i++, write_index++) { cudf::io::column_in_metadata child; - child.set_name(col_names[read_index]) - .set_decimal_precision(precisions[read_index]) - .set_nullability(nullability[read_index]); + child.set_name(col_names[read_index]).set_nullability(nullability[read_index]); + if (precisions[read_index] > -1) { + child.set_decimal_precision(precisions[read_index]); + } if (!is_int96.is_null()) { child.set_int96_timestamps(is_int96[read_index]); } @@ -717,8 +718,10 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam for (int i = read_index, write_index = 0; i < top_level_children; i++, write_index++) { metadata.column_metadata[write_index] .set_name(cpp_names[read_index]) - .set_nullability(col_nullability[read_index]) - .set_decimal_precision(precisions[read_index]); + .set_nullability(col_nullability[read_index]); + if (precisions[read_index] > -1) { + metadata.column_metadata[write_index].set_decimal_precision(precisions[read_index]); + } if (!is_int96.is_null()) { metadata.column_metadata[write_index].set_int96_timestamps(is_int96[read_index]); } From 17b7907ee56747caf946e07870ccb5187729d57a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 14 Feb 2022 17:31:58 -0800 Subject: [PATCH 26/47] Add copyright check as pre-commit hook. (#10290) Since #10253 added copyright checks, it is helpful to have the corresponding check enabled via pre-commit so that copyright issues can be found locally before pushing. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/10290 --- .pre-commit-config.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1e1ad94ab0b..9e72c0119f3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -88,6 +88,13 @@ repos: # of dependencies, so we'll have to update this manually. additional_dependencies: - cmake-format==0.6.11 + - id: copyright-check + name: copyright-check + # This hook's use of Git tools appears to conflict with + # existing CI invocations so we don't invoke it during CI runs. 
+ stages: [commit] + entry: python ./ci/checks/copyright.py --git-modified-only + language: python default_language_version: python: python3 From 8b0737d7a4cfd3a266a5450b15df4b978fb6dc4f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 14 Feb 2022 17:32:09 -0800 Subject: [PATCH 27/47] Enable numpy ufuncs for DataFrame (#10287) This PR addresses the primary issue in #9083, enabling all numpy ufuncs for DataFrame objects. It builds on the work in #10217, generalizing that code path to support multiple columns and moving the method up to `IndexedFrame` to share the logic with `DataFrame`. The custom preprocessing of inputs before handing off to cupy that was implemented in #10217 has been replaced by reusing parts of the existing binop machinery for greater generality, which is especially important for DataFrame binops since they support a wider range of alternative operand types. The current internal refactor is intentionally minimal to leave the focus on the new ufunc features. I will make a follow-up to clean up the internal functions by adding a proper set of hooks into the binop and ufunc implementations so that we can share these implementations with Index types as well, at which point we will be able to remove the extraneous APIs discussed in https://github.com/rapidsai/cudf/issues/9083#issuecomment-1005175782. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/10287 --- python/cudf/cudf/core/dataframe.py | 33 ++-- python/cudf/cudf/core/indexed_frame.py | 148 +++++++++++++++++ python/cudf/cudf/core/series.py | 159 +++---------------- python/cudf/cudf/core/single_column_frame.py | 6 +- python/cudf/cudf/tests/test_array_ufunc.py | 101 +++++++++++- 5 files changed, 292 insertions(+), 155 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 90119ba7b17..fb15f8da8d9 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1271,14 +1271,6 @@ def memory_usage(self, index=True, deep=False): {str(k): v for k, v in super().memory_usage(index, deep).items()} ) - @annotate("DATAFRAME_ARRAY_UFUNC", color="blue", domain="cudf_python") - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - if method == "__call__" and hasattr(cudf, ufunc.__name__): - func = getattr(cudf, ufunc.__name__) - return func(self) - else: - return NotImplemented - @annotate("DATAFRAME_ARRAY_FUNCTION", color="blue", domain="cudf_python") def __array_function__(self, func, types, args, kwargs): @@ -1864,8 +1856,7 @@ def _get_columns_by_label(self, labels, downcast=False): ) return out - @annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python") - def _binaryop( + def _prep_for_binop( self, other: Any, fn: str, @@ -1885,6 +1876,7 @@ def _binaryop( # implementation assumes that binary operations between a column and # NULL are always commutative, even for binops (like subtraction) that # are normally anticommutative. + # TODO: We probably should support pandas DataFrame/Series objects. if isinstance(rhs, Sequence): # TODO: Consider validating sequence length (pandas does). 
operands = { @@ -1948,11 +1940,30 @@ def _binaryop( right = right_dict[col] operands[col] = (left, right, reflect, fill_value) else: + return NotImplemented, None + + return operands, lhs._index + + @annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python") + def _binaryop( + self, + other: Any, + fn: str, + fill_value: Any = None, + reflect: bool = False, + can_reindex: bool = False, + *args, + **kwargs, + ): + operands, out_index = self._prep_for_binop( + other, fn, fill_value, reflect, can_reindex + ) + if operands is NotImplemented: return NotImplemented return self._from_data( ColumnAccessor(type(self)._colwise_binop(operands, fn)), - index=lhs._index, + index=out_index, ) @annotate("DATAFRAME_UPDATE", color="blue", domain="cudf_python") diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 7788a5346c8..e1ff3984948 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1697,6 +1697,154 @@ def last(self, offset): slice_func=lambda i: self.iloc[i:], ) + # For more detail on this function and how it should work, see + # https://numpy.org/doc/stable/reference/ufuncs.html + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # We don't currently support reduction, accumulation, etc. We also + # don't support any special kwargs or higher arity ufuncs than binary. + if method != "__call__" or kwargs or ufunc.nin > 2: + return NotImplemented + + # Binary operations + binary_operations = { + # Arithmetic binary operations. + "add": "add", + "subtract": "sub", + "multiply": "mul", + "matmul": "matmul", + "divide": "truediv", + "true_divide": "truediv", + "floor_divide": "floordiv", + "power": "pow", + "float_power": "pow", + "remainder": "mod", + "mod": "mod", + "fmod": "mod", + # Bitwise binary operations. + "bitwise_and": "and", + "bitwise_or": "or", + "bitwise_xor": "xor", + # Comparison binary operators + "greater": "gt", + "greater_equal": "ge", + "less": "lt", + "less_equal": "le", + "not_equal": "ne", + "equal": "eq", + } + + # First look for methods of the class. + fname = ufunc.__name__ + if fname in binary_operations: + reflect = self is not inputs[0] + other = inputs[0] if reflect else inputs[1] + + # These operators need to be mapped to their inverses when + # performing a reflected operation because no reflected version of + # the operators themselves exist. + ops_without_reflection = { + "gt": "lt", + "ge": "le", + "lt": "gt", + "le": "ge", + # ne and eq are symmetric, so they are their own inverse op + "ne": "ne", + "eq": "eq", + } + + op = binary_operations[fname] + if reflect and op in ops_without_reflection: + op = ops_without_reflection[op] + reflect = False + op = f"__{'r' if reflect else ''}{op}__" + + # pandas bitwise operations return bools if indexes are misaligned. + if ( + "bitwise" in fname + and isinstance(other, IndexedFrame) + and not self.index.equals(other.index) + ): + return getattr(self, op)(other).astype(bool) + # Float_power returns float irrespective of the input type. + if fname == "float_power": + return getattr(self, op)(other).astype(float) + return getattr(self, op)(other) + + # Special handling for unary operations. 
+ if fname == "negative": + return self * -1 + if fname == "positive": + return self.copy(deep=True) + if fname == "invert": + return ~self + if fname == "absolute": + return self.abs() + if fname == "fabs": + return self.abs().astype(np.float64) + + # Note: There are some operations that may be supported by libcudf but + # are not supported by pandas APIs. In particular, libcudf binary + # operations support logical and/or operations, but those operations + # are not defined on pd.Series/DataFrame. For now those operations will + # dispatch to cupy, but if ufuncs are ever a bottleneck we could add + # special handling to dispatch those (or any other) functions that we + # could implement without cupy. + + # Attempt to dispatch all other functions to cupy. + cupy_func = getattr(cp, fname) + if cupy_func: + # Indices must be aligned before converting to arrays. + if ufunc.nin == 2: + other = inputs[self is inputs[0]] + inputs, index = self._prep_for_binop(other, fname) + else: + inputs = { + name: (col, None, False, None) + for name, col in self._data.items() + } + index = self._index + + mask = None + data = [{} for _ in range(ufunc.nout)] + for name, (left, right, _, _) in inputs.items(): + cupy_inputs = [] + # TODO: I'm jumping through multiple hoops to get the unary + # behavior to match up with the binary. I should see if there + # are better patterns to employ here. + for inp in (left, right) if ufunc.nin == 2 else (left,): + if ( + isinstance(inp, cudf.core.column.ColumnBase) + and inp.has_nulls() + ): + new_mask = cudf.core.column.as_column(inp.nullmask) + + # TODO: This is a hackish way to perform a bitwise and + # of bitmasks. Once we expose + # cudf::detail::bitwise_and, then we can use that + # instead. + mask = new_mask if mask is None else (mask & new_mask) + + # Arbitrarily fill with zeros. For ufuncs, we assume + # that the end result propagates nulls via a bitwise + # and, so these elements are irrelevant. + inp = inp.fillna(0) + cupy_inputs.append(cp.asarray(inp)) + + cp_output = cupy_func(*cupy_inputs, **kwargs) + if ufunc.nout == 1: + cp_output = (cp_output,) + for i, out in enumerate(cp_output): + data[i][name] = cudf.core.column.as_column(out).set_mask( + mask + ) + + out = tuple( + self.__class__._from_data(out, index=index) for out in data + ) + return out[0] if ufunc.nout == 1 else out + + return NotImplemented + def _check_duplicate_level_names(specified, level_names): """Raise if any of `specified` has duplicates in `level_names`.""" diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 90ebeba5087..3aef4447a28 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -7,7 +7,6 @@ import pickle import warnings from collections import abc as abc -from itertools import repeat from numbers import Number from shutil import get_terminal_size from typing import Any, MutableMapping, Optional, Set, Union @@ -959,141 +958,6 @@ def to_frame(self, name=None): def memory_usage(self, index=True, deep=False): return sum(super().memory_usage(index, deep).values()) - # For more detail on this function and how it should work, see - # https://numpy.org/doc/stable/reference/ufuncs.html - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - # We don't currently support reduction, accumulation, etc. We also - # don't support any special kwargs or higher arity ufuncs than binary. 
- if method != "__call__" or kwargs or ufunc.nin > 2: - return NotImplemented - - # Binary operations - binary_operations = { - # Arithmetic binary operations. - "add": "add", - "subtract": "sub", - "multiply": "mul", - "matmul": "matmul", - "divide": "truediv", - "true_divide": "truediv", - "floor_divide": "floordiv", - "power": "pow", - "float_power": "pow", - "remainder": "mod", - "mod": "mod", - "fmod": "mod", - # Bitwise binary operations. - "bitwise_and": "and", - "bitwise_or": "or", - "bitwise_xor": "xor", - # Comparison binary operators - "greater": "gt", - "greater_equal": "ge", - "less": "lt", - "less_equal": "le", - "not_equal": "ne", - "equal": "eq", - } - - # First look for methods of the class. - fname = ufunc.__name__ - if fname in binary_operations: - reflect = self is not inputs[0] - other = inputs[0] if reflect else inputs[1] - - # These operators need to be mapped to their inverses when - # performing a reflected operation because no reflected version of - # the operators themselves exist. - ops_without_reflection = { - "gt": "lt", - "ge": "le", - "lt": "gt", - "le": "ge", - # ne and eq are symmetric, so they are their own inverse op - "ne": "ne", - "eq": "eq", - } - - op = binary_operations[fname] - if reflect and op in ops_without_reflection: - op = ops_without_reflection[op] - reflect = False - op = f"__{'r' if reflect else ''}{op}__" - - # pandas bitwise operations return bools if indexes are misaligned. - # TODO: Generalize for other types of Frames - if ( - "bitwise" in fname - and isinstance(other, Series) - and not self.index.equals(other.index) - ): - return getattr(self, op)(other).astype(bool) - # Float_power returns float irrespective of the input type. - if fname == "float_power": - return getattr(self, op)(other).astype(float) - return getattr(self, op)(other) - - # Special handling for unary operations. - if fname == "negative": - return self * -1 - if fname == "positive": - return self.copy(deep=True) - if fname == "invert": - return ~self - if fname == "absolute": - return self.abs() - if fname == "fabs": - return self.abs().astype(np.float64) - - # Note: There are some operations that may be supported by libcudf but - # are not supported by pandas APIs. In particular, libcudf binary - # operations support logical and/or operations, but those operations - # are not defined on pd.Series/DataFrame. For now those operations will - # dispatch to cupy, but if ufuncs are ever a bottleneck we could add - # special handling to dispatch those (or any other) functions that we - # could implement without cupy. - - # Attempt to dispatch all other functions to cupy. - cupy_func = getattr(cupy, fname) - if cupy_func: - # Indices must be aligned before converting to arrays. - if ufunc.nin == 2 and all(map(isinstance, inputs, repeat(Series))): - inputs = _align_indices(inputs, allow_non_unique=True) - index = inputs[0].index - else: - index = self.index - - cupy_inputs = [] - mask = None - for inp in inputs: - # TODO: Generalize for other types of Frames - if isinstance(inp, Series) and inp.has_nulls: - new_mask = as_column(inp.nullmask) - - # TODO: This is a hackish way to perform a bitwise and of - # bitmasks. Once we expose cudf::detail::bitwise_and, then - # we can use that instead. - mask = new_mask if mask is None else (mask & new_mask) - - # Arbitrarily fill with zeros. For ufuncs, we assume that - # the end result propagates nulls via a bitwise and, so - # these elements are irrelevant. 
- inp = inp.fillna(0) - cupy_inputs.append(cupy.asarray(inp)) - - cp_output = cupy_func(*cupy_inputs, **kwargs) - - def make_frame(arr): - return self.__class__._from_data( - {self.name: as_column(arr).set_mask(mask)}, index=index - ) - - if ufunc.nout > 1: - return tuple(make_frame(out) for out in cp_output) - return make_frame(cp_output) - - return NotImplemented - def __array_function__(self, func, types, args, kwargs): handled_types = [cudf.Series] for t in types: @@ -1342,9 +1206,9 @@ def __repr__(self): lines.append(category_memory) return "\n".join(lines) - def _binaryop( + def _prep_for_binop( self, - other: Frame, + other: Any, fn: str, fill_value: Any = None, reflect: bool = False, @@ -1376,9 +1240,24 @@ def _binaryop( lhs = self operands = lhs._make_operands_for_binop(other, fill_value, reflect) + return operands, lhs._index + + def _binaryop( + self, + other: Frame, + fn: str, + fill_value: Any = None, + reflect: bool = False, + can_reindex: bool = False, + *args, + **kwargs, + ): + operands, out_index = self._prep_for_binop( + other, fn, fill_value, reflect, can_reindex + ) return ( - lhs._from_data( - data=lhs._colwise_binop(operands, fn), index=lhs._index, + self._from_data( + data=self._colwise_binop(operands, fn), index=out_index, ) if operands is not NotImplemented else NotImplemented diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index bf867923b57..50b206d3388 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. """Base class for Frame types that only have a single column.""" from __future__ import annotations @@ -274,7 +274,7 @@ def factorize(self, na_sentinel=-1): def _make_operands_for_binop( self, - other: T, + other: Any, fill_value: Any = None, reflect: bool = False, *args, @@ -310,7 +310,7 @@ def _make_operands_for_binop( else: result_name = self.name - # This needs to be tested correctly + # TODO: This needs to be tested correctly if isinstance(other, SingleColumnFrame): other = other._column elif not _is_scalar_or_zero_d_array(other): diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index a384ddecca6..f1aad1af9e6 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -109,7 +109,7 @@ def test_ufunc_series(ufunc, has_nulls, indexed): @pytest.mark.parametrize("reflect", [True, False]) def test_binary_ufunc_series_array(ufunc, has_nulls, indexed, type_, reflect): fname = ufunc.__name__ - if fname in ("greater", "greater_equal") and has_nulls: + if fname in ("greater", "greater_equal", "logical_and") and has_nulls: pytest.xfail( "The way cudf casts nans in arrays to nulls during binops with " "cudf objects is currently incompatible with pandas." @@ -181,3 +181,102 @@ def test_ufunc_cudf_series_error_with_out_kwarg(func): # this throws a value-error because of presence of out kwarg with pytest.raises(TypeError): func(x1=cudf_s1, x2=cudf_s2, out=cudf_s3) + + +# Skip matmul since it requires aligned shapes. +@pytest.mark.parametrize("ufunc", (uf for uf in _UFUNCS if uf != np.matmul)) +@pytest.mark.parametrize("has_nulls", [True, False]) +@pytest.mark.parametrize("indexed", [True, False]) +def test_ufunc_dataframe(ufunc, has_nulls, indexed): + # Note: This test assumes that all ufuncs are unary or binary. 
+ fname = ufunc.__name__ + # TODO: When pandas starts supporting misaligned indexes properly, remove + # this check but enable the one below. + if indexed: + pytest.xfail( + "pandas does not currently support misaligned indexes in " + "DataFrames, but we do. Until this is fixed we will skip these " + "tests. See the error here: " + "https://github.com/pandas-dev/pandas/blob/main/pandas/core/arraylike.py#L212, " # noqa: E501 + "called from https://github.com/pandas-dev/pandas/blob/main/pandas/core/arraylike.py#L258" # noqa: E501 + ) + # TODO: Enable the check below when we remove the check above. + # if indexed and fname in ( + # "greater", + # "greater_equal", + # "less", + # "less_equal", + # "not_equal", + # "equal", + # ): + # pytest.skip("Comparison operators do not support misaligned indexes.") # noqa: E501 + + N = 100 + # Avoid zeros in either array to skip division by 0 errors. Also limit the + # scale to avoid issues with overflow, etc. We use ints because some + # operations (like bitwise ops) are not defined for floats. + # TODO: Add tests of mismatched columns etc. + pandas_args = args = [ + cudf.DataFrame( + {"foo": cp.random.randint(low=1, high=10, size=N)}, + index=cp.random.choice(range(N), N, False) if indexed else None, + ) + for _ in range(ufunc.nin) + ] + + if has_nulls: + # Converting nullable integer cudf.Series to pandas will produce a + # float pd.Series, so instead we replace nulls with an arbitrary + # integer value, precompute the mask, and then reapply it afterwards. + for arg in args: + set_random_null_mask_inplace(arg["foo"]) + pandas_args = [arg.copy() for arg in args] + for arg in pandas_args: + arg["foo"] = arg["foo"].fillna(0) + + # Note: Different indexes must be aligned before the mask is computed. + # This requires using an internal function (_align_indices), and that + # is unlikely to change for the foreseeable future. + aligned = ( + cudf.core.dataframe._align_indices(*args) + if indexed and ufunc.nin == 2 + else args + ) + mask = reduce( + operator.or_, (a["foo"].isna() for a in aligned) + ).to_pandas() + + try: + got = ufunc(*args) + except AttributeError as e: + # We xfail if we don't have an explicit dispatch and cupy doesn't have + # the method so that we can easily identify these methods. As of this + # writing, the only missing methods are isnat and heaviside. + if "module 'cupy' has no attribute" in str(e): + pytest.xfail(reason="Operation not supported by cupy") + raise + + expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) + + try: + if ufunc.nout > 1: + for g, e in zip(got, expect): + if has_nulls: + e[mask] = np.nan + assert_eq(g, e) + else: + if has_nulls: + expect[mask] = np.nan + assert_eq(got, expect) + except AssertionError: + # TODO: This branch can be removed when + # https://github.com/rapidsai/cudf/issues/10178 is resolved + if fname in ("power", "float_power"): + not_equal = cudf.from_pandas(expect) != got + not_equal[got.isna()] = False + diffs = got[not_equal] - cudf.from_pandas( + expect[not_equal.to_pandas()] + ) + if diffs["foo"].abs().max() == 1: + pytest.xfail("https://github.com/rapidsai/cudf/issues/10178") + raise From 851e23545d29bca17a778f84dbc2000b3dde0ba8 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 15 Feb 2022 10:31:45 -0600 Subject: [PATCH 28/47] Reduce pytest runtime (#10203) This PR reduces the overall runtime of the cuDF pytest suite. 
Changes include:
- asserting equal on the GPU where possible for large datasets
- in some cases reducing excessive test data size

part of https://github.com/rapidsai/cudf/issues/9999

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Ashwin Srinath (https://github.com/shwina)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/10203
---
 python/cudf/cudf/testing/_utils.py            |  9 +++-
 .../test_avro_reader_fastavro_integration.py  |  4 +-
 python/cudf/cudf/tests/test_binops.py         | 15 +++---
 python/cudf/cudf/tests/test_csv.py            | 11 +++--
 .../cudf/tests/test_extension_compilation.py  | 12 +++--
 python/cudf/cudf/tests/test_indexing.py       | 46 +++++++++----------
 python/cudf/cudf/tests/test_orc.py            | 24 +++++-----
 python/cudf/cudf/tests/test_parquet.py        |  8 ++--
 python/cudf/cudf/tests/test_repr.py           | 45 ++++++++----------
 python/cudf/cudf/tests/test_reshape.py        |  8 ++--
 python/cudf/cudf/tests/test_string.py         |  7 +--
 python/cudf/cudf/tests/test_udf_masked_ops.py | 18 +++++---
 12 files changed, 106 insertions(+), 101 deletions(-)

diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py
index b97b2d660d6..e767c0c62be 100644
--- a/python/cudf/cudf/testing/_utils.py
+++ b/python/cudf/cudf/testing/_utils.py
@@ -1,5 +1,6 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.

+import itertools
 import re
 import warnings
 from collections.abc import Mapping, Sequence
@@ -330,3 +331,9 @@ def does_not_raise():

 def xfail_param(param, **kwargs):
     return pytest.param(param, marks=pytest.mark.xfail(**kwargs))
+
+
+parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize(
+    "left_dtype,right_dtype",
+    list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)),
+)
diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
index a064bec9e82..9eb01ae31b4 100644
--- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
+++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -210,7 +210,7 @@ def test_can_parse_no_schema(): assert_eq(expected, actual) -@pytest.mark.parametrize("rows", [0, 1, 10, 100000]) +@pytest.mark.parametrize("rows", [0, 1, 10, 1000]) @pytest.mark.parametrize("codec", ["null", "deflate", "snappy"]) def test_avro_compression(rows, codec): schema = { diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 76add8b9c5d..02ca7a0cd58 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -4,7 +4,7 @@ import decimal import operator import random -from itertools import product +from itertools import combinations_with_replacement, product import cupy as cp import numpy as np @@ -216,13 +216,12 @@ def test_series_compare(cmpop, obj_class, dtype): def _series_compare_nulls_typegen(): - tests = [] - tests += list(product(DATETIME_TYPES, DATETIME_TYPES)) - tests += list(product(TIMEDELTA_TYPES, TIMEDELTA_TYPES)) - tests += list(product(NUMERIC_TYPES, NUMERIC_TYPES)) - tests += list(product(STRING_TYPES, STRING_TYPES)) - - return tests + return [ + *combinations_with_replacement(DATETIME_TYPES, 2), + *combinations_with_replacement(TIMEDELTA_TYPES, 2), + *combinations_with_replacement(NUMERIC_TYPES, 2), + *combinations_with_replacement(STRING_TYPES, 2), + ] @pytest.mark.parametrize("cmpop", _cmpops) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 9208b8c7cd4..f3d69e1745e 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. import gzip import os @@ -8,6 +8,7 @@ from io import BytesIO, StringIO from pathlib import Path +import cupy as cp import numpy as np import pandas as pd import pytest @@ -1009,17 +1010,17 @@ def test_small_zip(tmpdir): def test_csv_reader_carriage_return(tmpdir): rows = 1000 names = ["int_row", "int_double_row"] - buffer = ",".join(names) + "\r\n" for row in range(rows): buffer += str(row) + ", " + str(2 * row) + "\r\n" df = read_csv(StringIO(buffer)) + expect = cudf.DataFrame( + {"int_row": cp.arange(rows), "int_double_row": cp.arange(rows) * 2} + ) assert len(df) == rows - for row in range(0, rows): - assert df[names[0]][row] == row - assert df[names[1]][row] == 2 * row + assert_eq(expect, df) def test_csv_reader_tabs(): diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py index 47c9448cf63..692f40873d7 100644 --- a/python/cudf/cudf/tests/test_extension_compilation.py +++ b/python/cudf/cudf/tests/test_extension_compilation.py @@ -1,13 +1,17 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
import operator import cupy as cp +import numpy as np import pytest from numba import cuda, types from numba.cuda import compile_ptx +from numba.np.numpy_support import from_dtype from cudf import NA from cudf.core.udf.api import Masked from cudf.core.udf.typing import MaskedType +from cudf.testing._utils import parametrize_numeric_dtypes_pairwise arith_ops = ( operator.add, @@ -159,19 +163,21 @@ def func(x): @pytest.mark.parametrize("op", ops) -@pytest.mark.parametrize("ty1", number_types, ids=number_ids) -@pytest.mark.parametrize("ty2", number_types, ids=number_ids) +@parametrize_numeric_dtypes_pairwise @pytest.mark.parametrize( "masked", ((False, True), (True, False), (True, True)), ids=("um", "mu", "mm"), ) -def test_compile_arith_masked_ops(op, ty1, ty2, masked): +def test_compile_arith_masked_ops(op, left_dtype, right_dtype, masked): def func(x, y): return op(x, y) cc = (7, 5) + ty1 = from_dtype(np.dtype(left_dtype)) + ty2 = from_dtype(np.dtype(right_dtype)) + if masked[0]: ty1 = MaskedType(ty1) if masked[1]: diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 102e5b57e8e..19d7c8a10ab 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. from itertools import combinations @@ -1292,45 +1292,43 @@ def test_loc_datetime_index(sli, is_dataframe): @pytest.mark.parametrize( - "gdf", + "gdf_kwargs", [ - cudf.DataFrame({"a": range(1000000)}), - cudf.DataFrame({"a": range(1000000), "b": range(1000000)}), - cudf.DataFrame({"a": range(20), "b": range(20)}), - cudf.DataFrame( - { + {"data": {"a": range(100000)}}, + {"data": {"a": range(100000), "b": range(100000)}}, + { + "data": { "a": range(20), "b": range(20), "c": ["abc", "def", "xyz", "def", "pqr"] * 4, } - ), - cudf.DataFrame(index=[1, 2, 3]), - cudf.DataFrame(index=range(1000000)), - cudf.DataFrame(columns=["a", "b", "c", "d"]), - cudf.DataFrame(columns=["a"], index=range(1000000)), - cudf.DataFrame( - columns=["a", "col2", "...col n"], index=range(1000000) - ), - cudf.DataFrame(index=cudf.Series(range(1000000)).astype("str")), - cudf.DataFrame( - columns=["a", "b", "c", "d"], - index=cudf.Series(range(1000000)).astype("str"), - ), + }, + {"index": [1, 2, 3]}, + {"index": range(100000)}, + {"columns": ["a", "b", "c", "d"]}, + {"columns": ["a"], "index": range(100000)}, + {"columns": ["a", "col2", "...col n"], "index": range(100000)}, + {"index": cudf.Series(range(100000)).astype("str")}, + { + "columns": ["a", "b", "c", "d"], + "index": cudf.Series(range(100000)).astype("str"), + }, ], ) @pytest.mark.parametrize( "slice", [ - slice(250000, 500000), - slice(250000, 250001), - slice(500000), + slice(25000, 50000), + slice(25000, 25001), + slice(50000), slice(1, 10), slice(10, 20), slice(15, 24000), slice(6), ], ) -def test_dataframe_sliced(gdf, slice): +def test_dataframe_sliced(gdf_kwargs, slice): + gdf = cudf.DataFrame(**gdf_kwargs) pdf = gdf.to_pandas() actual = gdf[slice] diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 8689f773a02..623098741a9 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -16,6 +16,7 @@ import cudf from cudf.io.orc import ORCWriter +from cudf.testing import assert_frame_equal from cudf.testing._utils import ( assert_eq, gen_rand_series, @@ -93,7 +94,7 @@ def test_orc_reader_basic(datadir, inputfile, columns, use_index, engine): path, 
engine=engine, columns=columns, use_index=use_index ) - assert_eq(expect, got, check_categorical=False) + assert_frame_equal(cudf.from_pandas(expect), got, check_categorical=False) def test_orc_reader_filenotfound(tmpdir): @@ -384,11 +385,13 @@ def test_orc_writer(datadir, tmpdir, reference_file, columns, compression): else: print(type(excpr).__name__) - expect = orcfile.read(columns=columns).to_pandas() - cudf.from_pandas(expect).to_orc(gdf_fname.strpath, compression=compression) - got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas() + expect = cudf.from_pandas(orcfile.read(columns=columns).to_pandas()) + expect.to_orc(gdf_fname.strpath, compression=compression) + got = cudf.from_pandas( + pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas() + ) - assert_eq(expect, got) + assert_frame_equal(expect, got) @pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"]) @@ -405,11 +408,11 @@ def test_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): else: print(type(excpr).__name__) - expect = orcfile.read().to_pandas() - cudf.from_pandas(expect).to_orc(gdf_fname.strpath, statistics=stats_freq) - got = pa.orc.ORCFile(gdf_fname).read().to_pandas() + expect = cudf.from_pandas(orcfile.read().to_pandas()) + expect.to_orc(gdf_fname.strpath, statistics=stats_freq) + got = cudf.from_pandas(pa.orc.ORCFile(gdf_fname).read().to_pandas()) - assert_eq(expect, got) + assert_frame_equal(expect, got) @pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"]) @@ -492,8 +495,7 @@ def test_chunked_orc_writer( writer.close() got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas() - - assert_eq(expect, got) + assert_frame_equal(cudf.from_pandas(expect), cudf.from_pandas(got)) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index e1ca9f6f006..7feaa400446 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1105,9 +1105,9 @@ def test_parquet_reader_list_large_multi_rowgroup_nulls(tmpdir): assert_eq(expect, got) -@pytest.mark.parametrize("skip", range(0, 128)) +@pytest.mark.parametrize("skip", [0, 1, 5, 10]) def test_parquet_reader_list_skiprows(skip, tmpdir): - num_rows = 128 + num_rows = 10 src = pd.DataFrame( { "a": list_gen(int_gen, 0, num_rows, 80, 50), @@ -1124,9 +1124,9 @@ def test_parquet_reader_list_skiprows(skip, tmpdir): assert_eq(expect, got, check_dtype=False) -@pytest.mark.parametrize("skip", range(0, 120)) +@pytest.mark.parametrize("skip", [0, 1, 5, 10]) def test_parquet_reader_list_num_rows(skip, tmpdir): - num_rows = 128 + num_rows = 20 src = pd.DataFrame( { "a": list_gen(int_gen, 0, num_rows, 80, 50), diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index ca02ee55df0..8f2e4811e36 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
import textwrap @@ -13,7 +13,14 @@ from cudf.testing import _utils as utils from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes -repr_categories = utils.NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] +repr_categories = [ + "uint16", + "int64", + "float64", + "str", + "category", + "datetime64[ns]", +] @pytest.mark.parametrize("dtype", repr_categories) @@ -84,36 +91,22 @@ def test_full_series(nrows, dtype): pd.reset_option("display.max_rows") +@pytest.mark.parametrize("nrows", [5, 10, 15]) +@pytest.mark.parametrize("ncols", [5, 10, 15]) +@pytest.mark.parametrize("size", [20, 21]) @pytest.mark.parametrize("dtype", repr_categories) -@pytest.mark.parametrize("nrows", [0, 1, 2, 9, 20 / 2, 11, 20 - 1, 20, 20 + 1]) -@pytest.mark.parametrize("ncols", [0, 1, 2, 9, 20 / 2, 11, 20 - 1, 20, 20 + 1]) -def test_full_dataframe_20(dtype, nrows, ncols): - size = 20 - pdf = pd.DataFrame( - {idx: np.random.randint(0, 100, size) for idx in range(size)} - ).astype(dtype) - gdf = cudf.from_pandas(pdf) - - assert pdf.__repr__() == gdf.__repr__() - assert pdf._repr_html_() == gdf._repr_html_() - assert pdf._repr_latex_() == gdf._repr_latex_() - - -@pytest.mark.parametrize("dtype", repr_categories) -@pytest.mark.parametrize("nrows", [9, 21 / 2, 11, 21 - 1]) -@pytest.mark.parametrize("ncols", [9, 21 / 2, 11, 21 - 1]) -def test_full_dataframe_21(dtype, nrows, ncols): - size = 21 +def test_full_dataframe_20(dtype, size, nrows, ncols): pdf = pd.DataFrame( {idx: np.random.randint(0, 100, size) for idx in range(size)} ).astype(dtype) gdf = cudf.from_pandas(pdf) - pd.options.display.max_rows = int(nrows) - pd.options.display.max_columns = int(ncols) - assert pdf.__repr__() == gdf.__repr__() - pd.reset_option("display.max_rows") - pd.reset_option("display.max_columns") + with pd.option_context( + "display.max_rows", int(nrows), "display.max_columns", int(ncols) + ): + assert repr(pdf) == repr(gdf) + assert pdf._repr_html_() == gdf._repr_html_() + assert pdf._repr_latex_() == gdf._repr_latex_() @given( diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index b8f975f233e..2efa781c506 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
import re @@ -17,9 +17,9 @@ ) -@pytest.mark.parametrize("num_id_vars", [0, 1, 2, 10]) -@pytest.mark.parametrize("num_value_vars", [0, 1, 2, 10]) -@pytest.mark.parametrize("num_rows", [1, 2, 1000]) +@pytest.mark.parametrize("num_id_vars", [0, 1, 2]) +@pytest.mark.parametrize("num_value_vars", [0, 1, 2]) +@pytest.mark.parametrize("num_rows", [1, 2, 100]) @pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES) @pytest.mark.parametrize("nulls", ["none", "some", "all"]) def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index efe8e523d4e..56218372c23 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -532,12 +532,7 @@ def _cat_convert_seq_to_cudf(others): @pytest.mark.parametrize("sep", [None, "", " ", "|", ",", "|||"]) @pytest.mark.parametrize("na_rep", [None, "", "null", "a"]) @pytest.mark.parametrize( - "index", - [ - ["1", "2", "3", "4", "5"], - pd.Series(["1", "2", "3", "4", "5"]), - pd.Index(["1", "2", "3", "4", "5"]), - ], + "index", [["1", "2", "3", "4", "5"]], ) def test_string_cat(ps_gs, others, sep, na_rep, index): ps, gs = ps_gs diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index 56090c8eacf..faaea6eec08 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -1,3 +1,4 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. import math import operator @@ -14,7 +15,11 @@ unary_ops, ) from cudf.core.udf.utils import precompiled -from cudf.testing._utils import NUMERIC_TYPES, _decimal_series, assert_eq +from cudf.testing._utils import ( + _decimal_series, + assert_eq, + parametrize_numeric_dtypes_pairwise, +) def run_masked_udf_test(func, data, args=(), **kwargs): @@ -238,10 +243,9 @@ def func(row): run_masked_udf_test(func, gdf, check_dtype=False) -@pytest.mark.parametrize("dtype_a", list(NUMERIC_TYPES)) -@pytest.mark.parametrize("dtype_b", list(NUMERIC_TYPES)) +@parametrize_numeric_dtypes_pairwise @pytest.mark.parametrize("op", [operator.add, operator.and_, operator.eq]) -def test_apply_mixed_dtypes(dtype_a, dtype_b, op): +def test_apply_mixed_dtypes(left_dtype, right_dtype, op): """ Test that operations can be performed between columns of different dtypes and return a column with the correct @@ -251,7 +255,7 @@ # First perform the op on two dummy data on host, if numpy can # safely type cast, we should expect it to work in udf too. try: - op(getattr(np, dtype_a)(0), getattr(np, dtype_b)(42)) + op(np.dtype(left_dtype).type(0), np.dtype(right_dtype).type(42)) except TypeError: pytest.skip("Operation is unsupported for corresponding dtype.") @@ -261,8 +265,8 @@ def func(row): return op(x, y) gdf = cudf.DataFrame({"a": [1.5, None, 3, None], "b": [4, 5, None, None]}) - gdf["a"] = gdf["a"].astype(dtype_a) - gdf["b"] = gdf["b"].astype(dtype_b) + gdf["a"] = gdf["a"].astype(left_dtype) + gdf["b"] = gdf["b"].astype(right_dtype) run_masked_udf_test(func, gdf, check_dtype=False) From 183eec3253b1dbef6760e65f1b4424dbb4458e27 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 15 Feb 2022 11:49:24 -0600 Subject: [PATCH 29/47] Deprecate `DataFrame.iteritems` and introduce `.items` (#10298) This PR deprecates `DataFrame.iteritems` to be in line with pandas https://github.com/pandas-dev/pandas/pull/45321/files.
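For illustration, a minimal sketch of the intended user-facing behavior (the toy frame and printed values here are hypothetical; the warning and the new `.items` method match the diff below):

```python
import warnings

import cudf

df = cudf.DataFrame({"a": [1, 2], "b": [3, 4]})  # hypothetical toy frame

# The old spelling keeps working for now but emits a FutureWarning
# pointing users at .items, mirroring the pandas deprecation.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    list(df.iteritems())
assert any(issubclass(w.category, FutureWarning) for w in caught)

# The replacement spelling yields (column_name, Series) pairs.
for name, col in df.items():
    print(name, col.sum())
```
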
This PR also introduces `DataFrame.items` to support a dask breaking change: https://github.com/dask/dask/pull/8660 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/10298 --- python/cudf/cudf/core/dataframe.py | 12 +++++++++++- python/cudf/cudf/tests/test_concat.py | 6 +++--- python/cudf/cudf/tests/test_dataframe.py | 8 +++----- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index fb15f8da8d9..75c515df719 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2066,6 +2066,16 @@ def __iter__(self): @annotate("DATAFRAME_ITERITEMS", color="blue", domain="cudf_python") def iteritems(self): + """Iterate over column names and series pairs""" + warnings.warn( + "iteritems is deprecated and will be removed in a future version. " + "Use .items instead.", + FutureWarning, + ) + return self.items() + + @annotate("DATAFRAME_ITEMS", color="blue", domain="cudf_python") + def items(self): """Iterate over column names and series pairs""" for k in self: yield (k, self[k]) @@ -4570,7 +4580,7 @@ def from_pandas(cls, dataframe, nan_as_null=None): df = cls() # Set columns - for col_name, col_value in dataframe.iteritems(): + for col_name, col_value in dataframe.items(): # necessary because multi-index can return multiple # columns for a single key if len(col_value.shape) == 1: diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index b8724fe36f5..1ab5931fe5f 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -574,7 +574,7 @@ def test_concat_empty_dataframes(df, other, ignore_index): expected = pd.concat(other_pd, ignore_index=ignore_index) actual = gd.concat(other_gd, ignore_index=ignore_index) if expected.shape != df.shape: - for key, col in actual[actual.columns].iteritems(): + for key, col in actual[actual.columns].items(): if is_categorical_dtype(col.dtype): if not is_categorical_dtype(expected[key].dtype): # TODO: Pandas bug: @@ -1184,7 +1184,7 @@ def test_concat_join_empty_dataframes( ) if expected.shape != df.shape: if axis == 0: - for key, col in actual[actual.columns].iteritems(): + for key, col in actual[actual.columns].items(): if is_categorical_dtype(col.dtype): if not is_categorical_dtype(expected[key].dtype): # TODO: Pandas bug: @@ -1306,7 +1306,7 @@ def test_concat_join_empty_dataframes_axis_1( ) if expected.shape != df.shape: if axis == 0: - for key, col in actual[actual.columns].iteritems(): + for key, col in actual[actual.columns].items(): if is_categorical_dtype(col.dtype): expected[key] = expected[key].fillna("-1") actual[key] = col.astype("str").fillna("-1") diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index fb173bc0eab..fdb3f430a18 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -917,9 +917,7 @@ def test_dataframe_dtypes(): dtypes = pd.Series( [np.int32, np.float32, np.float64], index=["c", "a", "b"] ) - df = cudf.DataFrame( - {k: np.ones(10, dtype=v) for k, v in dtypes.iteritems()} - ) + df = cudf.DataFrame({k: np.ones(10, dtype=v) for k, v in dtypes.items()}) assert df.dtypes.equals(dtypes) @@ -1956,7 +1954,7 @@ def test_dataframe_reductions(data, axis, func, skipna): elif func not in cudf.core.dataframe._cupy_nan_methods_map: if 
skipna is False: expected_exception = NotImplementedError - elif any(col.nullable for name, col in gdf.iteritems()): + elif any(col.nullable for name, col in gdf.items()): expected_exception = ValueError elif func in ("cummin", "cummax"): expected_exception = AttributeError @@ -2134,7 +2132,7 @@ def test_iter(pdf, gdf): def test_iteritems(gdf): - for k, v in gdf.iteritems(): + for k, v in gdf.items(): assert k in gdf.columns assert isinstance(v, cudf.Series) assert_eq(v, gdf[k]) From 7a620c4cad3a4ab5f2c32c451e5b661ffb6cedb8 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Tue, 15 Feb 2022 23:51:44 +0530 Subject: [PATCH 30/47] generate url decode benchmark input in device (#10278) Use device functions to move input generation to device in url_decode benchmark. Splitting PR https://github.com/rapidsai/cudf/pull/10109 for review Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - David Wendt (https://github.com/davidwendt) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/10278 --- cpp/benchmarks/CMakeLists.txt | 2 +- .../string/{url_decode.cpp => url_decode.cu} | 65 +++++++++++-------- 2 files changed, 40 insertions(+), 27 deletions(-) rename cpp/benchmarks/string/{url_decode.cpp => url_decode.cu} (53%) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 0704180bad0..3bc6dc10fdf 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -265,7 +265,7 @@ ConfigureBench( string/split.cpp string/substring.cpp string/translate.cpp - string/url_decode.cpp + string/url_decode.cu ) # ################################################################################################## diff --git a/cpp/benchmarks/string/url_decode.cpp b/cpp/benchmarks/string/url_decode.cu similarity index 53% rename from cpp/benchmarks/string/url_decode.cpp rename to cpp/benchmarks/string/url_decode.cu index 6dc79c44437..c460820d788 100644 --- a/cpp/benchmarks/string/url_decode.cpp +++ b/cpp/benchmarks/string/url_decode.cu @@ -16,11 +16,12 @@ #include #include -#include #include #include +#include #include +#include #include #include @@ -28,43 +29,55 @@ #include #include -#include -#include +#include +#include +#include +#include struct url_string_generator { - size_t num_chars; - std::bernoulli_distribution dist; - - url_string_generator(size_t num_chars, double esc_seq_chance) - : num_chars{num_chars}, dist{esc_seq_chance} + char* chars; + double esc_seq_chance; + thrust::minstd_rand engine; + thrust::uniform_real_distribution esc_seq_dist; + url_string_generator(char* c, double esc_seq_chance, thrust::minstd_rand& engine) + : chars(c), esc_seq_chance(esc_seq_chance), engine(engine), esc_seq_dist(0, 1) { } - std::string operator()(std::mt19937& engine) + __device__ void operator()(thrust::tuple str_begin_end) { - std::string str; - str.reserve(num_chars); - while (str.size() < num_chars) { - if (str.size() < num_chars - 3 && dist(engine)) { - str += "%20"; + auto begin = thrust::get<0>(str_begin_end); + auto end = thrust::get<1>(str_begin_end); + engine.discard(begin); + for (auto i = begin; i < end; ++i) { + if (esc_seq_dist(engine) < esc_seq_chance and i < end - 3) { + chars[i] = '%'; + chars[i + 1] = '2'; + chars[i + 2] = '0'; + i += 2; } else { - str.push_back('a'); + chars[i] = 'a'; } } - return str; } }; -cudf::test::strings_column_wrapper generate_column(cudf::size_type num_rows, - cudf::size_type chars_per_row, - 
double esc_seq_chance) +auto generate_column(cudf::size_type num_rows, cudf::size_type chars_per_row, double esc_seq_chance) { - std::mt19937 engine(1); - url_string_generator url_gen(chars_per_row, esc_seq_chance); - std::vector strings; - strings.reserve(num_rows); - std::generate_n(std::back_inserter(strings), num_rows, [&]() { return url_gen(engine); }); - return cudf::test::strings_column_wrapper(strings.begin(), strings.end()); + std::vector strings{std::string(chars_per_row, 'a')}; + auto col_1a = cudf::test::strings_column_wrapper(strings.begin(), strings.end()); + auto table_a = cudf::repeat(cudf::table_view{{col_1a}}, num_rows); + auto result_col = std::move(table_a->release()[0]); // string column with num_rows aaa... + auto chars_col = result_col->child(cudf::strings_column_view::chars_column_index).mutable_view(); + auto offset_col = result_col->child(cudf::strings_column_view::offsets_column_index).view(); + + auto engine = thrust::default_random_engine{}; + thrust::for_each_n(thrust::device, + thrust::make_zip_iterator(offset_col.begin(), + offset_col.begin() + 1), + num_rows, + url_string_generator{chars_col.begin(), esc_seq_chance, engine}); + return result_col; } class UrlDecode : public cudf::benchmark { @@ -76,7 +89,7 @@ void BM_url_decode(benchmark::State& state, int esc_seq_pct) cudf::size_type const chars_per_row = state.range(1); auto column = generate_column(num_rows, chars_per_row, esc_seq_pct / 100.0); - auto strings_view = cudf::strings_column_view(column); + auto strings_view = cudf::strings_column_view(column->view()); for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); From f263820f98cb280b17f5409e9c9204b943fe1968 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Tue, 15 Feb 2022 23:58:03 +0530 Subject: [PATCH 31/47] move input generation for type dispatcher benchmark to device (#10280) Use `cudf::sequence` to move input generation to device in type dispatcher benchmark. Splitting PR https://github.com/rapidsai/cudf/pull/10109 for review Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/10280 --- .../type_dispatcher/type_dispatcher.cu | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu index 90097889a86..ca19e3046ad 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,15 +14,16 @@ * limitations under the License. 
*/ -#include "../fixture/benchmark_fixture.hpp" -#include "../synchronization/synchronization.hpp" +#include +#include #include #include #include -#include #include +#include +#include #include #include @@ -170,21 +171,18 @@ void launch_kernel(cudf::mutable_table_view input, T** d_ptr, int work_per_threa template void type_dispatcher_benchmark(::benchmark::State& state) { - const auto source_size = static_cast(state.range(1)); - - const auto n_cols = static_cast(state.range(0)); - + const auto n_cols = static_cast(state.range(0)); + const auto source_size = static_cast(state.range(1)); const auto work_per_thread = static_cast(state.range(2)); - auto data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); + auto init = cudf::make_fixed_width_scalar(static_cast(0)); - std::vector> source_column_wrappers; + std::vector> source_column_wrappers; std::vector source_columns; for (int i = 0; i < n_cols; ++i) { - source_column_wrappers.push_back( - cudf::test::fixed_width_column_wrapper(data, data + source_size)); - source_columns.push_back(source_column_wrappers[i]); + source_column_wrappers.push_back(cudf::sequence(source_size, *init)); + source_columns.push_back(*source_column_wrappers[i]); } cudf::mutable_table_view source_table{source_columns}; From 9eb6a66f6f2bb1e5701379154ce2affd57408acd Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 15 Feb 2022 21:05:08 -0500 Subject: [PATCH 32/47] Explicitly request CMake use `gnu++17` over `c++17` (#10297) Currently cudf requires compiler extensions to use `__int128_t` with standard library containers. Ensure the correct compilation flags are always provided. Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) - Conor Hoekstra (https://github.com/codereport) - Jordan Jacobelli (https://github.com/Ethyling) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10297 --- cpp/CMakeLists.txt | 2 ++ cpp/benchmarks/CMakeLists.txt | 6 ++++++ cpp/tests/CMakeLists.txt | 7 +++++++ 3 files changed, 15 insertions(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 407e1f9a858..2ffd62f1b53 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -498,6 +498,8 @@ set_target_properties( # set target compile options CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON + # For std:: support of __int128_t. Can be removed once using cuda::std + CXX_EXTENSIONS ON CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON POSITION_INDEPENDENT_CODE ON diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 3bc6dc10fdf..2e5c6ec8ca8 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -59,6 +59,12 @@ function(ConfigureBench CMAKE_BENCH_NAME) ${CMAKE_BENCH_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + # For std:: support of __int128_t.
Can be removed once using cuda::std + CXX_EXTENSIONS ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON ) target_link_libraries( ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 27dd472b3f5..d9b4ed01605 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -23,7 +23,14 @@ function(ConfigureTest CMAKE_TEST_NAME) ${CMAKE_TEST_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + # For std:: support of __int128_t. Can be removed once using cuda::std + CXX_EXTENSIONS ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON ) + target_link_libraries(${CMAKE_TEST_NAME} PRIVATE cudftestutil GTest::gmock_main GTest::gtest_main) add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME}) install( From 203f7b09fd39793b40e131fb50c28e68a4dc1392 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 15 Feb 2022 22:13:35 -0600 Subject: [PATCH 33/47] DataFrame `insert` and creation optimizations (#10285) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR removes double index equality & reindexing checks in `DataFrame` construction and `insert` code-flows. `scalar_broadcast_to`, a **2.2x speedup** for `str` dtype, numeric types perf remains the same: ```python # This PR: In [12]: %timeit cudf.utils.utils.scalar_broadcast_to('abc', 400000000, "str") 100 ms ± 208 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) # branch-22.04 In [25]: %timeit cudf.utils.utils.scalar_broadcast_to('abc', 400000000, "str") 227 ms ± 10 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) ``` `assign`, a **1.2x-1.4x speedup**: ```python In [9]: import numpy as np In [10]: df = cudf.DataFrame( ...: { ...: "a": [1, 2, 3, np.nan] * 10000000, ...: "b": ["a", "b", "c", "d"] * 10000000, ...: "c": [0.0, 0.12, np.nan, 10.12] * 10000000, ...: }, ...: ) # THIS PR In [11]: %timeit df.assign(f=10) 28.1 ms ± 365 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) # branch-22.04 In [4]: %timeit df.assign(f=10) 35 ms ± 10.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) # THIS PR In [4]: %timeit df.assign(g='hello world') 38.4 ms ± 250 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) # branch-22.04 In [4]: %timeit df.assign(g='hello world') 54 ms ± 82.6 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) ``` DataFrame constructor, a **2x speedup**: ```python In [2]: s1 = cudf.Series([1, 2, 3] * 1000000) In [4]: s1.index = s1.index.astype("float64") In [5]: s2 = cudf.Series([2, 3, 3] * 1000000) In [6]: s2.index = s2.index.astype("float64") In [12]: s3 = cudf.Series([10, 11, 12] * 1000000) In [13]: s3.index = s3.index.astype("float64") # THIS PR In [14]: %timeit cudf.DataFrame({'a':s1, 'b':s2, 'c':s3}) 2.81 ms ± 63.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) # branch-22.04 In [9]: %timeit cudf.DataFrame({'a':s1, 'b':s2, 'c':s3}) 5.37 ms ± 188 µs per loop (mean ± std. dev. 
of 7 runs, 100 loops each) ``` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10285 --- python/cudf/cudf/core/dataframe.py | 52 +++++++++++++++++++----- python/cudf/cudf/core/frame.py | 3 +- python/cudf/cudf/core/groupby/groupby.py | 2 +- python/cudf/cudf/utils/utils.py | 10 +---- 4 files changed, 44 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 75c515df719..efff67b686a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -831,8 +831,8 @@ def _init_from_dict_like( self._data.multiindex = self._data.multiindex and isinstance( col_name, tuple ) - self.insert( - i, col_name, data[col_name], nan_as_null=nan_as_null + self._insert( + i, col_name, data[col_name], nan_as_null=nan_as_null, ) if columns is not None: @@ -1093,7 +1093,7 @@ def __setitem__(self, arg, value): ) else: for col_name in self._data: - scatter_map = arg[col_name] + scatter_map = arg._data[col_name] if is_scalar(value): self._data[col_name][scatter_map] = value else: @@ -2571,6 +2571,29 @@ def insert(self, loc, name, value, nan_as_null=None): name : number or string name or label of column to be inserted value : Series or array-like + nan_as_null : bool, Default None + If ``None``/``True``, converts ``np.nan`` values to + ``null`` values. + If ``False``, leaves ``np.nan`` values as is. + """ + return self._insert( + loc=loc, + name=name, + value=value, + nan_as_null=nan_as_null, + ignore_index=False, + ) + + @annotate("DATAFRAME__INSERT", color="green", domain="cudf_python") + def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): + """ + Same as `insert`, with additional `ignore_index` param. + + ignore_index : bool, default True + If True, there will be no index equality check & reindexing + happening. + If False, a reindexing operation is performed if + `value.index` is not equal to `self.index`. 
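For illustration, a hypothetical (non-doctested) sketch: if ``self.index`` is ``[0, 1, 2]`` and ``value`` is a Series with values ``[10, 20, 30]`` and index ``[2, 1, 0]``, then ``ignore_index=True`` inserts ``[10, 20, 30]`` positionally, while ``ignore_index=False`` first aligns ``value`` to ``self.index`` and inserts ``[30, 20, 10]``.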
""" if name in self._data: raise NameError(f"duplicated column name {name}") @@ -2591,7 +2614,8 @@ def insert(self, loc, name, value, nan_as_null=None): if len(self) == 0: if isinstance(value, (pd.Series, Series)): - self._index = as_index(value.index) + if not ignore_index: + self._index = as_index(value.index) elif len(value) > 0: self._index = RangeIndex(start=0, stop=len(value)) new_data = self._data.__class__() @@ -2604,9 +2628,11 @@ def insert(self, loc, name, value, nan_as_null=None): ) self._data = new_data elif isinstance(value, (pd.Series, Series)): - value = Series(value, nan_as_null=nan_as_null)._align_to_index( - self._index, how="right", sort=False - ) + value = Series(value, nan_as_null=nan_as_null) + if not ignore_index: + value = value._align_to_index( + self._index, how="right", sort=False + ) value = column.as_column(value, nan_as_null=nan_as_null) @@ -4731,8 +4757,8 @@ def to_arrow(self, preserve_index=True): for gen_name, col_name in zip( gen_names, self.index._data.names ): - data.insert( - data.shape[1], gen_name, self.index._data[col_name] + data._insert( + data.shape[1], gen_name, self.index._data[col_name], ) descr = gen_names[0] index_descr.append(descr) @@ -5725,7 +5751,7 @@ def select_dtypes(self, include=None, exclude=None): for k, col in self._data.items(): infered_type = cudf_dtype_from_pydata_dtype(col.dtype) if infered_type in inclusion: - df.insert(len(df._data), k, col) + df._insert(len(df._data), k, col) return df @@ -6540,7 +6566,11 @@ def _setitem_with_dataframe( raise ValueError("Can not insert new column with a bool mask") else: # handle append case - input_df.insert(len(input_df._data), col_1, replace_df[col_2]) + input_df._insert( + loc=len(input_df._data), + name=col_1, + value=replace_df[col_2], + ) def extract_col(df, col): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 6038bb49bfb..d478baf2d69 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6806,10 +6806,9 @@ def _drop_rows_by_labels( join_res = working_df.join(to_join, how="leftanti") # 4. 
Reconstruct original layout, and rename - join_res.insert( + join_res._insert( ilevel, name=join_res._index.name, value=join_res._index ) - join_res = join_res.reset_index(drop=True) midx = cudf.MultiIndex.from_frame( join_res.iloc[:, 0:idx_nlv], names=obj._index.names diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 4bd14a2c47b..5c1c7bab245 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -287,7 +287,7 @@ def agg(self, func): if not self._as_index: for col_name in reversed(self.grouping._named_columns): - result.insert( + result._insert( 0, col_name, result.index.get_level_values(col_name)._values, diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 4143cbd1d66..8adca07b252 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -67,15 +67,7 @@ def scalar_broadcast_to(scalar, size, dtype=None): scalar = to_cudf_compatible_scalar(scalar, dtype=dtype) dtype = scalar.dtype - if cudf.dtype(dtype).kind in ("O", "U"): - gather_map = column.full(size, 0, dtype="int32") - scalar_str_col = column.as_column([scalar], dtype="str") - return scalar_str_col[gather_map] - else: - out_col = column.column_empty(size, dtype=dtype) - if out_col.size != 0: - out_col.data_array_view[:] = scalar - return out_col + return cudf.core.column.full(size=size, fill_value=scalar, dtype=dtype) def initfunc(f): From dffed183b6548452b1d363e57e1f1b0ec2dcffac Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Wed, 16 Feb 2022 13:57:28 +0530 Subject: [PATCH 34/47] device input generation in join bench (#10277) Use device functions to move input generation to device in join benchmark. Splitting PR https://github.com/rapidsai/cudf/pull/10109 for review Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Devavret Makkar (https://github.com/devavret) - Vyas Ramasubramani (https://github.com/vyasr) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/10277 --- cpp/benchmarks/join/join_common.hpp | 58 +++++++++++++++++------------ 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp index e88253395d8..f2b9cb1bdb9 100644 --- a/cpp/benchmarks/join/join_common.hpp +++ b/cpp/benchmarks/join/join_common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,24 +16,41 @@ #pragma once -#include -#include +#include "generate_input_tables.cuh" -#include +#include +#include + +#include +#include #include +#include +#include #include +#include #include #include -#include -#include -#include -#include +#include + +#include +#include +#include #include -#include "generate_input_tables.cuh" +struct null75_generator { + thrust::minstd_rand engine; + thrust::uniform_int_distribution rand_gen; + null75_generator() : engine(), rand_gen() {} + __device__ bool operator()(size_t i) + { + engine.discard(i); + // roughly 75% nulls + return (rand_gen(engine) & 3) == 0; + } +}; template rand_gen(0, build_table_size); - auto build_random_null_mask = [&rand_gen](int size) { + auto build_random_null_mask = [](int size) { // roughly 75% nulls - auto validity = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [&rand_gen](auto i) { return (rand_gen.generate() & 3) == 0; }); - return cudf::test::detail::make_null_mask(validity, validity + size); + auto validity = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), null75_generator{}); + return cudf::detail::valid_if(validity, validity + size, thrust::identity{}).first; }; std::unique_ptr build_key_column = [&]() { @@ -96,17 +111,14 @@ static void BM_join(state_type& state, Join JoinFunc) selectivity, multiplicity); - auto payload_data_it = thrust::make_counting_iterator(0); - cudf::test::fixed_width_column_wrapper build_payload_column( - payload_data_it, payload_data_it + build_table_size); - - cudf::test::fixed_width_column_wrapper probe_payload_column( - payload_data_it, payload_data_it + probe_table_size); + auto init = cudf::make_fixed_width_scalar(static_cast(0)); + auto build_payload_column = cudf::sequence(build_table_size, *init); + auto probe_payload_column = cudf::sequence(probe_table_size, *init); CHECK_CUDA(0); - cudf::table_view build_table({build_key_column->view(), build_payload_column}); - cudf::table_view probe_table({probe_key_column->view(), probe_payload_column}); + cudf::table_view build_table({build_key_column->view(), *build_payload_column}); + cudf::table_view probe_table({probe_key_column->view(), *probe_payload_column}); // Setup join parameters and result table [[maybe_unused]] std::vector columns_to_join = {0}; From 4474d9ed0cb23a3774f56edac01356e2a6481335 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Wed, 16 Feb 2022 10:00:42 -0500 Subject: [PATCH 35/47] Fix documentation issues (#10306) This PR addresses some issues that were causing documentation builds to fail. Namely: 1. Changing all `Example` headers to `Examples` (`Examples` is correct, `Example` is incorrect. see: https://numpydoc.readthedocs.io/en/latest/format.html#examples) 2. Ensuring all headers have correct underline length Authors: - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/10306 --- python/cudf/cudf/core/series.py | 30 ++++++++++++------------- python/cudf/cudf/core/window/rolling.py | 4 ++-- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 3aef4447a28..90ae7274a3f 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3857,8 +3857,8 @@ def is_leap_year(self): Series Booleans indicating if dates belong to a leap year. - Example - ------- + Examples + -------- >>> import pandas as pd, cudf >>> s = cudf.Series( ... 
pd.date_range(start='2000-02-01', end='2013-02-01', freq='1Y')) @@ -3915,7 +3915,7 @@ def quarter(self): Integer indicating which quarter the date belongs to. Examples - ------- + -------- >>> import cudf >>> s = cudf.Series(["2020-05-31 08:00:00","1999-12-31 18:40:00"], ... dtype="datetime64[ms]") @@ -3991,8 +3991,8 @@ def days_in_month(self): Series Integers representing the number of days in month - Example - ------- + Examples + -------- >>> import pandas as pd, cudf >>> s = cudf.Series( ... pd.date_range(start='2000-08-01', end='2001-08-01', freq='1M')) @@ -4042,8 +4042,8 @@ def is_month_end(self): Series Booleans indicating if dates are the last day of the month. - Example - ------- + Examples + -------- >>> import pandas as pd, cudf >>> s = cudf.Series( ... pd.date_range(start='2000-08-26', end='2000-09-03', freq='1D')) @@ -4088,8 +4088,8 @@ def is_quarter_start(self): Series Booleans indicating if dates are the begining of a quarter - Example - ------- + Examples + -------- >>> import pandas as pd, cudf >>> s = cudf.Series( ... pd.date_range(start='2000-09-26', end='2000-10-03', freq='1D')) @@ -4134,8 +4134,8 @@ def is_quarter_end(self): Series Booleans indicating if dates are the end of a quarter - Example - ------- + Examples + -------- >>> import pandas as pd, cudf >>> s = cudf.Series( ... pd.date_range(start='2000-09-26', end='2000-10-03', freq='1D')) @@ -4182,8 +4182,8 @@ def is_year_start(self): Series Booleans indicating if dates are the first day of the year. - Example - ------- + Examples + -------- >>> import pandas as pd, cudf >>> s = cudf.Series(pd.date_range("2017-12-30", periods=3)) >>> dates @@ -4216,8 +4216,8 @@ def is_year_end(self): Series Booleans indicating if dates are the last day of the year. - Example - ------- + Examples + -------- >>> import pandas as pd, cudf >>> dates = cudf.Series(pd.date_range("2017-12-30", periods=3)) >>> dates diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 0f4256e49a6..8ffd75b1d76 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -302,8 +302,8 @@ def apply(self, func, *args, **kwargs): ----- See notes of the :meth:`cudf.Series.applymap` - Example - ------- + Examples + -------- >>> import cudf >>> def count_if_gt_3(window): From 26d29244b247b92108da0a30237ffb9452ca7060 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Wed, 16 Feb 2022 12:31:33 -0500 Subject: [PATCH 36/47] Fix documentation issues (#10307) The doc string for the `iloc` function has two `Examples` sections, which throws an error during doc builds. This PR removes the second extraneous section. Authors: - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/10307 --- python/cudf/cudf/core/indexed_frame.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e1ff3984948..bc7337d0a42 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -258,8 +258,6 @@ def iloc(self): Selecting rows and column by position. - Examples - -------- >>> df = cudf.DataFrame({'a': range(20), ... 'b': range(20), ... 
'c': range(20)}) From 183eec3253b1dbef6760e65f1b4424dbb4458e27 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Wed, 16 Feb 2022 16:03:05 -0500 Subject: [PATCH 37/47] Remove `TODO` in `libcudf_kafka` recipe (#10309) From my local testing, it seems that the `librdkafka` `run_exports` is now fixed, so this line can be safely removed. Authors: - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - Jordan Jacobelli (https://github.com/Ethyling) URL: https://github.com/rapidsai/cudf/pull/10309 --- conda/recipes/libcudf_kafka/meta.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index 0b274f3a41d..1eb5b13ddc3 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -27,8 +27,6 @@ requirements: host: - libcudf {{version}} - librdkafka >=1.7.0,<1.8.0a0 - run: - - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not test: commands: From 895b0070f6c353d6900ff8a298c97ce8dc759fb1 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 16 Feb 2022 15:35:33 -0800 Subject: [PATCH 38/47] Prevent internal usage of expensive APIs (#10263) This PR introduces a decorator that can be used to mark APIs that are unnecessarily expensive and should be avoided in internal usage. The decorator does nothing by default, but when the corresponding environment variable is set the decorator wraps decorated functions in a check of the current call stack. If the caller of that API is within the cudf source tree, it will raise an exception that may optionally also indicate an alternate API for developers to use instead. This implementation is completely transparent to users (including having no performance impact aside from some negligible additional import time from applying the decorator) but can be an invaluable tool to developers seeking to reduce Python overheads in cuDF Python. This decorator has been tested and shown to work (both locally and on CI) for `DataFrame.columns`, but those changes are not included in this PR in order to keep the changeset clean. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/10263 --- ci/gpu/build.sh | 8 ++--- python/cudf/cudf/tests/conftest.py | 31 +++++++++++++++++ python/cudf/cudf/utils/utils.py | 56 ++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+), 4 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 53ad948b61c..4acdc372817 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. 
############################################## # cuDF GPU build and test script for CI # ############################################## @@ -249,15 +249,15 @@ fi cd "$WORKSPACE/python/cudf" gpuci_logger "Python py.test for cuDF" -py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope +py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope cudf cd "$WORKSPACE/python/dask_cudf" gpuci_logger "Python py.test for dask-cudf" -py.test -n 8 --cache-clear --basetemp="$WORKSPACE/dask-cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-dask-cudf.xml" -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:"$WORKSPACE/python/dask_cudf/dask-cudf-coverage.xml" --cov-report term +py.test -n 8 --cache-clear --basetemp="$WORKSPACE/dask-cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-dask-cudf.xml" -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:"$WORKSPACE/python/dask_cudf/dask-cudf-coverage.xml" --cov-report term dask_cudf cd "$WORKSPACE/python/custreamz" gpuci_logger "Python py.test for cuStreamz" -py.test -n 8 --cache-clear --basetemp="$WORKSPACE/custreamz-cuda-tmp" --junitxml="$WORKSPACE/junit-custreamz.xml" -v --cov-config=.coveragerc --cov=custreamz --cov-report=xml:"$WORKSPACE/python/custreamz/custreamz-coverage.xml" --cov-report term +py.test -n 8 --cache-clear --basetemp="$WORKSPACE/custreamz-cuda-tmp" --junitxml="$WORKSPACE/junit-custreamz.xml" -v --cov-config=.coveragerc --cov=custreamz --cov-report=xml:"$WORKSPACE/python/custreamz/custreamz-coverage.xml" --cov-report term custreamz gpuci_logger "Test notebooks" "$WORKSPACE/ci/gpu/test-notebooks.sh" 2>&1 | tee nbtest.log diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 041bd055f0a..4d5b5926d6e 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -1,10 +1,41 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. + +import os import pathlib import pytest import rmm # noqa: F401 +_CURRENT_DIRECTORY = str(pathlib.Path(__file__).resolve().parent) + @pytest.fixture(scope="session") def datadir(): return pathlib.Path(__file__).parent / "data" + + +# To set and remove the NO_EXTERNAL_ONLY_APIS environment variable we must use +# the sessionstart and sessionfinish hooks rather than a simple autouse, +# session-scope fixture because we need to set these variable before collection +# occurs because the environment variable will be checked as soon as cudf is +# imported anywhere. +def pytest_sessionstart(session): + """ + Called after the Session object has been created and + before performing collection and entering the run test loop. + """ + os.environ["NO_EXTERNAL_ONLY_APIS"] = "1" + os.environ["_CUDF_TEST_ROOT"] = _CURRENT_DIRECTORY + + +def pytest_sessionfinish(session, exitstatus): + """ + Called after whole test run finished, right before + returning the exit status to the system. 
+ """ + try: + del os.environ["NO_EXTERNAL_ONLY_APIS"] + del os.environ["_CUDF_TEST_ROOT"] + except KeyError: + pass diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 8adca07b252..cf845a5d525 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -2,6 +2,8 @@ import decimal import functools +import os +import traceback from collections.abc import Sequence from typing import FrozenSet, Set, Union @@ -37,6 +39,60 @@ } +# The test root is set by pytest to support situations where tests are run from +# a source tree on a built version of cudf. +NO_EXTERNAL_ONLY_APIS = os.getenv("NO_EXTERNAL_ONLY_APIS") + +_cudf_root = os.path.dirname(cudf.__file__) +# If the environment variable for the test root is not set, we default to +# using the path relative to the cudf root directory. +_tests_root = os.getenv("_CUDF_TEST_ROOT") or os.path.join(_cudf_root, "tests") + + +def _external_only_api(func, alternative=""): + """Decorator to indicate that a function should not be used internally. + + cudf contains many APIs that exist for pandas compatibility but are + intrinsically inefficient. For some of these cudf has internal + equivalents that are much faster. Usage of the slow public APIs inside + our implementation can lead to unnecessary performance bottlenecks. + Applying this decorator to such functions and setting the environment + variable NO_EXTERNAL_ONLY_APIS will cause such functions to raise + exceptions if they are called from anywhere inside cudf, making it easy + to identify and excise such usage. + + The `alternative` should be a complete phrase or sentence since it will + be used verbatim in error messages. + """ + + # If the first arg is a string then an alternative function to use in + # place of this API was provided, so we pass that to a subsequent call. + # It would be cleaner to implement this pattern by using a class + # decorator with a factory method, but there is no way to generically + # wrap docstrings on a class (we would need the docstring to be on the + # class itself, not instances, because that's what `help` looks at) and + # there is also no way to make mypy happy with that approach. + if isinstance(func, str): + return lambda actual_func: _external_only_api(actual_func, func) + + if not NO_EXTERNAL_ONLY_APIS: + return func + + @functools.wraps(func) + def wrapper(*args, **kwargs): + # Check the immediately preceding frame to see if it's in cudf. + frame, lineno = next(traceback.walk_stack(None)) + fn = frame.f_code.co_filename + if _cudf_root in fn and _tests_root not in fn: + raise RuntimeError( + f"External-only API called in {fn} at line {lineno}. 
" + f"{alternative}" + ) + return func(*args, **kwargs) + + return wrapper + + def scalar_broadcast_to(scalar, size, dtype=None): if isinstance(size, (tuple, list)): From fe37c0ecc04d88dd02076bfc48969227789aa187 Mon Sep 17 00:00:00 2001 From: Mayank Anand <36782063+mayankanand007@users.noreply.github.com> Date: Wed, 16 Feb 2022 22:53:39 -0500 Subject: [PATCH 39/47] Add covariance for sort groupby (python) (#9889) This PR adds the functionality to perform `.cov()` on a `GroupBy` object and completes #1268 Related issue: #1268 Related PRs: #9154, #9166, #9492 Next steps: - [ ] Fix Symmetry problem [PR 10098](https://github.com/rapidsai/cudf/issues/10098#issue-1109834042): avoid computing the covariance/ correlation between the same colums twice - [ ] Consolidate both `cov()` and `corr()` - [ ] Fix https://github.com/rapidsai/cudf/issues/10303 - [ ] Add `cov `bindings in `aggregation.pyx` (separate PR): [comment](https://github.com/rapidsai/cudf/pull/9889#discussion_r808439168) - [ ] Simplify `combine_columns` after https://github.com/rapidsai/cudf/issues/10153 covers `interleave_columns`: [comment](https://github.com/rapidsai/cudf/pull/9889#discussion_r808560155) Authors: - Mayank Anand (https://github.com/mayankanand007) - Michael Wang (https://github.com/isVoid) - Sheilah Kirui (https://github.com/skirui-source) Approvers: - Bradley Dice (https://github.com/bdice) - Michael Wang (https://github.com/isVoid) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9889 --- docs/cudf/source/api_docs/groupby.rst | 1 + docs/cudf/source/basics/groupby.rst | 3 +- python/cudf/cudf/_lib/aggregation.pyx | 19 ++- python/cudf/cudf/_lib/cpp/aggregation.pxd | 6 +- python/cudf/cudf/_lib/groupby.pyx | 4 +- python/cudf/cudf/core/groupby/groupby.py | 172 ++++++++++++++++++++++ python/cudf/cudf/tests/test_dataframe.py | 118 +++++++++++++++ 7 files changed, 318 insertions(+), 5 deletions(-) diff --git a/docs/cudf/source/api_docs/groupby.rst b/docs/cudf/source/api_docs/groupby.rst index 190978a7581..141e5adba93 100644 --- a/docs/cudf/source/api_docs/groupby.rst +++ b/docs/cudf/source/api_docs/groupby.rst @@ -61,6 +61,7 @@ Computations / descriptive stats GroupBy.sum GroupBy.var GroupBy.corr + GroupBy.cov The following methods are available in both ``SeriesGroupBy`` and ``DataFrameGroupBy`` objects, but may differ slightly, usually in that diff --git a/docs/cudf/source/basics/groupby.rst b/docs/cudf/source/basics/groupby.rst index cbc8f7e712f..f74853769f6 100644 --- a/docs/cudf/source/basics/groupby.rst +++ b/docs/cudf/source/basics/groupby.rst @@ -180,7 +180,8 @@ that support them: +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ | corr | ✅ | | | | | | | ✅ | +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - + | cov | ✅ | | | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ GroupBy apply ------------- diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 68f7101b6ee..35b806909ec 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+# Copyright (c) 2020-2022, NVIDIA CORPORATION. from enum import Enum, IntEnum @@ -59,6 +59,7 @@ class AggregationKind(Enum): PTX = libcudf_aggregation.aggregation.Kind.PTX CUDA = libcudf_aggregation.aggregation.Kind.CUDA CORRELATION = libcudf_aggregation.aggregation.Kind.CORRELATION + COVARIANCE = libcudf_aggregation.aggregation.Kind.COVARIANCE class CorrelationType(IntEnum): @@ -354,6 +355,7 @@ cdef class Aggregation: )) return agg + cdef class RollingAggregation: """A Cython wrapper for rolling window aggregations. @@ -742,6 +744,21 @@ cdef class GroupbyAggregation: )) return agg + @classmethod + def cov( + cls, + libcudf_types.size_type min_periods, + libcudf_types.size_type ddof=1 + ): + cdef GroupbyAggregation agg = cls() + + agg.c_obj = move( + libcudf_aggregation. + make_covariance_aggregation[groupby_aggregation]( + min_periods, ddof + )) + return agg + cdef class GroupbyScanAggregation: """A Cython wrapper for groupby scan aggregations. diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index 3982b4fecbb..04eead4ee1b 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -40,6 +40,7 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: PTX 'cudf::aggregation::PTX' CUDA 'cudf::aggregation::CUDA' CORRELATION 'cudf::aggregation::CORRELATION' + COVARIANCE 'cudf::aggregation::COVARIANCE' Kind kind @@ -117,3 +118,6 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: cdef unique_ptr[T] make_correlation_aggregation[T]( correlation_type type, size_type min_periods) except + + + cdef unique_ptr[T] make_covariance_aggregation[T]( + size_type min_periods, size_type ddof) except + diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 314542c9549..49a924c9104 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from collections import defaultdict @@ -54,7 +54,7 @@ _CATEGORICAL_AGGS = {"COUNT", "SIZE", "NUNIQUE", "UNIQUE"} _STRING_AGGS = {"COUNT", "SIZE", "MAX", "MIN", "NUNIQUE", "NTH", "COLLECT", "UNIQUE"} _LIST_AGGS = {"COLLECT"} -_STRUCT_AGGS = {"CORRELATION"} +_STRUCT_AGGS = {"CORRELATION", "COVARIANCE"} _INTERVAL_AGGS = set() _DECIMAL_AGGS = {"COUNT", "SUM", "ARGMIN", "ARGMAX", "MIN", "MAX", "NUNIQUE", "NTH", "COLLECT"} diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 5c1c7bab245..b90f857ce84 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -12,6 +12,7 @@ import cudf from cudf._lib import groupby as libgroupby +from cudf._lib.reshape import interleave_columns from cudf._typing import DataFrameOrSeries from cudf.api.types import is_list_like from cudf.core.abc import Serializable @@ -959,6 +960,177 @@ def corr(self, method="pearson", min_periods=1): return res + def cov(self, min_periods=0, ddof=1): + """ + Compute the pairwise covariance among the columns of a DataFrame, + excluding NA/null values. + + The returned DataFrame is the covariance matrix of the columns of + the DataFrame. + + Both NA and null values are automatically excluded from the + calculation. 
See the note below about bias from missing values.
+
+        A threshold can be set for the minimum number of observations
+        for each value created. Comparisons with observations below this
+        threshold will be returned as `NA`.
+
+        This method is generally used for the analysis of time series data to
+        understand the relationship between different measures across time.
+
+        Parameters
+        ----------
+        min_periods: int, optional
+            Minimum number of observations required per pair of columns
+            to have a valid result.
+
+        ddof: int, optional
+            Delta degrees of freedom, default is 1.
+
+        Returns
+        -------
+        DataFrame
+            Covariance matrix.
+
+        Notes
+        -----
+        Returns the covariance matrix of the DataFrame's time series.
+        The covariance is normalized by N-ddof.
+
+        For DataFrames that have Series that are missing data
+        (assuming that data is missing at random) the returned covariance
+        matrix will be an unbiased estimate of the variance and covariance
+        between the member Series.
+
+        However, for many applications this estimate may not be acceptable
+        because the estimated covariance matrix is not guaranteed to be
+        positive semi-definite. This could lead to estimated correlations
+        having absolute values which are greater than one, and/or a
+        non-invertible covariance matrix. See
+        `Estimation of covariance matrices
+        <https://en.wikipedia.org/wiki/Estimation_of_covariance_matrices>`_
+        for more details.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> gdf = cudf.DataFrame({
+        ...     "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
+        ...     "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2],
+        ...     "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1],
+        ...     "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1],
+        ... })
+        >>> gdf
+          id  val1  val2  val3
+        0  a     5     4     4
+        1  a     4     5     5
+        2  a     6     6     6
+        3  b     4     1     1
+        4  b     8     2     2
+        5  b     7     9     9
+        6  c     4     8     8
+        7  c     5     5     5
+        8  c     2     1     1
+        >>> gdf.groupby("id").cov()
+                      val1       val2       val3
+        id
+        a  val1  1.000000   0.500000   0.500000
+           val2  0.500000   1.000000   1.000000
+           val3  0.500000   1.000000   1.000000
+        b  val1  4.333333   3.500000   3.500000
+           val2  3.500000  19.000000  19.000000
+           val3  3.500000  19.000000  19.000000
+        c  val1  2.333333   3.833333   3.833333
+           val2  3.833333  12.333333  12.333333
+           val3  3.833333  12.333333  12.333333
+        """
+
+        # create an expanded dataframe consisting of all combinations of the
+        # struct column-pairs used in the covariance calculation,
+        # i.e. (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2'))
+        column_names = self.grouping.values.columns.tolist()
+        num_cols = len(column_names)
+
+        column_pair_structs = {}
+        for x, y in itertools.combinations_with_replacement(column_names, 2):
+            # The number of output columns is the number of input columns
+            # squared. We directly call the struct column factory here to
+            # reduce overhead and avoid copying data. Since libcudf groupby
+            # maintains a cache of aggregation requests, reusing the same
+            # column also makes use of previously cached column means and
+            # reduces kernel costs.
+
+            # Check whether the input column names are strings; raise a
+            # warning if they are not, and cast them to strings.
+            if not (isinstance(x, str) and isinstance(y, str)):
+                warnings.warn(
+                    "DataFrame contains non-string column name(s). "
+                    "Struct columns require field names to be strings. "
+                    "Non-string column names will be cast to strings "
+                    "in the result's field names."
+                )
+                x, y = str(x), str(y)
+
+            column_pair_structs[(x, y)] = cudf.core.column.build_struct_column(
+                names=(x, y),
+                children=(self.obj._data[x], self.obj._data[y]),
+                size=len(self.obj),
+            )
+
+        column_pair_groupby = cudf.DataFrame._from_data(
+            column_pair_structs
+        ).groupby(by=self.grouping.keys)
+
+        try:
+            gb_cov = column_pair_groupby.agg(
+                lambda x: x.cov(min_periods, ddof)
+            )
+        except RuntimeError as e:
+            if "Unsupported groupby reduction type-agg combination" in str(e):
+                raise TypeError(
+                    "Covariance accepts only numerical column-pairs"
+                )
+            raise
+
+        # ensure that column-pair labels are arranged in ascending order
+        cols_list = [
+            (y, x) if i > j else (x, y)
+            for j, y in enumerate(column_names)
+            for i, x in enumerate(column_names)
+        ]
+        cols_split = [
+            cols_list[i : i + num_cols]
+            for i in range(0, len(cols_list), num_cols)
+        ]
+
+        def combine_columns(gb_cov, ys):
+            list_of_columns = [gb_cov._data[y] for y in ys]
+            frame = cudf.core.frame.Frame._from_columns(list_of_columns, ys)
+            return interleave_columns(frame)
+
+        # interleave: combine the covariance results for each column-pair
+        # into a single column
+        res = cudf.DataFrame._from_data(
+            {
+                x: combine_columns(gb_cov, ys)
+                for ys, x in zip(cols_split, column_names)
+            }
+        )
+
+        # create a multiindex for the groupby covariance dataframe,
+        # to match pandas behavior
+        unsorted_idx = gb_cov.index.repeat(num_cols)
+        idx_sort_order = unsorted_idx._get_sorted_inds()
+        sorted_idx = unsorted_idx._gather(idx_sort_order)
+        if len(gb_cov):
+            # TODO: Should the operation below be done on the CPU instead?
+            sorted_idx._data[None] = as_column(
+                np.tile(column_names, len(gb_cov.index))
+            )
+        res.index = MultiIndex._from_data(sorted_idx._data)
+
+        return res
+
     def var(self, ddof=1):
         """Compute the column-wise variance of the values in each group.
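
For reference, the pairing scheme above can be sketched standalone in plain
Python (an illustrative sketch only; `cols` stands in for `column_names`):

    import itertools

    cols = ["val1", "val2", "val3"]
    pairs = list(itertools.combinations_with_replacement(cols, 2))
    # [('val1', 'val1'), ('val1', 'val2'), ('val1', 'val3'),
    #  ('val2', 'val2'), ('val2', 'val3'), ('val3', 'val3')]
    #
    # Only n*(n+1)/2 struct columns are built and aggregated; the mirrored
    # (y, x) labels come from `cols_list`, and `interleave_columns` stitches
    # the results into an n x n block per group.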
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index fdb3f430a18..729d1a5ca37 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9063,6 +9063,124 @@ def test_dataframe_add_suffix(): assert_eq(got, expected) +@pytest.mark.parametrize( + "data, gkey", + [ + ( + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], + }, + ["id"], + ), + ( + { + "id": [0, 0, 0, 0, 1, 1, 1], + "a": [10.0, 3, 4, 2.0, -3.0, 9.0, 10.0], + "b": [10.0, 23, -4.0, 2, -3.0, 9, 19.0], + }, + ["id", "a"], + ), + ], +) +@pytest.mark.parametrize( + "min_periods", [0, 3], +) +@pytest.mark.parametrize( + "ddof", [1, 2], +) +def test_groupby_covariance(data, gkey, min_periods, ddof): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + actual = gdf.groupby(gkey).cov(min_periods=min_periods, ddof=ddof) + expected = pdf.groupby(gkey).cov(min_periods=min_periods, ddof=ddof) + + assert_eq(expected, actual) + + +def test_groupby_covariance_multiindex_dataframe(): + gdf = cudf.DataFrame( + { + "a": [1, 1, 2, 2], + "b": [1, 1, 2, 3], + "c": [2, 3, 4, 5], + "d": [6, 8, 9, 1], + } + ).set_index(["a", "b"]) + + actual = gdf.groupby(level=["a", "b"]).cov() + expected = gdf.to_pandas().groupby(level=["a", "b"]).cov() + + assert_eq(expected, actual) + + +def test_groupby_covariance_empty_columns(): + gdf = cudf.DataFrame(columns=["id", "val1", "val2"]) + pdf = gdf.to_pandas() + + actual = gdf.groupby("id").cov() + expected = pdf.groupby("id").cov() + + assert_eq( + expected, actual, check_dtype=False, check_index_type=False, + ) + + +def test_groupby_cov_invalid_column_types(): + gdf = cudf.DataFrame( + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": ["v", "n", "k", "l", "m", "i", "y", "r", "w"], + "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"], + }, + ) + with pytest.raises( + TypeError, match="Covariance accepts only numerical column-pairs", + ): + gdf.groupby("id").cov() + + +def test_groupby_cov_positive_semidefinite_matrix(): + # Refer to discussions in PR #9889 re "pair-wise deletion" strategy + # being used in pandas to compute the covariance of a dataframe with + # rows containing missing values. + # Note: cuDF currently matches pandas behavior in that the covariance + # matrices are not guaranteed PSD (positive semi definite). + # https://github.com/rapidsai/cudf/pull/9889#discussion_r794158358 + gdf = cudf.DataFrame( + [[1, 2], [None, 4], [5, None], [7, 8]], columns=["v0", "v1"] + ) + actual = gdf.groupby(by=cudf.Series([1, 1, 1, 1])).cov() + actual.reset_index(drop=True, inplace=True) + + pdf = gdf.to_pandas() + expected = pdf.groupby(by=pd.Series([1, 1, 1, 1])).cov() + expected.reset_index(drop=True, inplace=True) + + assert_eq( + expected, actual, check_dtype=False, + ) + + +@pytest.mark.xfail +def test_groupby_cov_for_pandas_bug_case(): + # Handles case: pandas bug using ddof with missing data. 
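+    # (With only two rows per group and ddof=2, no degrees of freedom remain
+    # in the N - ddof normalization, and pandas' result disagrees with
+    # cudf's here, hence the xfail.)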
+    # Filed an issue in Pandas on GH, link below:
+    # https://github.com/pandas-dev/pandas/issues/45814
+    pdf = pd.DataFrame(
+        {"id": ["a", "a"], "val1": [1.0, 2.0], "val2": [np.nan, np.nan]}
+    )
+    expected = pdf.groupby("id").cov(ddof=2)
+
+    gdf = cudf.from_pandas(pdf)
+    actual = gdf.groupby("id").cov(ddof=2)
+
+    assert_eq(expected, actual)
+
+
 @pytest.mark.parametrize(
     "data",
     [

From f5ec4b23940195812136274e8d31b11951c5b294 Mon Sep 17 00:00:00 2001
From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com>
Date: Wed, 16 Feb 2022 22:05:30 -0800
Subject: [PATCH 40/47] Implement DataFrame pct_change (#9805)

Fixes: https://github.com/rapidsai/cudf/issues/9603

Next steps:
- [ ] Fix https://github.com/rapidsai/cudf/issues/10314
- [ ] Move `diff` and `pct_change` to `indexed_frame` so that both are accessible to `Series` as well

Authors:
  - Sheilah Kirui (https://github.com/skirui-source)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Michael Wang (https://github.com/isVoid)

URL: https://github.com/rapidsai/cudf/pull/9805
---
 python/cudf/cudf/core/column/column.py   |  8 ++++
 python/cudf/cudf/core/dataframe.py       | 40 +++++++++++++++++++
 python/cudf/cudf/tests/test_dataframe.py | 49 +++++++++++++++++-----
 3 files changed, 87 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 393afe4a5b9..1c1c2ef2bf6 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -335,6 +335,14 @@ def _fill(
         return self
 
     def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase:
+        # libcudf currently doesn't handle the case when offset > len(df);
+        # the issue below tracks the fix:
+        # https://github.com/rapidsai/cudf/issues/10314
+        if abs(offset) > len(self):
+            if fill_value is None:
+                return column_empty_like(self, masked=True)
+            else:
+                return full(len(self), fill_value, dtype=self.dtype)
         return libcudf.copying.shift(self, offset, fill_value)
 
     @property
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index efff67b686a..a4de6db9bda 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -6196,6 +6196,46 @@ def explode(self, column, ignore_index=False):
 
         return super()._explode(column, ignore_index)
 
+    def pct_change(
+        self, periods=1, fill_method="ffill", limit=None, freq=None
+    ):
+        """
+        Calculates the percent change between sequential elements
+        in the DataFrame.
+
+        Parameters
+        ----------
+        periods : int, default 1
+            Periods to shift for forming percent change.
+        fill_method : str, default 'ffill'
+            How to handle NAs before computing percent changes.
+        limit : int, optional
+            The number of consecutive NAs to fill before stopping.
+            Not yet implemented.
+        freq : str, optional
+            Increment to use from time series API.
+            Not yet implemented.
+
+        Returns
+        -------
+        DataFrame
+        """
+        if limit is not None:
+            raise NotImplementedError("limit parameter not supported yet.")
+        if freq is not None:
+            raise NotImplementedError("freq parameter not supported yet.")
+        if fill_method not in {"ffill", "pad", "bfill", "backfill"}:
+            raise ValueError(
+                "fill_method must be one of 'ffill', 'pad', "
+                "'bfill', or 'backfill'."
+            )
+
+        data = self.fillna(method=fill_method, limit=limit)
+
+        return data.diff(periods=periods) / data.shift(
+            periods=periods, freq=freq
+        )
+
     def __dataframe__(
         self, nan_as_null: bool = False, allow_copy: bool = True
     ):
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 729d1a5ca37..f765c614907 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -3442,29 +3442,37 @@ def test_get_numeric_data():
 
 
 @pytest.mark.parametrize("dtype", NUMERIC_TYPES)
-@pytest.mark.parametrize("period", [-1, -5, -10, -20, 0, 1, 5, 10, 20])
+@pytest.mark.parametrize("period", [-15, -1, 0, 1, 15])
 @pytest.mark.parametrize("data_empty", [False, True])
 def test_shift(dtype, period, data_empty):
-
+    # TODO: this function currently tests for series.shift()
+    # but should instead test for dataframe.shift()
    if data_empty:
         data = None
     else:
         if dtype == np.int8:
             # to keep data in range
-            data = gen_rand(dtype, 100000, low=-2, high=2)
+            data = gen_rand(dtype, 10, low=-2, high=2)
         else:
-            data = gen_rand(dtype, 100000)
+            data = gen_rand(dtype, 10)
 
-    gdf = cudf.DataFrame({"a": cudf.Series(data, dtype=dtype)})
-    pdf = pd.DataFrame({"a": pd.Series(data, dtype=dtype)})
+    gs = cudf.DataFrame({"a": cudf.Series(data, dtype=dtype)})
+    ps = pd.DataFrame({"a": pd.Series(data, dtype=dtype)})
 
-    shifted_outcome = gdf.a.shift(period).fillna(0)
-    expected_outcome = pdf.a.shift(period).fillna(0).astype(dtype)
+    shifted_outcome = gs.a.shift(period)
+    expected_outcome = ps.a.shift(period)
 
+    # pandas uses NaNs to signal missing values and force-converts the
+    # result columns to float types
     if data_empty:
-        assert_eq(shifted_outcome, expected_outcome, check_index_type=False)
+        assert_eq(
+            shifted_outcome,
+            expected_outcome,
+            check_index_type=False,
+            check_dtype=False,
+        )
     else:
-        assert_eq(shifted_outcome, expected_outcome)
+        assert_eq(shifted_outcome, expected_outcome, check_dtype=False)
 
 
 @pytest.mark.parametrize("dtype", NUMERIC_TYPES)
@@ -9295,3 +9303,24 @@ def test_dataframe_rename_duplicate_column():
         ValueError, match="Duplicate column names are not allowed"
     ):
         gdf.rename(columns={"a": "b"}, inplace=True)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        np.random.RandomState(seed=10).randint(-50, 50, (10, 10)),
+        np.random.RandomState(seed=10).random_sample((4, 4)),
+        np.array([1.123, 2.343, 5.890, 0.0]),
+        {"a": [1.123, 2.343, np.nan, np.nan], "b": [None, 3, 9.08, None]},
+    ],
+)
+@pytest.mark.parametrize("periods", [-5, -2, 0, 2, 5])
+@pytest.mark.parametrize("fill_method", ["ffill", "bfill", "pad", "backfill"])
+def test_dataframe_pct_change(data, periods, fill_method):
+    gdf = cudf.DataFrame(data)
+    pdf = gdf.to_pandas()
+
+    actual = gdf.pct_change(periods=periods, fill_method=fill_method)
+    expected = pdf.pct_change(periods=periods, fill_method=fill_method)
+
+    assert_eq(expected, actual)

From 4e986fd253fcdcc995f320b234986d919b105de3 Mon Sep 17 00:00:00 2001
From: AJ Schmidt
Date: Thu, 17 Feb 2022 07:34:20 -0500
Subject: [PATCH 41/47] Remove extraneous `build.sh` parameter (#10313)

The `-l` flag was introduced in #3504, but I don't see it being used
anywhere. Seems safe to remove unless anyone else thinks otherwise.
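A quick way to double-check such removals (a hypothetical spot check run
from the repo root; it assumes build.sh parses its flags via the `hasArg`
helper):

    git grep -n -e "hasArg -l" -- build.sh
    git grep -n -e " -l " -- ci/

Neither search should return a hit once this change is applied.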
Authors: - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - Jordan Jacobelli (https://github.com/Ethyling) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/10313 --- build.sh | 3 +-- ci/benchmark/build.sh | 8 ++------ ci/gpu/build.sh | 12 ++---------- 3 files changed, 5 insertions(+), 18 deletions(-) diff --git a/build.sh b/build.sh index 8b3add1dddd..765a1b5325f 100755 --- a/build.sh +++ b/build.sh @@ -18,7 +18,7 @@ ARGS=$* REPODIR=$(cd $(dirname $0); pwd) VALIDARGS="clean libcudf cudf dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n -l --allgpuarch --disable_nvtx --show_depr_warn --ptds -h --build_metrics --incl_cache_stats" -HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [-l] [--cmake-args=\\\"\\\"] +HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"\\\"] clean - remove all existing build artifacts and configuration (start over) libcudf - build the cudf C++ code only @@ -32,7 +32,6 @@ HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafk -v - verbose build mode -g - build for debug -n - no install step - -l - build legacy tests --allgpuarch - build for all supported GPU architectures --disable_nvtx - disable inserting NVTX profiling ranges --show_depr_warn - show cmake deprecation warnings diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index 178bdab0154..62eeb4d131b 100755 --- a/ci/benchmark/build.sh +++ b/ci/benchmark/build.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. ######################################### # cuDF GPU build and test script for CI # ######################################### @@ -98,11 +98,7 @@ conda list --show-channel-urls ################################################################################ logger "Build libcudf..." 
-if [[ "${BUILD_MODE}" == "pull-request" ]]; then - "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf benchmarks tests --ptds -else - "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf benchmarks tests -l --ptds -fi +"$WORKSPACE/build.sh" clean libcudf cudf dask_cudf benchmarks tests --ptds ################################################################################ # BENCHMARK - Run and parse libcudf and cuDF benchmarks diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 4acdc372817..a79ffa0fc47 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -128,11 +128,7 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then ################################################################################ gpuci_logger "Build from source" - if [[ "${BUILD_MODE}" == "pull-request" ]]; then - "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests --ptds - else - "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests -l --ptds - fi + "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests --ptds ################################################################################ # TEST - Run GoogleTest @@ -226,11 +222,7 @@ else install_dask gpuci_logger "Build python libs from source" - if [[ "${BUILD_MODE}" == "pull-request" ]]; then - "$WORKSPACE/build.sh" cudf dask_cudf cudf_kafka --ptds - else - "$WORKSPACE/build.sh" cudf dask_cudf cudf_kafka -l --ptds - fi + "$WORKSPACE/build.sh" cudf dask_cudf cudf_kafka --ptds fi From b3342a83cd8bc53504ed6d237912750c7c8a1f15 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 17 Feb 2022 20:16:35 +0530 Subject: [PATCH 42/47] move input generation for copy benchmark to device (#10279) Use device functions to move input generation to device in copy benchmark. Splitting PR https://github.com/rapidsai/cudf/pull/10109 for review Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/10279 --- cpp/benchmarks/CMakeLists.txt | 2 +- cpp/benchmarks/string/{copy.cpp => copy.cu} | 23 +++++++++++---------- 2 files changed, 13 insertions(+), 12 deletions(-) rename cpp/benchmarks/string/{copy.cpp => copy.cu} (83%) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 2e5c6ec8ca8..11eef015364 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -260,7 +260,7 @@ ConfigureBench( string/convert_durations.cpp string/convert_fixed_point.cpp string/convert_numerics.cpp - string/copy.cpp + string/copy.cu string/extract.cpp string/factory.cu string/filter.cpp diff --git a/cpp/benchmarks/string/copy.cpp b/cpp/benchmarks/string/copy.cu similarity index 83% rename from cpp/benchmarks/string/copy.cpp rename to cpp/benchmarks/string/copy.cu index d40b0e069bc..2f064e71c44 100644 --- a/cpp/benchmarks/string/copy.cpp +++ b/cpp/benchmarks/string/copy.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,8 @@ * limitations under the License. 
*/ -#include +#include "string_bench_args.hpp" + #include #include #include @@ -23,10 +24,7 @@ #include #include -#include -#include - -#include "string_bench_args.hpp" +#include class StringCopy : public cudf::benchmark { }; @@ -47,11 +45,14 @@ static void BM_copy(benchmark::State& state, copy_type ct) create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); // scatter indices - std::vector host_map_data(n_rows); - std::iota(host_map_data.begin(), host_map_data.end(), 0); - std::random_shuffle(host_map_data.begin(), host_map_data.end()); - cudf::test::fixed_width_column_wrapper index_map(host_map_data.begin(), - host_map_data.end()); + auto index_map_col = make_numeric_column( + cudf::data_type{cudf::type_id::INT32}, n_rows, cudf::mask_state::UNALLOCATED); + auto index_map = index_map_col->mutable_view(); + thrust::shuffle_copy(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(n_rows), + index_map.begin(), + thrust::default_random_engine()); for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); From 28813d74ba9bf4bb5750eb7b368139be418e9ece Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Thu, 17 Feb 2022 10:55:22 -0500 Subject: [PATCH 43/47] Update upload script (#10321) This PR updates the `ci/cpu/upload.sh` script to move the `conda build ... --output` commands inside their respective conditional statements. The `conda build ... --output` commands can take some time to run, so this change should make our nightly/release CI slightly faster since it will no longer run unnecessary `conda build ... --output` commands. (`[skip ci]` was added since these changes are not tested in PRs) Authors: - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - Jordan Jacobelli (https://github.com/Ethyling) URL: https://github.com/rapidsai/cudf/pull/10321 --- ci/cpu/upload.sh | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh index e6ef72d930c..f2f67e9e000 100755 --- a/ci/cpu/upload.sh +++ b/ci/cpu/upload.sh @@ -23,25 +23,13 @@ if [ -z "$MY_UPLOAD_KEY" ]; then return 0 fi -################################################################################ -# SETUP - Get conda file output locations -################################################################################ - -gpuci_logger "Get conda file output locations" - -export LIBCUDF_FILE=`conda build --no-build-id --croot "$WORKSPACE/.conda-bld" conda/recipes/libcudf --output` -export LIBCUDF_KAFKA_FILE=`conda build --no-build-id --croot "$WORKSPACE/.conda-bld" conda/recipes/libcudf_kafka --output` -export CUDF_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/cudf --python=$PYTHON --output` -export DASK_CUDF_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/dask-cudf --python=$PYTHON --output` -export CUDF_KAFKA_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/cudf_kafka --python=$PYTHON --output` -export CUSTREAMZ_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/custreamz --python=$PYTHON --output` - ################################################################################ # UPLOAD - Conda packages ################################################################################ gpuci_logger "Starting conda uploads" if [[ "$BUILD_LIBCUDF" == "1" && "$UPLOAD_LIBCUDF" == "1" ]]; then + export LIBCUDF_FILE=$(conda build --no-build-id --croot "${CONDA_BLD_DIR}" conda/recipes/libcudf --output) test -e ${LIBCUDF_FILE} echo "Upload 
libcudf" echo ${LIBCUDF_FILE} @@ -49,16 +37,19 @@ if [[ "$BUILD_LIBCUDF" == "1" && "$UPLOAD_LIBCUDF" == "1" ]]; then fi if [[ "$BUILD_CUDF" == "1" && "$UPLOAD_CUDF" == "1" ]]; then + export CUDF_FILE=$(conda build --croot "${CONDA_BLD_DIR}" conda/recipes/cudf --python=$PYTHON --output) test -e ${CUDF_FILE} echo "Upload cudf" echo ${CUDF_FILE} gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${CUDF_FILE} --no-progress + export DASK_CUDF_FILE=$(conda build --croot "${CONDA_BLD_DIR}" conda/recipes/dask-cudf --python=$PYTHON --output) test -e ${DASK_CUDF_FILE} echo "Upload dask-cudf" echo ${DASK_CUDF_FILE} gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${DASK_CUDF_FILE} --no-progress + export CUSTREAMZ_FILE=$(conda build --croot "${CONDA_BLD_DIR}" conda/recipes/custreamz --python=$PYTHON --output) test -e ${CUSTREAMZ_FILE} echo "Upload custreamz" echo ${CUSTREAMZ_FILE} @@ -66,6 +57,7 @@ if [[ "$BUILD_CUDF" == "1" && "$UPLOAD_CUDF" == "1" ]]; then fi if [[ "$BUILD_LIBCUDF" == "1" && "$UPLOAD_LIBCUDF_KAFKA" == "1" ]]; then + export LIBCUDF_KAFKA_FILE=$(conda build --no-build-id --croot "${CONDA_BLD_DIR}" conda/recipes/libcudf_kafka --output) test -e ${LIBCUDF_KAFKA_FILE} echo "Upload libcudf_kafka" echo ${LIBCUDF_KAFKA_FILE} @@ -73,6 +65,7 @@ if [[ "$BUILD_LIBCUDF" == "1" && "$UPLOAD_LIBCUDF_KAFKA" == "1" ]]; then fi if [[ "$BUILD_CUDF" == "1" && "$UPLOAD_CUDF_KAFKA" == "1" ]]; then + export CUDF_KAFKA_FILE=$(conda build --croot "${CONDA_BLD_DIR}" conda/recipes/cudf_kafka --python=$PYTHON --output) test -e ${CUDF_KAFKA_FILE} echo "Upload cudf_kafka" echo ${CUDF_KAFKA_FILE} From fdad59783d0cf593d29661e136ed76bdaec753cf Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 17 Feb 2022 15:09:22 -0500 Subject: [PATCH 44/47] Add const qualifier to MurmurHash3_32::hash_combine (#10311) Fixes declaration of the internal `MurmurHash3_32::hash_combine()` to add the `const` qualifier. Found this while working on #10270 and trying to call `hash_combine` from a `const` instance. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Bradley Dice (https://github.com/bdice) - Conor Hoekstra (https://github.com/codereport) URL: https://github.com/rapidsai/cudf/pull/10311 --- cpp/include/cudf/detail/utilities/hash_functions.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 51d58383de4..568fd4afd35 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, NVIDIA CORPORATION. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -130,7 +130,7 @@ struct MurmurHash3_32 {
    *
    * @returns A hash value that intelligently combines the lhs and rhs hash values
    */
-  [[nodiscard]] __device__ inline result_type hash_combine(result_type lhs, result_type rhs)
+  constexpr result_type hash_combine(result_type lhs, result_type rhs) const
   {
     result_type combined{lhs};
 

From d48dd6f96b68b52b5bbda4eea6300087e1c3e3f0 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Thu, 17 Feb 2022 12:32:22 -0800
Subject: [PATCH 45/47] Add conversions between column_view and device_span. (#10302)

This PR adds an implicit conversion operator from `column_view` to
`device_span`. The immediate purpose of this PR is to make it possible to
use the API `segmented_reduce(column_view data, device_span offsets, ...)`
in PR #9621. This PR also resolves #9656 by adding a `column_view`
constructor from `device_span`.

More broadly, this PR should make it easier to refactor instances where
`column.data()` is used with counting iterators to build transform
iterators, or other patterns that require a length (e.g. vector factories
to copy to host).

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Jake Hemstad (https://github.com/jrhemstad)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/10302
---
 cpp/include/cudf/column/column_view.hpp      | 44 ++++++++++-
 cpp/include/cudf/utilities/span.hpp          |  2 +-
 cpp/tests/CMakeLists.txt                     |  9 ++-
 .../column/column_view_device_span_test.cpp  | 75 +++++++++++++++++++
 4 files changed, 126 insertions(+), 4 deletions(-)
 create mode 100644 cpp/tests/column/column_view_device_span_test.cpp

diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp
index 325f023f283..ba15e37f9ea 100644
--- a/cpp/include/cudf/column/column_view.hpp
+++ b/cpp/include/cudf/column/column_view.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,8 +16,13 @@
 #pragma once
 
 #include
+#include
+#include
 #include
+#include
+#include
+#include
 #include
 
 /**
@@ -375,6 +380,43 @@ class column_view : public detail::column_view_base {
    */
   auto child_end() const noexcept { return _children.cend(); }
 
+  /**
+   * @brief Construct a column view from a device_span.
+   *
+   * Only numeric and chrono types are supported.
+   *
+   * @tparam T The device span type. Must be const and match the column view's type.
+   * @param data A typed device span containing the column view's data.
+   */
+  template <typename T, CUDF_ENABLE_IF(cudf::is_numeric<T>() or cudf::is_chrono<T>())>
+  column_view(device_span<T const> data)
+    : column_view(
+        cudf::data_type{cudf::type_to_id<T>()}, data.size(), data.data(), nullptr, 0, 0, {})
+  {
+    CUDF_EXPECTS(data.size() < std::numeric_limits<cudf::size_type>::max(),
+                 "Data exceeds the maximum size of a column view.");
+  }
+
+  /**
+   * @brief Converts a column view into a device span.
+   *
+   * Only numeric and chrono data types are supported. The column view must not
+   * be nullable.
+   *
+   * @tparam T The device span type. Must be const and match the column view's type.
+   * @throws cudf::logic_error if the column view type does not match the span type.
+   * @throws cudf::logic_error if the column view is nullable.
+   * @return A typed device span of the column view's data.
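+   *
+   * @note Illustrative usage only (assumes a non-nullable INT32 column):
+   * @code
+   * cudf::column_view cv = ...;  // INT32 column with no null mask
+   * cudf::device_span<int32_t const> span = cv;
+   * @endcode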
+   */
+  template <typename T, CUDF_ENABLE_IF(cudf::is_numeric<T>() or cudf::is_chrono<T>())>
+  [[nodiscard]] operator device_span<T const>() const
+  {
+    CUDF_EXPECTS(type() == cudf::data_type{cudf::type_to_id<T>()},
+                 "Device span type must match column view type.");
+    CUDF_EXPECTS(!nullable(), "A nullable column view cannot be converted to a device span.");
+    return device_span<T const>(data<T>(), size());
+  }
+
 private:
   friend column_view bit_cast(column_view const& input, data_type type);
 
diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp
index 0ac41b2c4a1..bfaf4694693 100644
--- a/cpp/include/cudf/utilities/span.hpp
+++ b/cpp/include/cudf/utilities/span.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index d9b4ed01605..f96edd3ce5a 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -48,8 +48,13 @@ endfunction()
 # ##################################################################################################
 # * column tests ----------------------------------------------------------------------------------
 ConfigureTest(
-  COLUMN_TEST column/bit_cast_test.cpp column/column_view_shallow_test.cpp column/column_test.cu
-  column/column_device_view_test.cu column/compound_test.cu
+  COLUMN_TEST
+  column/bit_cast_test.cpp
+  column/column_device_view_test.cu
+  column/column_test.cu
+  column/column_view_device_span_test.cpp
+  column/column_view_shallow_test.cpp
+  column/compound_test.cu
 )
 
 # ##################################################################################################
diff --git a/cpp/tests/column/column_view_device_span_test.cpp b/cpp/tests/column/column_view_device_span_test.cpp
new file mode 100644
index 00000000000..2b7ea3b3650
--- /dev/null
+++ b/cpp/tests/column/column_view_device_span_test.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +template () or cudf::is_chrono())> +std::unique_ptr example_column() +{ + auto begin = thrust::make_counting_iterator(1); + auto end = thrust::make_counting_iterator(16); + return cudf::test::fixed_width_column_wrapper(begin, end).release(); +} + +template +struct ColumnViewDeviceSpanTests : public cudf::test::BaseFixture { +}; + +using DeviceSpanTypes = cudf::test::FixedWidthTypesWithoutFixedPoint; +TYPED_TEST_SUITE(ColumnViewDeviceSpanTests, DeviceSpanTypes); + +TYPED_TEST(ColumnViewDeviceSpanTests, conversion_round_trip) +{ + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + + // Test implicit conversion, round trip + cudf::device_span device_span_from_col_view = col_view; + cudf::column_view col_view_from_device_span = device_span_from_col_view; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(col_view, col_view_from_device_span); +} + +struct ColumnViewDeviceSpanErrorTests : public cudf::test::BaseFixture { +}; + +TEST_F(ColumnViewDeviceSpanErrorTests, type_mismatch) +{ + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + EXPECT_THROW((void)cudf::device_span{col_view}, cudf::logic_error); +} + +TEST_F(ColumnViewDeviceSpanErrorTests, nullable_column) +{ + auto col = example_column(); + col->set_null_mask(cudf::create_null_mask(col->size(), cudf::mask_state::ALL_NULL), col->size()); + auto col_view = cudf::column_view{*col}; + EXPECT_THROW((void)cudf::device_span{col_view}, cudf::logic_error); +} From b28bad61fd0d335b939ba31b17292ddd6f15ff95 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 17 Feb 2022 14:50:40 -0800 Subject: [PATCH 46/47] Hide warnings from pandas in test_array_ufunc.py. (#10324) This PR hides pandas warnings in `test_array_ufunc.py`. (I am working through one test file at a time so we can enable `-Werr` in the future.) These warnings come from specific functions like `arccos` for which the provided integer inputs are out-of-range. It is fine for us to hide these warnings because they do not come from cudf. Alternatives and why I chose against them: - Why not limit the test data input to be in the domain of the ufunc? Limiting data inputs would hide the warning. However, it would decrease our coverage that ensures cudf and pandas agree on those results for values outside the domain of the ufunc (particularly trigonometric functions). It would additionally require inputs to be generated based on the ufunc, which would be lengthy. - Why not make cudf raise a warning and ensure that pandas/cudf agree on warning behavior? I chose against this because we generally avoid data introspection in cudf, and this would involve a significantly more complex approach to ufunc execution. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10324 --- python/cudf/cudf/tests/test_array_ufunc.py | 39 ++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index f1aad1af9e6..e4b4d5020ea 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -1,6 +1,8 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. 
import operator +import warnings +from contextlib import contextmanager from functools import reduce import cupy as cp @@ -17,6 +19,37 @@ ] +@contextmanager +def _hide_ufunc_warnings(ufunc): + # pandas raises warnings for some inputs to the following ufuncs: + name = ufunc.__name__ + if name in { + "arccos", + "arccosh", + "arcsin", + "arctanh", + "fmod", + "log", + "log10", + "log2", + "reciprocal", + }: + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + f"invalid value encountered in {name}", + category=RuntimeWarning, + ) + warnings.filterwarnings( + "ignore", + f"divide by zero encountered in {name}", + category=RuntimeWarning, + ) + yield + else: + yield + + @pytest.mark.parametrize("ufunc", _UFUNCS) @pytest.mark.parametrize("has_nulls", [True, False]) @pytest.mark.parametrize("indexed", [True, False]) @@ -76,7 +109,8 @@ def test_ufunc_series(ufunc, has_nulls, indexed): pytest.xfail(reason="Operation not supported by cupy") raise - expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) + with _hide_ufunc_warnings(ufunc): + expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) try: if ufunc.nout > 1: @@ -256,7 +290,8 @@ def test_ufunc_dataframe(ufunc, has_nulls, indexed): pytest.xfail(reason="Operation not supported by cupy") raise - expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) + with _hide_ufunc_warnings(ufunc): + expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) try: if ufunc.nout > 1: From ec614acea6915cfff1d6e9425564ce3390e3e7e0 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 18 Feb 2022 09:56:55 -0500 Subject: [PATCH 47/47] Move hash type declarations to hashing.hpp (#10320) Move the `hash_id` and `DEFAULT_HASH_SEED` declarations from `types.hpp` to `hashing.hpp`. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - https://github.com/brandon-b-miller - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10320 --- .../cudf/detail/utilities/hash_functions.cuh | 1 + cpp/include/cudf/hashing.hpp | 19 +++++++++++++++++-- cpp/include/cudf/partitioning.hpp | 4 ++-- cpp/include/cudf/types.hpp | 18 +----------------- python/cudf/cudf/_lib/cpp/hash.pxd | 13 ++++++++++--- python/cudf/cudf/_lib/cpp/types.pxd | 7 ------- python/cudf/cudf/_lib/hash.pyx | 10 +++++----- 7 files changed, 36 insertions(+), 36 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 568fd4afd35..4ec5f134bb0 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index cce05042917..e4e94074fb8 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,7 +17,6 @@ #include #include -#include namespace cudf { /** @@ -26,6 +25,22 @@ namespace cudf { * @file */ +/** + * @brief Identifies the hash function to be used + */ +enum class hash_id { + HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed + HASH_MURMUR3, ///< Murmur3 hash function + HASH_MD5, ///< MD5 hash function + HASH_SERIAL_MURMUR3, ///< Serial Murmur3 hash function + HASH_SPARK_MURMUR3 ///< Spark Murmur3 hash function +}; + +/** + * @brief The default seed value for hash functions + */ +static constexpr uint32_t DEFAULT_HASH_SEED = 0; + /** * @brief Computes the hash value of each row in the input set of columns. * diff --git a/cpp/include/cudf/partitioning.hpp b/cpp/include/cudf/partitioning.hpp index 6b1ad7db08b..3ffd9a87d39 100644 --- a/cpp/include/cudf/partitioning.hpp +++ b/cpp/include/cudf/partitioning.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once -#include +#include #include diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 6222b2e680e..76e2589a5a9 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -326,21 +326,5 @@ inline bool operator!=(data_type const& lhs, data_type const& rhs) { return !(lh */ std::size_t size_of(data_type t); -/** - * @brief Identifies the hash function to be used - */ -enum class hash_id { - HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed - HASH_MURMUR3, ///< Murmur3 hash function - HASH_MD5, ///< MD5 hash function - HASH_SERIAL_MURMUR3, ///< Serial Murmur3 hash function - HASH_SPARK_MURMUR3 ///< Spark Murmur3 hash function -}; - -/** - * @brief The default seed value for hash functions - */ -static constexpr uint32_t DEFAULT_HASH_SEED = 0; - /** @} */ } // namespace cudf diff --git a/python/cudf/cudf/_lib/cpp/hash.pxd b/python/cudf/cudf/_lib/cpp/hash.pxd index fd9992152a6..41d10b7b6da 100644 --- a/python/cudf/cudf/_lib/cpp/hash.pxd +++ b/python/cudf/cudf/_lib/cpp/hash.pxd @@ -1,18 +1,25 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from libc.stdint cimport uint32_t from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector -cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view cdef extern from "cudf/hashing.hpp" namespace "cudf" nogil: + + ctypedef enum hash_id "cudf::hash_id": + HASH_IDENTITY "cudf::hash_id::HASH_IDENTITY" + HASH_MURMUR3 "cudf::hash_id::HASH_MURMUR3" + HASH_MD5 "cudf::hash_id::HASH_MD5" + HASH_SERIAL_MURMUR3 "cudf::hash_id::HASH_SERIAL_MURMUR3" + HASH_SPARK_MURMUR3 "cudf::hash_id::HASH_SPARK_MURMUR3" + cdef unique_ptr[column] hash "cudf::hash" ( const table_view& input, - const libcudf_types.hash_id hash_function, + const hash_id hash_function, const uint32_t seed ) except + diff --git a/python/cudf/cudf/_lib/cpp/types.pxd b/python/cudf/cudf/_lib/cpp/types.pxd index 23727a20ec2..b1a257feedf 100644 --- a/python/cudf/cudf/_lib/cpp/types.pxd +++ b/python/cudf/cudf/_lib/cpp/types.pxd @@ -81,13 +81,6 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: DECIMAL64 "cudf::type_id::DECIMAL64" DECIMAL128 "cudf::type_id::DECIMAL128" - ctypedef enum hash_id "cudf::hash_id": - HASH_IDENTITY "cudf::hash_id::HASH_IDENTITY" - HASH_MURMUR3 "cudf::hash_id::HASH_MURMUR3" - HASH_MD5 "cudf::hash_id::HASH_MD5" - HASH_SERIAL_MURMUR3 "cudf::hash_id::HASH_SERIAL_MURMUR3" - HASH_SPARK_MURMUR3 "cudf::hash_id::HASH_SPARK_MURMUR3" - cdef cppclass data_type: data_type() except + data_type(const data_type&) except + diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index adc48159aac..301f571f5fb 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libc.stdint cimport uint32_t from libcpp cimport bool @@ -10,7 +10,7 @@ from libcpp.vector cimport vector cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.hash cimport hash as cpp_hash +from cudf._lib.cpp.hash cimport hash as cpp_hash, hash_id as cpp_hash_id from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -58,11 +58,11 @@ def hash(source_table, str method, int seed=0): cdef table_view c_source_view = table_view_from_table( source_table, ignore_index=True) cdef unique_ptr[column] c_result - cdef libcudf_types.hash_id c_hash_function + cdef cpp_hash_id c_hash_function if method == "murmur3": - c_hash_function = libcudf_types.hash_id.HASH_MURMUR3 + c_hash_function = cpp_hash_id.HASH_MURMUR3 elif method == "md5": - c_hash_function = libcudf_types.hash_id.HASH_MD5 + c_hash_function = cpp_hash_id.HASH_MD5 else: raise ValueError(f"Unsupported hash function: {method}") with nogil: