From 9a60671038641b917fbd0f7b3400eb877cb7f9e7 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 17 Mar 2022 19:50:17 -0400 Subject: [PATCH] Fix cudf::shift to handle offset greater than column size (#10414) Closes #10314 Fixes logic to handle `abs(offset) > input.size()` when passed to `cudf::shift`. As mentioned in #10314 this was causing an unexpected exception: ``` C++ exception with description "parallel_for failed: cudaErrorInvalidConfiguration: invalid configuration argument" thrown in the test body. ``` The behavior now fills the entire output column with the input scalar value. If the scalar is null, then the column is filled with null entries. The logic added here did not require changing or adding any new kernel functions. Additional gtests were added to `shift_tests.cpp` as well. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - MithunR (https://github.com/mythrocks) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/10414 --- cpp/include/cudf/copying.hpp | 4 +- cpp/src/copying/shift.cu | 5 +- cpp/src/strings/copying/shift.cu | 5 +- cpp/tests/copying/shift_tests.cpp | 78 ++++++++++++++------------ python/cudf/cudf/core/column/column.py | 8 --- 5 files changed, 52 insertions(+), 48 deletions(-) diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index 850a11426af..2e559afef4f 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -371,7 +371,7 @@ std::unique_ptr copy_range( * @param fill_value Fill value for indeterminable outputs. * @param mr Device memory resource used to allocate the returned result's device memory * - * @throw cudf::logic_error if @p input dtype is not fixed-with. + * @throw cudf::logic_error if @p input dtype is neither fixed-width nor string type * @throw cudf::logic_error if @p fill_value dtype does not match @p input dtype. */ std::unique_ptr shift( diff --git a/cpp/src/copying/shift.cu b/cpp/src/copying/shift.cu index dacc1d07447..38fb16f66f4 100644 --- a/cpp/src/copying/shift.cu +++ b/cpp/src/copying/shift.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -70,7 +70,7 @@ struct shift_functor { std::unique_ptr> operator()(Args&&...) { - CUDF_FAIL("shift does not support non-fixed-width types."); + CUDF_FAIL("shift only supports fixed-width or string types."); } template @@ -125,6 +125,7 @@ struct shift_functor { // avoid assigning elements we know to be invalid. if (not scalar_is_valid) { + if (std::abs(offset) > size) { return output; } if (offset > 0) { index_begin = thrust::make_counting_iterator(offset); data = data + offset; diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu index 024c8d2924d..bdcf01bd336 100644 --- a/cpp/src/strings/copying/shift.cu +++ b/cpp/src/strings/copying/shift.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -94,6 +94,9 @@ std::unique_ptr shift(strings_column_view const& input, { auto d_fill_str = static_cast(fill_value).value(stream); + // adjust offset when greater than the size of the input + if (std::abs(offset) > input.size()) { offset = input.size(); } + // output offsets column is the same size as the input auto const input_offsets = cudf::detail::slice( diff --git a/cpp/tests/copying/shift_tests.cpp b/cpp/tests/copying/shift_tests.cpp index 256f9129cbf..47615a584a4 100644 --- a/cpp/tests/copying/shift_tests.cpp +++ b/cpp/tests/copying/shift_tests.cpp @@ -20,16 +20,14 @@ #include #include -#include #include #include +#include #include -#include #include #include -#include using cudf::test::fixed_width_column_wrapper; using TestTypes = cudf::test::Types; @@ -72,28 +70,12 @@ constexpr auto lowest() } template -struct ShiftTest : public cudf::test::BaseFixture { +struct ShiftTestsTyped : public cudf::test::BaseFixture { }; -TYPED_TEST_SUITE(ShiftTest, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(ShiftTestsTyped, cudf::test::FixedWidthTypes); -TYPED_TEST(ShiftTest, OneColumnEmpty) -{ - using T = TypeParam; - - std::vector vals{}; - std::vector mask{}; - - auto input = fixed_width_column_wrapper{}; - auto expected = fixed_width_column_wrapper(vals.begin(), vals.end(), mask.begin()); - - auto fill = make_scalar(); - auto actual = cudf::shift(input, 5, *fill); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *actual); -} - -TYPED_TEST(ShiftTest, TwoColumnsEmpty) +TYPED_TEST(ShiftTestsTyped, ColumnEmpty) { using T = TypeParam; @@ -109,7 +91,7 @@ TYPED_TEST(ShiftTest, TwoColumnsEmpty) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *actual); } -TYPED_TEST(ShiftTest, OneColumn) +TYPED_TEST(ShiftTestsTyped, NonNullColumn) { using T = TypeParam; @@ -134,7 +116,7 @@ TYPED_TEST(ShiftTest, OneColumn) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *actual); } -TYPED_TEST(ShiftTest, OneColumnNegativeShift) +TYPED_TEST(ShiftTestsTyped, NegativeShift) { using T = TypeParam; @@ -159,7 +141,7 @@ TYPED_TEST(ShiftTest, OneColumnNegativeShift) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *actual); } -TYPED_TEST(ShiftTest, OneColumnNullFill) +TYPED_TEST(ShiftTestsTyped, NullScalar) { using T = TypeParam; @@ -186,7 +168,7 @@ TYPED_TEST(ShiftTest, OneColumnNullFill) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *actual); } -TYPED_TEST(ShiftTest, TwoColumnsNullableInput) +TYPED_TEST(ShiftTestsTyped, NullableColumn) { using T = TypeParam; @@ -199,25 +181,21 @@ TYPED_TEST(ShiftTest, TwoColumnsNullableInput) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *actual); } -TYPED_TEST(ShiftTest, MismatchFillValueDtypes) +TYPED_TEST(ShiftTestsTyped, MismatchFillValueDtypes) { using T = TypeParam; - if (std::is_same_v) { return; } - auto input = fixed_width_column_wrapper{}; - auto fill = make_scalar(); + auto fill = cudf::string_scalar(""); - std::unique_ptr output; - - EXPECT_THROW(output = cudf::shift(input, 5, *fill), cudf::logic_error); + EXPECT_THROW(cudf::shift(input, 5, fill), cudf::logic_error); } -struct ShiftTestNonFixedWidth : public cudf::test::BaseFixture { +struct ShiftTests : public cudf::test::BaseFixture { }; -TEST_F(ShiftTestNonFixedWidth, StringsShiftTest) +TEST_F(ShiftTests, StringsShiftTest) { auto input = cudf::test::strings_column_wrapper({"", "bb", "ccc", "ddddddé", ""}, {0, 1, 1, 1, 0}); @@ -243,3 +221,33 @@ TEST_F(ShiftTestNonFixedWidth, StringsShiftTest) auto sliced_left = cudf::test::strings_column_wrapper({"ccc", "ddddddé", "xx"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced_left, *results); } + +TEST_F(ShiftTests, OffsetGreaterThanSize) +{ + auto const input_str = + cudf::test::strings_column_wrapper({"", "bb", "ccc", "ddé", ""}, {0, 1, 1, 1, 0}); + auto results = cudf::shift(input_str, 6, cudf::string_scalar("xx")); + auto expected_str = cudf::test::strings_column_wrapper({"xx", "xx", "xx", "xx", "xx"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_str, *results); + results = cudf::shift(input_str, -6, cudf::string_scalar("xx")); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_str, *results); + + results = cudf::shift(input_str, 6, cudf::string_scalar("", false)); + expected_str = cudf::test::strings_column_wrapper({"", "", "", "", ""}, {0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_str, *results); + results = cudf::shift(input_str, -6, cudf::string_scalar("", false)); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_str, *results); + + auto const input = fixed_width_column_wrapper({0, 2, 3, 4, 0}, {0, 1, 1, 1, 0}); + results = cudf::shift(input, 6, cudf::numeric_scalar(9)); + auto expected = fixed_width_column_wrapper({9, 9, 9, 9, 9}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results); + results = cudf::shift(input, -6, cudf::numeric_scalar(9)); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results); + + results = cudf::shift(input, 6, cudf::numeric_scalar(0, false)); + expected = fixed_width_column_wrapper({0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results); + results = cudf::shift(input, -6, cudf::numeric_scalar(0, false)); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results); +} diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 92075eef1b4..01a450ce1d0 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -349,14 +349,6 @@ def _fill( return self def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase: - # libcudf currently doesn't handle case when offset > len(df) - # ticket to fix the bug in link below: - # https://github.com/rapidsai/cudf/issues/10314 - if abs(offset) > len(self): - if fill_value is None: - return column_empty_like(self, masked=True) - else: - return full(len(self), fill_value, dtype=self.dtype) return libcudf.copying.shift(self, offset, fill_value) @property