From 360b45abe61be9e03668da93d28b6eb583596dbe Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 10 Dec 2021 17:24:21 -0500 Subject: [PATCH 1/3] Add dictionary support to cudf::copy_if_else --- .../dictionary/dictionary_column_view.hpp | 5 +++ cpp/src/copying/copy.cu | 37 ++++++++++++++++++- cpp/src/dictionary/dictionary_column_view.cpp | 8 +++- 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/dictionary/dictionary_column_view.hpp b/cpp/include/cudf/dictionary/dictionary_column_view.hpp index 1da52e67e06..039673fd868 100644 --- a/cpp/include/cudf/dictionary/dictionary_column_view.hpp +++ b/cpp/include/cudf/dictionary/dictionary_column_view.hpp @@ -77,6 +77,11 @@ class dictionary_column_view : private column_view { */ column_view keys() const noexcept; + /** + * @brief Returns data_type of the keys + */ + data_type keys_type() const noexcept; + /** * @brief Returns the number of rows in the keys column. */ diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu index 10af2ffb614..dbd1ad5d389 100644 --- a/cpp/src/copying/copy.cu +++ b/cpp/src/copying/copy.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -267,6 +268,22 @@ struct copy_if_else_functor_impl { } }; +template <> +struct copy_if_else_functor_impl { + template + std::unique_ptr operator()(Left const& lhs, + Right const& rhs, + size_type size, + bool, + bool, + Filter filter, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + return scatter_gather_based_if_else(lhs, rhs, size, filter, stream, mr); + } +}; + /** * @brief Functor called by the `type_dispatcher` to invoke copy_if_else on combinations * of column_view and scalar @@ -297,7 +314,6 @@ std::unique_ptr copy_if_else(Left const& lhs, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(lhs.type() == rhs.type(), "Both inputs must be of the same type"); CUDF_EXPECTS(boolean_mask.type() == data_type(type_id::BOOL8), "Boolean mask column must be of type type_id::BOOL8"); @@ -311,7 +327,11 @@ std::unique_ptr copy_if_else(Left const& lhs, return (!has_nulls || bool_mask_device.is_valid_nocheck(i)) and bool_mask_device.element(i); }; - return cudf::type_dispatcher(lhs.type(), + + // always dispatch on dictionary-type if either input is a dictionary + auto dispatch_type = cudf::is_dictionary(rhs.type()) ? rhs.type() : lhs.type(); + + return cudf::type_dispatcher(dispatch_type, copy_if_else_functor{}, lhs, rhs, @@ -334,6 +354,8 @@ std::unique_ptr copy_if_else(column_view const& lhs, CUDF_EXPECTS(boolean_mask.size() == lhs.size(), "Boolean mask column must be the same size as lhs and rhs columns"); CUDF_EXPECTS(lhs.size() == rhs.size(), "Both columns must be of the size"); + CUDF_EXPECTS(lhs.type() == rhs.type(), "Both inputs must be of the same type"); + return copy_if_else(lhs, rhs, lhs.has_nulls(), rhs.has_nulls(), boolean_mask, stream, mr); } @@ -345,6 +367,11 @@ std::unique_ptr copy_if_else(scalar const& lhs, { CUDF_EXPECTS(boolean_mask.size() == rhs.size(), "Boolean mask column must be the same size as rhs column"); + + auto rhs_type = + cudf::is_dictionary(rhs.type()) ? cudf::dictionary_column_view(rhs).keys_type() : lhs.type(); + CUDF_EXPECTS(lhs.type() == rhs_type, "Both inputs must be of the same type"); + return copy_if_else(lhs, rhs, !lhs.is_valid(stream), rhs.has_nulls(), boolean_mask, stream, mr); } @@ -356,6 +383,11 @@ std::unique_ptr copy_if_else(column_view const& lhs, { CUDF_EXPECTS(boolean_mask.size() == lhs.size(), "Boolean mask column must be the same size as lhs column"); + + auto lhs_type = + cudf::is_dictionary(lhs.type()) ? cudf::dictionary_column_view(lhs).keys_type() : lhs.type(); + CUDF_EXPECTS(lhs_type == rhs.type(), "Both inputs must be of the same type"); + return copy_if_else(lhs, rhs, lhs.has_nulls(), !rhs.is_valid(stream), boolean_mask, stream, mr); } @@ -365,6 +397,7 @@ std::unique_ptr copy_if_else(scalar const& lhs, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_EXPECTS(lhs.type() == rhs.type(), "Both inputs must be of the same type"); return copy_if_else( lhs, rhs, !lhs.is_valid(stream), !rhs.is_valid(stream), boolean_mask, stream, mr); } diff --git a/cpp/src/dictionary/dictionary_column_view.cpp b/cpp/src/dictionary/dictionary_column_view.cpp index d33fd6c548f..4906e5b4f9c 100644 --- a/cpp/src/dictionary/dictionary_column_view.cpp +++ b/cpp/src/dictionary/dictionary_column_view.cpp @@ -44,8 +44,12 @@ column_view dictionary_column_view::keys() const noexcept { return child(1); } size_type dictionary_column_view::keys_size() const noexcept { - if (size() == 0) return 0; - return keys().size(); + return (size() == 0) ? 0 : keys().size(); +} + +data_type dictionary_column_view::keys_type() const noexcept +{ + return (size() == 0) ? data_type{type_id::EMPTY} : keys().type(); } } // namespace cudf From 8a1620471bcf5b4127880ff45528170945a11462 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 13 Dec 2021 08:27:39 -0500 Subject: [PATCH 2/3] add gtests for dictionary copy-if-else --- cpp/tests/copying/copy_tests.cpp | 83 ++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/cpp/tests/copying/copy_tests.cpp b/cpp/tests/copying/copy_tests.cpp index 651a977050c..82a64d74b1f 100644 --- a/cpp/tests/copying/copy_tests.cpp +++ b/cpp/tests/copying/copy_tests.cpp @@ -18,11 +18,13 @@ #include #include #include +#include #include #include #include #include +#include #include template @@ -633,3 +635,84 @@ TYPED_TEST(FixedPointTypes, FixedPointScaleMismatch) EXPECT_THROW(cudf::copy_if_else(a, b, mask), cudf::logic_error); } + +struct DictionaryCopyIfElseTest : public cudf::test::BaseFixture { +}; + +TEST_F(DictionaryCopyIfElseTest, ColumnColumn) +{ + auto valids = cudf::test::iterators::null_at(2); + std::vector h_strings1{"eee", "bb", "", "aa", "bb", "ééé"}; + cudf::test::dictionary_column_wrapper input1( + h_strings1.begin(), h_strings1.end(), valids); + std::vector h_strings2{"zz", "bb", "", "aa", "ééé", "ooo"}; + cudf::test::dictionary_column_wrapper input2( + h_strings2.begin(), h_strings2.end(), valids); + + bool mask[] = {1, 1, 0, 1, 0, 1}; + bool mask_v[] = {1, 1, 1, 1, 1, 0}; + cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6, mask_v); + + auto results = cudf::copy_if_else(input1, input2, mask_w); + auto decoded = cudf::dictionary::decode(cudf::dictionary_column_view(results->view())); + + std::vector h_expected; + for (cudf::size_type idx = 0; idx < static_cast(h_strings1.size()); ++idx) { + if (mask[idx] and mask_v[idx]) + h_expected.push_back(h_strings1[idx]); + else + h_expected.push_back(h_strings2[idx]); + } + cudf::test::strings_column_wrapper expected(h_expected.begin(), h_expected.end(), valids); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(decoded->view(), expected); +} + +TEST_F(DictionaryCopyIfElseTest, ColumnScalar) +{ + std::string h_string{"eee"}; + cudf::string_scalar input1{h_string}; + std::vector h_strings{"zz", "", "yyy", "w", "ééé", "ooo"}; + auto valids = cudf::test::iterators::null_at(1); + cudf::test::dictionary_column_wrapper input2( + h_strings.begin(), h_strings.end(), valids); + + bool mask[] = {0, 1, 1, 1, 0, 1}; + cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6); + + auto results = cudf::copy_if_else(input2, input1, mask_w); + auto decoded = cudf::dictionary::decode(cudf::dictionary_column_view(results->view())); + + std::vector h_expected1; + std::vector h_expected2; + for (cudf::size_type idx = 0; idx < static_cast(h_strings.size()); ++idx) { + if (mask[idx]) { + h_expected1.push_back(h_strings[idx]); + h_expected2.push_back(h_string.c_str()); + } else { + h_expected1.push_back(h_string.c_str()); + h_expected2.push_back(h_strings[idx]); + } + } + + cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end(), valids); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(decoded->view(), expected1); + + results = cudf::copy_if_else(input1, input2, mask_w); + decoded = cudf::dictionary::decode(cudf::dictionary_column_view(results->view())); + + cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(decoded->view(), expected2); +} + +TEST_F(DictionaryCopyIfElseTest, TypeMismatch) +{ + cudf::test::dictionary_column_wrapper input1({1, 1, 1, 1}); + cudf::test::dictionary_column_wrapper input2({1.0, 1.0, 1.0, 1.0}); + cudf::test::fixed_width_column_wrapper mask({1, 0, 0, 1}); + + EXPECT_THROW(cudf::copy_if_else(input1, input2, mask), cudf::logic_error); + + cudf::string_scalar input3{"1"}; + EXPECT_THROW(cudf::copy_if_else(input1, input3, mask), cudf::logic_error); + EXPECT_THROW(cudf::copy_if_else(input3, input2, mask), cudf::logic_error); +} From 40d15df367e5a61c91bad789c6b85a8a2000a078 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 14 Dec 2021 11:49:50 -0500 Subject: [PATCH 3/3] fix type-match error check logic --- cpp/include/cudf/dictionary/dictionary_column_view.hpp | 2 +- cpp/src/copying/copy.cu | 2 +- cpp/tests/copying/copy_tests.cpp | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/dictionary/dictionary_column_view.hpp b/cpp/include/cudf/dictionary/dictionary_column_view.hpp index 039673fd868..42f8310040e 100644 --- a/cpp/include/cudf/dictionary/dictionary_column_view.hpp +++ b/cpp/include/cudf/dictionary/dictionary_column_view.hpp @@ -78,7 +78,7 @@ class dictionary_column_view : private column_view { column_view keys() const noexcept; /** - * @brief Returns data_type of the keys + * @brief Returns the `data_type` of the keys child column. */ data_type keys_type() const noexcept; diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu index dbd1ad5d389..91fc5f02989 100644 --- a/cpp/src/copying/copy.cu +++ b/cpp/src/copying/copy.cu @@ -369,7 +369,7 @@ std::unique_ptr copy_if_else(scalar const& lhs, "Boolean mask column must be the same size as rhs column"); auto rhs_type = - cudf::is_dictionary(rhs.type()) ? cudf::dictionary_column_view(rhs).keys_type() : lhs.type(); + cudf::is_dictionary(rhs.type()) ? cudf::dictionary_column_view(rhs).keys_type() : rhs.type(); CUDF_EXPECTS(lhs.type() == rhs_type, "Both inputs must be of the same type"); return copy_if_else(lhs, rhs, !lhs.is_valid(stream), rhs.has_nulls(), boolean_mask, stream, mr); diff --git a/cpp/tests/copying/copy_tests.cpp b/cpp/tests/copying/copy_tests.cpp index 82a64d74b1f..4468bc69640 100644 --- a/cpp/tests/copying/copy_tests.cpp +++ b/cpp/tests/copying/copy_tests.cpp @@ -715,4 +715,5 @@ TEST_F(DictionaryCopyIfElseTest, TypeMismatch) cudf::string_scalar input3{"1"}; EXPECT_THROW(cudf::copy_if_else(input1, input3, mask), cudf::logic_error); EXPECT_THROW(cudf::copy_if_else(input3, input2, mask), cudf::logic_error); + EXPECT_THROW(cudf::copy_if_else(input2, input3, mask), cudf::logic_error); }