Skip to content

Commit

Permalink
Add dictionary support to cudf::copy_if_else (#9887)
Browse files Browse the repository at this point in the history
Close #9885 

Adds support for dictionary column types to `cudf::copy_if_else`. The column/scalar versions of this API will accept a scalar type that matches the dictionary's key type. The column/column version will accept 2 dictionary columns with matching key types. The result of the function will be a dictionary that incorporates both sets of keys or the scalar value as appropriate.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Mark Harris (https://github.com/harrism)

URL: #9887
  • Loading branch information
davidwendt authored Dec 15, 2021
1 parent db9aef8 commit 0c3f735
Show file tree
Hide file tree
Showing 4 changed files with 130 additions and 4 deletions.
5 changes: 5 additions & 0 deletions cpp/include/cudf/dictionary/dictionary_column_view.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,11 @@ class dictionary_column_view : private column_view {
*/
column_view keys() const noexcept;

/**
* @brief Returns the `data_type` of the keys child column.
*/
data_type keys_type() const noexcept;

/**
* @brief Returns the number of rows in the keys column.
*/
Expand Down
37 changes: 35 additions & 2 deletions cpp/src/copying/copy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/scatter.hpp>
#include <cudf/dictionary/dictionary_column_view.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/detail/copy_if_else.cuh>
#include <cudf/strings/string_view.cuh>
Expand Down Expand Up @@ -267,6 +268,22 @@ struct copy_if_else_functor_impl<list_view> {
}
};

template <>
struct copy_if_else_functor_impl<dictionary32> {
template <typename Left, typename Right, typename Filter>
std::unique_ptr<column> operator()(Left const& lhs,
Right const& rhs,
size_type size,
bool,
bool,
Filter filter,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
return scatter_gather_based_if_else(lhs, rhs, size, filter, stream, mr);
}
};

/**
* @brief Functor called by the `type_dispatcher` to invoke copy_if_else on combinations
* of column_view and scalar
Expand Down Expand Up @@ -297,7 +314,6 @@ std::unique_ptr<column> copy_if_else(Left const& lhs,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(lhs.type() == rhs.type(), "Both inputs must be of the same type");
CUDF_EXPECTS(boolean_mask.type() == data_type(type_id::BOOL8),
"Boolean mask column must be of type type_id::BOOL8");

Expand All @@ -311,7 +327,11 @@ std::unique_ptr<column> copy_if_else(Left const& lhs,
return (!has_nulls || bool_mask_device.is_valid_nocheck(i)) and
bool_mask_device.element<bool>(i);
};
return cudf::type_dispatcher<dispatch_storage_type>(lhs.type(),

// always dispatch on dictionary-type if either input is a dictionary
auto dispatch_type = cudf::is_dictionary(rhs.type()) ? rhs.type() : lhs.type();

return cudf::type_dispatcher<dispatch_storage_type>(dispatch_type,
copy_if_else_functor{},
lhs,
rhs,
Expand All @@ -334,6 +354,8 @@ std::unique_ptr<column> copy_if_else(column_view const& lhs,
CUDF_EXPECTS(boolean_mask.size() == lhs.size(),
"Boolean mask column must be the same size as lhs and rhs columns");
CUDF_EXPECTS(lhs.size() == rhs.size(), "Both columns must be of the size");
CUDF_EXPECTS(lhs.type() == rhs.type(), "Both inputs must be of the same type");

return copy_if_else(lhs, rhs, lhs.has_nulls(), rhs.has_nulls(), boolean_mask, stream, mr);
}

Expand All @@ -345,6 +367,11 @@ std::unique_ptr<column> copy_if_else(scalar const& lhs,
{
CUDF_EXPECTS(boolean_mask.size() == rhs.size(),
"Boolean mask column must be the same size as rhs column");

auto rhs_type =
cudf::is_dictionary(rhs.type()) ? cudf::dictionary_column_view(rhs).keys_type() : rhs.type();
CUDF_EXPECTS(lhs.type() == rhs_type, "Both inputs must be of the same type");

return copy_if_else(lhs, rhs, !lhs.is_valid(stream), rhs.has_nulls(), boolean_mask, stream, mr);
}

Expand All @@ -356,6 +383,11 @@ std::unique_ptr<column> copy_if_else(column_view const& lhs,
{
CUDF_EXPECTS(boolean_mask.size() == lhs.size(),
"Boolean mask column must be the same size as lhs column");

auto lhs_type =
cudf::is_dictionary(lhs.type()) ? cudf::dictionary_column_view(lhs).keys_type() : lhs.type();
CUDF_EXPECTS(lhs_type == rhs.type(), "Both inputs must be of the same type");

return copy_if_else(lhs, rhs, lhs.has_nulls(), !rhs.is_valid(stream), boolean_mask, stream, mr);
}

Expand All @@ -365,6 +397,7 @@ std::unique_ptr<column> copy_if_else(scalar const& lhs,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(lhs.type() == rhs.type(), "Both inputs must be of the same type");
return copy_if_else(
lhs, rhs, !lhs.is_valid(stream), !rhs.is_valid(stream), boolean_mask, stream, mr);
}
Expand Down
8 changes: 6 additions & 2 deletions cpp/src/dictionary/dictionary_column_view.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,12 @@ column_view dictionary_column_view::keys() const noexcept { return child(1); }

size_type dictionary_column_view::keys_size() const noexcept
{
if (size() == 0) return 0;
return keys().size();
return (size() == 0) ? 0 : keys().size();
}

data_type dictionary_column_view::keys_type() const noexcept
{
return (size() == 0) ? data_type{type_id::EMPTY} : keys().type();
}

} // namespace cudf
84 changes: 84 additions & 0 deletions cpp/tests/copying/copy_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,13 @@
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/cudf_gtest.hpp>
#include <cudf_test/iterator_utilities.hpp>
#include <cudf_test/type_lists.hpp>

#include <cudf/column/column.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/dictionary/encode.hpp>
#include <cudf/scalar/scalar.hpp>

template <typename T>
Expand Down Expand Up @@ -633,3 +635,85 @@ TYPED_TEST(FixedPointTypes, FixedPointScaleMismatch)

EXPECT_THROW(cudf::copy_if_else(a, b, mask), cudf::logic_error);
}

struct DictionaryCopyIfElseTest : public cudf::test::BaseFixture {
};

TEST_F(DictionaryCopyIfElseTest, ColumnColumn)
{
auto valids = cudf::test::iterators::null_at(2);
std::vector<const char*> h_strings1{"eee", "bb", "", "aa", "bb", "ééé"};
cudf::test::dictionary_column_wrapper<std::string> input1(
h_strings1.begin(), h_strings1.end(), valids);
std::vector<const char*> h_strings2{"zz", "bb", "", "aa", "ééé", "ooo"};
cudf::test::dictionary_column_wrapper<std::string> input2(
h_strings2.begin(), h_strings2.end(), valids);

bool mask[] = {1, 1, 0, 1, 0, 1};
bool mask_v[] = {1, 1, 1, 1, 1, 0};
cudf::test::fixed_width_column_wrapper<bool> mask_w(mask, mask + 6, mask_v);

auto results = cudf::copy_if_else(input1, input2, mask_w);
auto decoded = cudf::dictionary::decode(cudf::dictionary_column_view(results->view()));

std::vector<const char*> h_expected;
for (cudf::size_type idx = 0; idx < static_cast<cudf::size_type>(h_strings1.size()); ++idx) {
if (mask[idx] and mask_v[idx])
h_expected.push_back(h_strings1[idx]);
else
h_expected.push_back(h_strings2[idx]);
}
cudf::test::strings_column_wrapper expected(h_expected.begin(), h_expected.end(), valids);
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(decoded->view(), expected);
}

TEST_F(DictionaryCopyIfElseTest, ColumnScalar)
{
std::string h_string{"eee"};
cudf::string_scalar input1{h_string};
std::vector<const char*> h_strings{"zz", "", "yyy", "w", "ééé", "ooo"};
auto valids = cudf::test::iterators::null_at(1);
cudf::test::dictionary_column_wrapper<std::string> input2(
h_strings.begin(), h_strings.end(), valids);

bool mask[] = {0, 1, 1, 1, 0, 1};
cudf::test::fixed_width_column_wrapper<bool> mask_w(mask, mask + 6);

auto results = cudf::copy_if_else(input2, input1, mask_w);
auto decoded = cudf::dictionary::decode(cudf::dictionary_column_view(results->view()));

std::vector<const char*> h_expected1;
std::vector<const char*> h_expected2;
for (cudf::size_type idx = 0; idx < static_cast<cudf::size_type>(h_strings.size()); ++idx) {
if (mask[idx]) {
h_expected1.push_back(h_strings[idx]);
h_expected2.push_back(h_string.c_str());
} else {
h_expected1.push_back(h_string.c_str());
h_expected2.push_back(h_strings[idx]);
}
}

cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end(), valids);
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(decoded->view(), expected1);

results = cudf::copy_if_else(input1, input2, mask_w);
decoded = cudf::dictionary::decode(cudf::dictionary_column_view(results->view()));

cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end());
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(decoded->view(), expected2);
}

TEST_F(DictionaryCopyIfElseTest, TypeMismatch)
{
cudf::test::dictionary_column_wrapper<int32_t> input1({1, 1, 1, 1});
cudf::test::dictionary_column_wrapper<double> input2({1.0, 1.0, 1.0, 1.0});
cudf::test::fixed_width_column_wrapper<bool> mask({1, 0, 0, 1});

EXPECT_THROW(cudf::copy_if_else(input1, input2, mask), cudf::logic_error);

cudf::string_scalar input3{"1"};
EXPECT_THROW(cudf::copy_if_else(input1, input3, mask), cudf::logic_error);
EXPECT_THROW(cudf::copy_if_else(input3, input2, mask), cudf::logic_error);
EXPECT_THROW(cudf::copy_if_else(input2, input3, mask), cudf::logic_error);
}

0 comments on commit 0c3f735

Please sign in to comment.