Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change cudf::test::make_null_mask to also return null-count #13081

Merged
merged 15 commits into from
Apr 14, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,6 @@ std::unique_ptr<column> make_strings_column(CharIterator chars_begin,
size_type bytes = std::distance(chars_begin, chars_end) * sizeof(char);
if (strings_count == 0) return make_empty_column(type_id::STRING);

CUDF_EXPECTS(null_count < strings_count, "null strings column not yet supported");
CUDF_EXPECTS(bytes >= 0, "invalid offsets data");

// build offsets column -- this is the number of strings + 1
Expand Down
40 changes: 20 additions & 20 deletions cpp/include/cudf_test/column_utilities.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -254,26 +254,26 @@ std::pair<thrust::host_vector<T>, std::vector<bitmask_type>> to_host(column_view
template <>
inline std::pair<thrust::host_vector<std::string>, std::vector<bitmask_type>> to_host(column_view c)
{
auto const scv = strings_column_view(c);
auto const h_chars = cudf::detail::make_std_vector_sync<char>(
cudf::device_span<char const>(scv.chars().data<char>(), scv.chars().size()),
cudf::get_default_stream());
auto const h_offsets = cudf::detail::make_std_vector_sync(
cudf::device_span<cudf::offset_type const>(
scv.offsets().data<cudf::offset_type>() + scv.offset(), scv.size() + 1),
cudf::get_default_stream());

// build std::string vector from chars and offsets
std::vector<std::string> host_data;
host_data.reserve(c.size());
std::transform(
std::begin(h_offsets),
std::end(h_offsets) - 1,
std::begin(h_offsets) + 1,
std::back_inserter(host_data),
[&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); });

return {host_data, bitmask_to_host(c)};
thrust::host_vector<std::string> host_data(c.size());
if (c.size() > c.null_count()) {
auto const scv = strings_column_view(c);
auto const h_chars = cudf::detail::make_std_vector_sync<char>(
cudf::device_span<char const>(scv.chars().data<char>(), scv.chars().size()),
cudf::get_default_stream());
auto const h_offsets = cudf::detail::make_std_vector_sync(
cudf::device_span<cudf::offset_type const>(
scv.offsets().data<cudf::offset_type>() + scv.offset(), scv.size() + 1),
cudf::get_default_stream());

// build std::string vector from chars and offsets
std::transform(
std::begin(h_offsets),
std::end(h_offsets) - 1,
std::begin(h_offsets) + 1,
host_data.begin(),
[&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); });
}
return std::make_pair(std::move(host_data), bitmask_to_host(c));
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
}

} // namespace cudf::test
Expand Down
107 changes: 62 additions & 45 deletions cpp/include/cudf_test/column_wrapper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -240,16 +240,23 @@ rmm::device_buffer make_elements(InputIterator begin, InputIterator end)
* element in `[begin,end)` that evaluated to `true`.
*/
template <typename ValidityIterator>
std::vector<bitmask_type> make_null_mask_vector(ValidityIterator begin, ValidityIterator end)
std::pair<std::vector<bitmask_type>, cudf::size_type> make_null_mask_vector(ValidityIterator begin,
ValidityIterator end)
{
auto const size = cudf::distance(begin, end);
auto const num_words = cudf::bitmask_allocation_size_bytes(size) / sizeof(bitmask_type);

auto null_mask = std::vector<bitmask_type>(num_words, 0);
for (auto i = 0; i < size; ++i)
if (*(begin + i)) set_bit_unsafe(null_mask.data(), i);
auto null_mask = std::vector<bitmask_type>(num_words, 0);
auto null_count = cudf::size_type{0};
for (auto i = 0; i < size; ++i) {
if (*(begin + i)) {
set_bit_unsafe(null_mask.data(), i);
} else {
++null_count;
}
}

return null_mask;
return std::make_pair(std::move(null_mask), null_count);
}

/**
Expand All @@ -266,12 +273,14 @@ std::vector<bitmask_type> make_null_mask_vector(ValidityIterator begin, Validity
* element in `[begin,end)` that evaluated to `true`.
*/
template <typename ValidityIterator>
rmm::device_buffer make_null_mask(ValidityIterator begin, ValidityIterator end)
std::pair<rmm::device_buffer, cudf::size_type> make_null_mask(ValidityIterator begin,
ValidityIterator end)
{
auto null_mask = make_null_mask_vector(begin, end);
return rmm::device_buffer{null_mask.data(),
null_mask.size() * sizeof(decltype(null_mask.front())),
cudf::get_default_stream()};
auto [null_mask, null_count] = make_null_mask_vector(begin, end);
mythrocks marked this conversation as resolved.
Show resolved Hide resolved
auto d_mask = rmm::device_buffer{null_mask.data(),
null_mask.size() * sizeof(decltype(null_mask.front())),
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
cudf::get_default_stream()};
return std::make_pair(std::move(d_mask), null_count);
}

/**
Expand Down Expand Up @@ -319,10 +328,12 @@ class fixed_width_column_wrapper : public detail::column_wrapper {
fixed_width_column_wrapper() : column_wrapper{}
{
std::vector<ElementTo> empty;
wrapped.reset(new cudf::column{
cudf::data_type{cudf::type_to_id<ElementTo>()},
0,
detail::make_elements<ElementTo, SourceElementT>(empty.begin(), empty.end())});
wrapped.reset(
new cudf::column{cudf::data_type{cudf::type_to_id<ElementTo>()},
0,
detail::make_elements<ElementTo, SourceElementT>(empty.begin(), empty.end()),
rmm::device_buffer{},
0});
}

/**
Expand All @@ -349,7 +360,9 @@ class fixed_width_column_wrapper : public detail::column_wrapper {
auto const size = cudf::distance(begin, end);
wrapped.reset(new cudf::column{cudf::data_type{cudf::type_to_id<ElementTo>()},
size,
detail::make_elements<ElementTo, SourceElementT>(begin, end)});
detail::make_elements<ElementTo, SourceElementT>(begin, end),
rmm::device_buffer{},
0});
}

/**
Expand Down Expand Up @@ -379,12 +392,13 @@ class fixed_width_column_wrapper : public detail::column_wrapper {
fixed_width_column_wrapper(InputIterator begin, InputIterator end, ValidityIterator v)
: column_wrapper{}
{
auto const size = cudf::distance(begin, end);
auto const size = cudf::distance(begin, end);
auto [null_mask, null_count] = detail::make_null_mask(v, v + size);
wrapped.reset(new cudf::column{cudf::data_type{cudf::type_to_id<ElementTo>()},
size,
detail::make_elements<ElementTo, SourceElementT>(begin, end),
detail::make_null_mask(v, v + size),
cudf::UNKNOWN_NULL_COUNT});
std::move(null_mask),
null_count});
}

/**
Expand Down Expand Up @@ -547,7 +561,9 @@ class fixed_point_column_wrapper : public detail::column_wrapper {
wrapped.reset(new cudf::column{
data_type,
size,
rmm::device_buffer{elements.data(), size * sizeof(Rep), cudf::get_default_stream()}});
rmm::device_buffer{elements.data(), size * sizeof(Rep), cudf::get_default_stream()},
rmm::device_buffer{},
0});
}

/**
Expand Down Expand Up @@ -603,17 +619,17 @@ class fixed_point_column_wrapper : public detail::column_wrapper {
{
CUDF_EXPECTS(numeric::is_supported_representation_type<Rep>(), "not valid representation type");

auto const size = cudf::distance(begin, end);
auto const elements = thrust::host_vector<Rep>(begin, end);
auto const id = type_to_id<numeric::fixed_point<Rep, numeric::Radix::BASE_10>>();
auto const data_type = cudf::data_type{id, static_cast<int32_t>(scale)};

auto const size = cudf::distance(begin, end);
auto const elements = thrust::host_vector<Rep>(begin, end);
auto const id = type_to_id<numeric::fixed_point<Rep, numeric::Radix::BASE_10>>();
auto const data_type = cudf::data_type{id, static_cast<int32_t>(scale)};
auto [null_mask, null_count] = detail::make_null_mask(v, v + size);
wrapped.reset(new cudf::column{
data_type,
size,
rmm::device_buffer{elements.data(), size * sizeof(Rep), cudf::get_default_stream()},
detail::make_null_mask(v, v + size),
cudf::UNKNOWN_NULL_COUNT});
std::move(null_mask),
null_count});
}

/**
Expand Down Expand Up @@ -736,7 +752,7 @@ class strings_column_wrapper : public detail::column_wrapper {
chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
auto d_offsets = cudf::detail::make_device_uvector_sync(
offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
wrapped = cudf::make_strings_column(d_chars, d_offsets);
wrapped = cudf::make_strings_column(d_chars, d_offsets, {}, 0);
}

/**
Expand Down Expand Up @@ -771,16 +787,16 @@ class strings_column_wrapper : public detail::column_wrapper {
strings_column_wrapper(StringsIterator begin, StringsIterator end, ValidityIterator v)
: column_wrapper{}
{
size_type num_strings = std::distance(begin, end);
auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, v);
auto null_mask = detail::make_null_mask_vector(v, v + num_strings);
auto d_chars = cudf::detail::make_device_uvector_sync(
size_type num_strings = std::distance(begin, end);
auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, v);
auto [null_mask, null_count] = detail::make_null_mask_vector(v, v + num_strings);
auto d_chars = cudf::detail::make_device_uvector_sync(
chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
auto d_offsets = cudf::detail::make_device_uvector_sync(
offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
auto d_bitmask = cudf::detail::make_device_uvector_sync(
null_mask, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
wrapped = cudf::make_strings_column(d_chars, d_offsets, d_bitmask);
wrapped = cudf::make_strings_column(d_chars, d_offsets, d_bitmask, null_count);
}

/**
Expand Down Expand Up @@ -1579,14 +1595,14 @@ class lists_column_wrapper : public detail::column_wrapper {
// increment depth
depth = expected_depth + 1;

auto [null_mask, null_count] = [&] {
if (v.size() <= 0) return std::make_pair(rmm::device_buffer{}, cudf::size_type{0});
return cudf::test::detail::make_null_mask(v.begin(), v.end());
}();

// construct the list column
wrapped =
make_lists_column(cols.size(),
std::move(offsets),
std::move(data),
v.size() <= 0 ? 0 : cudf::UNKNOWN_NULL_COUNT,
v.size() <= 0 ? rmm::device_buffer{}
: cudf::test::detail::make_null_mask(v.begin(), v.end()));
wrapped = make_lists_column(
cols.size(), std::move(offsets), std::move(data), null_count, std::move(null_mask));
}

/**
Expand Down Expand Up @@ -1668,7 +1684,7 @@ class lists_column_wrapper : public detail::column_wrapper {
std::make_unique<column>(lcv.offsets()),
normalize_column(lists_column_view(col).child(),
lists_column_view(expected_hierarchy).child()),
UNKNOWN_NULL_COUNT,
col.null_count(),
copy_bitmask(col));
}

Expand Down Expand Up @@ -1843,12 +1859,13 @@ class structs_column_wrapper : public detail::column_wrapper {
CUDF_EXPECTS(validity.size() <= 0 || static_cast<size_type>(validity.size()) == num_rows,
"Validity buffer must have as many elements as rows in the struct column.");

auto [null_mask, null_count] = [&] {
if (validity.size() <= 0) return std::make_pair(rmm::device_buffer{}, cudf::size_type{0});
return cudf::test::detail::make_null_mask(validity.begin(), validity.end());
}();

wrapped = cudf::make_structs_column(
num_rows,
std::move(child_columns),
validity.size() <= 0 ? 0 : cudf::UNKNOWN_NULL_COUNT,
validity.size() <= 0 ? rmm::device_buffer{}
: detail::make_null_mask(validity.begin(), validity.end()));
num_rows, std::move(child_columns), null_count, std::move(null_mask));
}

template <typename V>
Expand Down
34 changes: 20 additions & 14 deletions cpp/tests/bitmask/bitmask_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ struct BitmaskUtilitiesTest : public cudf::test::BaseFixture {
TEST_F(BitmaskUtilitiesTest, StateNullCount)
{
EXPECT_EQ(0, cudf::state_null_count(cudf::mask_state::UNALLOCATED, 42));
EXPECT_EQ(cudf::UNKNOWN_NULL_COUNT, cudf::state_null_count(cudf::mask_state::UNINITIALIZED, 42));
EXPECT_EQ(42, cudf::state_null_count(cudf::mask_state::ALL_NULL, 42));
EXPECT_EQ(0, cudf::state_null_count(cudf::mask_state::ALL_VALID, 42));
}
Expand Down Expand Up @@ -575,12 +574,13 @@ TEST_F(CopyBitmaskTest, TestZeroOffset)
for (auto& m : validity_bit) {
m = this->generate();
}
auto input_mask = cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end());
auto input_mask =
std::get<0>(cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end()));

int begin_bit = 0;
int end_bit = 800;
auto gold_splice_mask = cudf::test::detail::make_null_mask(validity_bit.begin() + begin_bit,
validity_bit.begin() + end_bit);
auto gold_splice_mask = std::get<0>(cudf::test::detail::make_null_mask(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Our of curiosity, is there a reason you prefer std::get<0> to .first for pairs? For consistency with tuples?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like that this gets us the first value from the structured binding without an unused variable and without resorting to the vagueness of .first and .second.

validity_bit.begin() + begin_bit, validity_bit.begin() + end_bit));

auto splice_mask = cudf::copy_bitmask(
static_cast<const cudf::bitmask_type*>(input_mask.data()), begin_bit, end_bit);
Expand All @@ -597,12 +597,13 @@ TEST_F(CopyBitmaskTest, TestNonZeroOffset)
for (auto& m : validity_bit) {
m = this->generate();
}
auto input_mask = cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end());
auto input_mask =
std::get<0>(cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end()));

int begin_bit = 321;
int end_bit = 998;
auto gold_splice_mask = cudf::test::detail::make_null_mask(validity_bit.begin() + begin_bit,
validity_bit.begin() + end_bit);
auto gold_splice_mask = std::get<0>(cudf::test::detail::make_null_mask(
validity_bit.begin() + begin_bit, validity_bit.begin() + end_bit));

auto splice_mask = cudf::copy_bitmask(
static_cast<const cudf::bitmask_type*>(input_mask.data()), begin_bit, end_bit);
Expand All @@ -621,7 +622,8 @@ TEST_F(CopyBitmaskTest, TestCopyColumnViewVectorContiguous)
for (auto& m : validity_bit) {
m = this->generate();
}
auto gold_mask = cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end());
auto gold_mask =
std::get<0>(cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end()));

rmm::device_buffer copy_mask{gold_mask, cudf::get_default_stream()};
cudf::column original{t,
Expand Down Expand Up @@ -661,18 +663,21 @@ TEST_F(CopyBitmaskTest, TestCopyColumnViewVectorDiscontiguous)
for (auto& m : validity_bit) {
m = this->generate();
}
auto gold_mask = cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end());
auto gold_mask =
std::get<0>(cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end()));
std::vector<cudf::size_type> split{0, 104, 128, 152, 311, 491, 583, 734, 760, num_elements};

std::vector<cudf::column> cols;
std::vector<cudf::column_view> views;
for (unsigned i = 0; i < split.size() - 1; i++) {
auto [null_mask, null_count] = cudf::test::detail::make_null_mask(
validity_bit.begin() + split[i], validity_bit.begin() + split[i + 1]);
cols.emplace_back(
t,
split[i + 1] - split[i],
rmm::device_buffer{sizeof(int) * (split[i + 1] - split[i]), cudf::get_default_stream()},
cudf::test::detail::make_null_mask(validity_bit.begin() + split[i],
validity_bit.begin() + split[i + 1]));
std::move(null_mask),
null_count);
views.push_back(cols.back());
}
rmm::device_buffer concatenated_bitmask = cudf::concatenate_masks(views);
Expand Down Expand Up @@ -706,7 +711,8 @@ TEST_F(MergeBitmaskTest, TestBitmaskAnd)

auto odd_indices =
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; });
auto odd = cudf::test::detail::make_null_mask(odd_indices, odd_indices + input2.num_rows());
auto odd =
std::get<0>(cudf::test::detail::make_null_mask(odd_indices, odd_indices + input2.num_rows()));

EXPECT_EQ(nullptr, result1_mask.data());
CUDF_TEST_EXPECT_EQUAL_BUFFERS(
Expand Down Expand Up @@ -735,8 +741,8 @@ TEST_F(MergeBitmaskTest, TestBitmaskOr)

auto all_but_index3 =
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; });
auto null3 =
cudf::test::detail::make_null_mask(all_but_index3, all_but_index3 + input2.num_rows());
auto null3 = std::get<0>(
cudf::test::detail::make_null_mask(all_but_index3, all_but_index3 + input2.num_rows()));

EXPECT_EQ(nullptr, result1_mask.data());
CUDF_TEST_EXPECT_EQUAL_BUFFERS(
Expand Down
9 changes: 6 additions & 3 deletions cpp/tests/bitmask/valid_if_tests.cu
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,9 @@ TEST_F(ValidIfTest, OddsValid)
odds_valid{},
cudf::get_default_stream(),
rmm::mr::get_current_device_resource());
CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.data(), actual.first.data(), expected.size());
CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.first.data(), actual.first.data(), expected.first.size());
EXPECT_EQ(5000, actual.second);
EXPECT_EQ(expected.second, actual.second);
}

TEST_F(ValidIfTest, AllValid)
Expand All @@ -83,8 +84,9 @@ TEST_F(ValidIfTest, AllValid)
all_valid{},
cudf::get_default_stream(),
rmm::mr::get_current_device_resource());
CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.data(), actual.first.data(), expected.size());
CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.first.data(), actual.first.data(), expected.first.size());
EXPECT_EQ(0, actual.second);
EXPECT_EQ(expected.second, actual.second);
}

TEST_F(ValidIfTest, AllNull)
Expand All @@ -96,6 +98,7 @@ TEST_F(ValidIfTest, AllNull)
all_null{},
cudf::get_default_stream(),
rmm::mr::get_current_device_resource());
CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.data(), actual.first.data(), expected.size());
CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.first.data(), actual.first.data(), expected.first.size());
EXPECT_EQ(10000, actual.second);
EXPECT_EQ(expected.second, actual.second);
}
Loading