Skip to content

Commit

Permalink
Change cudf::test::make_null_mask to also return null-count (#13081)
Browse files Browse the repository at this point in the history
Change the `cudf::test::make_null_mask` to return both the null-mask and the null-count. Callers can then use this null-count instead of `UNKNOWN_NULL_COUNT`. These changes include removing `UNKNOWN_NULL_COUNT` usage from the libcudf C++ test source code.

One side-effect found that strings column with all nulls can technically have no children but using `UNKNOWN_NULL_COUNT` allowed the check for this to be bypassed. Therefore many utilities started to fail when `UNKNOWN_NULL_COUNT` was removed. The factory was modified to remove the check which results in an offsets column and an empty chars column as children.

More code will likely need to be change when the `UNKNOWN_NULL_COUNT` is no longer used as a default parameter for factories and other column functions.

No behavior is changed. Since the `cudf::test::make_null_mask` is technically a public API, this PR could be marked as a breaking change as well.

Contributes to: #11968

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - MithunR (https://github.com/mythrocks)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #13081
  • Loading branch information
davidwendt authored Apr 14, 2023
1 parent 5c93b44 commit 4481142
Show file tree
Hide file tree
Showing 40 changed files with 626 additions and 618 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,6 @@ std::unique_ptr<column> make_strings_column(CharIterator chars_begin,
size_type bytes = std::distance(chars_begin, chars_end) * sizeof(char);
if (strings_count == 0) return make_empty_column(type_id::STRING);

CUDF_EXPECTS(null_count < strings_count, "null strings column not yet supported");
CUDF_EXPECTS(bytes >= 0, "invalid offsets data");

// build offsets column -- this is the number of strings + 1
Expand Down
40 changes: 20 additions & 20 deletions cpp/include/cudf_test/column_utilities.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -254,26 +254,26 @@ std::pair<thrust::host_vector<T>, std::vector<bitmask_type>> to_host(column_view
template <>
inline std::pair<thrust::host_vector<std::string>, std::vector<bitmask_type>> to_host(column_view c)
{
auto const scv = strings_column_view(c);
auto const h_chars = cudf::detail::make_std_vector_sync<char>(
cudf::device_span<char const>(scv.chars().data<char>(), scv.chars().size()),
cudf::get_default_stream());
auto const h_offsets = cudf::detail::make_std_vector_sync(
cudf::device_span<cudf::offset_type const>(
scv.offsets().data<cudf::offset_type>() + scv.offset(), scv.size() + 1),
cudf::get_default_stream());

// build std::string vector from chars and offsets
std::vector<std::string> host_data;
host_data.reserve(c.size());
std::transform(
std::begin(h_offsets),
std::end(h_offsets) - 1,
std::begin(h_offsets) + 1,
std::back_inserter(host_data),
[&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); });

return {host_data, bitmask_to_host(c)};
thrust::host_vector<std::string> host_data(c.size());
if (c.size() > c.null_count()) {
auto const scv = strings_column_view(c);
auto const h_chars = cudf::detail::make_std_vector_sync<char>(
cudf::device_span<char const>(scv.chars().data<char>(), scv.chars().size()),
cudf::get_default_stream());
auto const h_offsets = cudf::detail::make_std_vector_sync(
cudf::device_span<cudf::offset_type const>(
scv.offsets().data<cudf::offset_type>() + scv.offset(), scv.size() + 1),
cudf::get_default_stream());

// build std::string vector from chars and offsets
std::transform(
std::begin(h_offsets),
std::end(h_offsets) - 1,
std::begin(h_offsets) + 1,
host_data.begin(),
[&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); });
}
return {std::move(host_data), bitmask_to_host(c)};
}

} // namespace cudf::test
Expand Down
107 changes: 62 additions & 45 deletions cpp/include/cudf_test/column_wrapper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -240,16 +240,23 @@ rmm::device_buffer make_elements(InputIterator begin, InputIterator end)
* element in `[begin,end)` that evaluated to `true`.
*/
template <typename ValidityIterator>
std::vector<bitmask_type> make_null_mask_vector(ValidityIterator begin, ValidityIterator end)
std::pair<std::vector<bitmask_type>, cudf::size_type> make_null_mask_vector(ValidityIterator begin,
ValidityIterator end)
{
auto const size = cudf::distance(begin, end);
auto const num_words = cudf::bitmask_allocation_size_bytes(size) / sizeof(bitmask_type);

auto null_mask = std::vector<bitmask_type>(num_words, 0);
for (auto i = 0; i < size; ++i)
if (*(begin + i)) set_bit_unsafe(null_mask.data(), i);
auto null_mask = std::vector<bitmask_type>(num_words, 0);
auto null_count = cudf::size_type{0};
for (auto i = 0; i < size; ++i) {
if (*(begin + i)) {
set_bit_unsafe(null_mask.data(), i);
} else {
++null_count;
}
}

return null_mask;
return {std::move(null_mask), null_count};
}

/**
Expand All @@ -266,12 +273,14 @@ std::vector<bitmask_type> make_null_mask_vector(ValidityIterator begin, Validity
* element in `[begin,end)` that evaluated to `true`.
*/
template <typename ValidityIterator>
rmm::device_buffer make_null_mask(ValidityIterator begin, ValidityIterator end)
std::pair<rmm::device_buffer, cudf::size_type> make_null_mask(ValidityIterator begin,
ValidityIterator end)
{
auto null_mask = make_null_mask_vector(begin, end);
return rmm::device_buffer{null_mask.data(),
null_mask.size() * sizeof(decltype(null_mask.front())),
cudf::get_default_stream()};
auto [null_mask, null_count] = make_null_mask_vector(begin, end);
auto d_mask = rmm::device_buffer{null_mask.data(),
cudf::bitmask_allocation_size_bytes(cudf::distance(begin, end)),
cudf::get_default_stream()};
return {std::move(d_mask), null_count};
}

/**
Expand Down Expand Up @@ -319,10 +328,12 @@ class fixed_width_column_wrapper : public detail::column_wrapper {
fixed_width_column_wrapper() : column_wrapper{}
{
std::vector<ElementTo> empty;
wrapped.reset(new cudf::column{
cudf::data_type{cudf::type_to_id<ElementTo>()},
0,
detail::make_elements<ElementTo, SourceElementT>(empty.begin(), empty.end())});
wrapped.reset(
new cudf::column{cudf::data_type{cudf::type_to_id<ElementTo>()},
0,
detail::make_elements<ElementTo, SourceElementT>(empty.begin(), empty.end()),
rmm::device_buffer{},
0});
}

/**
Expand All @@ -349,7 +360,9 @@ class fixed_width_column_wrapper : public detail::column_wrapper {
auto const size = cudf::distance(begin, end);
wrapped.reset(new cudf::column{cudf::data_type{cudf::type_to_id<ElementTo>()},
size,
detail::make_elements<ElementTo, SourceElementT>(begin, end)});
detail::make_elements<ElementTo, SourceElementT>(begin, end),
rmm::device_buffer{},
0});
}

/**
Expand Down Expand Up @@ -379,12 +392,13 @@ class fixed_width_column_wrapper : public detail::column_wrapper {
fixed_width_column_wrapper(InputIterator begin, InputIterator end, ValidityIterator v)
: column_wrapper{}
{
auto const size = cudf::distance(begin, end);
auto const size = cudf::distance(begin, end);
auto [null_mask, null_count] = detail::make_null_mask(v, v + size);
wrapped.reset(new cudf::column{cudf::data_type{cudf::type_to_id<ElementTo>()},
size,
detail::make_elements<ElementTo, SourceElementT>(begin, end),
detail::make_null_mask(v, v + size),
cudf::UNKNOWN_NULL_COUNT});
std::move(null_mask),
null_count});
}

/**
Expand Down Expand Up @@ -547,7 +561,9 @@ class fixed_point_column_wrapper : public detail::column_wrapper {
wrapped.reset(new cudf::column{
data_type,
size,
rmm::device_buffer{elements.data(), size * sizeof(Rep), cudf::get_default_stream()}});
rmm::device_buffer{elements.data(), size * sizeof(Rep), cudf::get_default_stream()},
rmm::device_buffer{},
0});
}

/**
Expand Down Expand Up @@ -603,17 +619,17 @@ class fixed_point_column_wrapper : public detail::column_wrapper {
{
CUDF_EXPECTS(numeric::is_supported_representation_type<Rep>(), "not valid representation type");

auto const size = cudf::distance(begin, end);
auto const elements = thrust::host_vector<Rep>(begin, end);
auto const id = type_to_id<numeric::fixed_point<Rep, numeric::Radix::BASE_10>>();
auto const data_type = cudf::data_type{id, static_cast<int32_t>(scale)};

auto const size = cudf::distance(begin, end);
auto const elements = thrust::host_vector<Rep>(begin, end);
auto const id = type_to_id<numeric::fixed_point<Rep, numeric::Radix::BASE_10>>();
auto const data_type = cudf::data_type{id, static_cast<int32_t>(scale)};
auto [null_mask, null_count] = detail::make_null_mask(v, v + size);
wrapped.reset(new cudf::column{
data_type,
size,
rmm::device_buffer{elements.data(), size * sizeof(Rep), cudf::get_default_stream()},
detail::make_null_mask(v, v + size),
cudf::UNKNOWN_NULL_COUNT});
std::move(null_mask),
null_count});
}

/**
Expand Down Expand Up @@ -736,7 +752,7 @@ class strings_column_wrapper : public detail::column_wrapper {
chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
auto d_offsets = cudf::detail::make_device_uvector_sync(
offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
wrapped = cudf::make_strings_column(d_chars, d_offsets);
wrapped = cudf::make_strings_column(d_chars, d_offsets, {}, 0);
}

/**
Expand Down Expand Up @@ -771,16 +787,16 @@ class strings_column_wrapper : public detail::column_wrapper {
strings_column_wrapper(StringsIterator begin, StringsIterator end, ValidityIterator v)
: column_wrapper{}
{
size_type num_strings = std::distance(begin, end);
auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, v);
auto null_mask = detail::make_null_mask_vector(v, v + num_strings);
auto d_chars = cudf::detail::make_device_uvector_sync(
size_type num_strings = std::distance(begin, end);
auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, v);
auto [null_mask, null_count] = detail::make_null_mask_vector(v, v + num_strings);
auto d_chars = cudf::detail::make_device_uvector_sync(
chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
auto d_offsets = cudf::detail::make_device_uvector_sync(
offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
auto d_bitmask = cudf::detail::make_device_uvector_sync(
null_mask, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
wrapped = cudf::make_strings_column(d_chars, d_offsets, d_bitmask);
wrapped = cudf::make_strings_column(d_chars, d_offsets, d_bitmask, null_count);
}

/**
Expand Down Expand Up @@ -1579,14 +1595,14 @@ class lists_column_wrapper : public detail::column_wrapper {
// increment depth
depth = expected_depth + 1;

auto [null_mask, null_count] = [&] {
if (v.size() <= 0) return std::make_pair(rmm::device_buffer{}, cudf::size_type{0});
return cudf::test::detail::make_null_mask(v.begin(), v.end());
}();

// construct the list column
wrapped =
make_lists_column(cols.size(),
std::move(offsets),
std::move(data),
v.size() <= 0 ? 0 : cudf::UNKNOWN_NULL_COUNT,
v.size() <= 0 ? rmm::device_buffer{}
: cudf::test::detail::make_null_mask(v.begin(), v.end()));
wrapped = make_lists_column(
cols.size(), std::move(offsets), std::move(data), null_count, std::move(null_mask));
}

/**
Expand Down Expand Up @@ -1668,7 +1684,7 @@ class lists_column_wrapper : public detail::column_wrapper {
std::make_unique<column>(lcv.offsets()),
normalize_column(lists_column_view(col).child(),
lists_column_view(expected_hierarchy).child()),
UNKNOWN_NULL_COUNT,
col.null_count(),
copy_bitmask(col));
}

Expand Down Expand Up @@ -1843,12 +1859,13 @@ class structs_column_wrapper : public detail::column_wrapper {
CUDF_EXPECTS(validity.size() <= 0 || static_cast<size_type>(validity.size()) == num_rows,
"Validity buffer must have as many elements as rows in the struct column.");

auto [null_mask, null_count] = [&] {
if (validity.size() <= 0) return std::make_pair(rmm::device_buffer{}, cudf::size_type{0});
return cudf::test::detail::make_null_mask(validity.begin(), validity.end());
}();

wrapped = cudf::make_structs_column(
num_rows,
std::move(child_columns),
validity.size() <= 0 ? 0 : cudf::UNKNOWN_NULL_COUNT,
validity.size() <= 0 ? rmm::device_buffer{}
: detail::make_null_mask(validity.begin(), validity.end()));
num_rows, std::move(child_columns), null_count, std::move(null_mask));
}

template <typename V>
Expand Down
34 changes: 20 additions & 14 deletions cpp/tests/bitmask/bitmask_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ struct BitmaskUtilitiesTest : public cudf::test::BaseFixture {
TEST_F(BitmaskUtilitiesTest, StateNullCount)
{
EXPECT_EQ(0, cudf::state_null_count(cudf::mask_state::UNALLOCATED, 42));
EXPECT_EQ(cudf::UNKNOWN_NULL_COUNT, cudf::state_null_count(cudf::mask_state::UNINITIALIZED, 42));
EXPECT_EQ(42, cudf::state_null_count(cudf::mask_state::ALL_NULL, 42));
EXPECT_EQ(0, cudf::state_null_count(cudf::mask_state::ALL_VALID, 42));
}
Expand Down Expand Up @@ -575,12 +574,13 @@ TEST_F(CopyBitmaskTest, TestZeroOffset)
for (auto& m : validity_bit) {
m = this->generate();
}
auto input_mask = cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end());
auto input_mask =
std::get<0>(cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end()));

int begin_bit = 0;
int end_bit = 800;
auto gold_splice_mask = cudf::test::detail::make_null_mask(validity_bit.begin() + begin_bit,
validity_bit.begin() + end_bit);
auto gold_splice_mask = std::get<0>(cudf::test::detail::make_null_mask(
validity_bit.begin() + begin_bit, validity_bit.begin() + end_bit));

auto splice_mask = cudf::copy_bitmask(
static_cast<const cudf::bitmask_type*>(input_mask.data()), begin_bit, end_bit);
Expand All @@ -597,12 +597,13 @@ TEST_F(CopyBitmaskTest, TestNonZeroOffset)
for (auto& m : validity_bit) {
m = this->generate();
}
auto input_mask = cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end());
auto input_mask =
std::get<0>(cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end()));

int begin_bit = 321;
int end_bit = 998;
auto gold_splice_mask = cudf::test::detail::make_null_mask(validity_bit.begin() + begin_bit,
validity_bit.begin() + end_bit);
auto gold_splice_mask = std::get<0>(cudf::test::detail::make_null_mask(
validity_bit.begin() + begin_bit, validity_bit.begin() + end_bit));

auto splice_mask = cudf::copy_bitmask(
static_cast<const cudf::bitmask_type*>(input_mask.data()), begin_bit, end_bit);
Expand All @@ -621,7 +622,8 @@ TEST_F(CopyBitmaskTest, TestCopyColumnViewVectorContiguous)
for (auto& m : validity_bit) {
m = this->generate();
}
auto gold_mask = cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end());
auto gold_mask =
std::get<0>(cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end()));

rmm::device_buffer copy_mask{gold_mask, cudf::get_default_stream()};
cudf::column original{t,
Expand Down Expand Up @@ -661,18 +663,21 @@ TEST_F(CopyBitmaskTest, TestCopyColumnViewVectorDiscontiguous)
for (auto& m : validity_bit) {
m = this->generate();
}
auto gold_mask = cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end());
auto gold_mask =
std::get<0>(cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end()));
std::vector<cudf::size_type> split{0, 104, 128, 152, 311, 491, 583, 734, 760, num_elements};

std::vector<cudf::column> cols;
std::vector<cudf::column_view> views;
for (unsigned i = 0; i < split.size() - 1; i++) {
auto [null_mask, null_count] = cudf::test::detail::make_null_mask(
validity_bit.begin() + split[i], validity_bit.begin() + split[i + 1]);
cols.emplace_back(
t,
split[i + 1] - split[i],
rmm::device_buffer{sizeof(int) * (split[i + 1] - split[i]), cudf::get_default_stream()},
cudf::test::detail::make_null_mask(validity_bit.begin() + split[i],
validity_bit.begin() + split[i + 1]));
std::move(null_mask),
null_count);
views.push_back(cols.back());
}
rmm::device_buffer concatenated_bitmask = cudf::concatenate_masks(views);
Expand Down Expand Up @@ -706,7 +711,8 @@ TEST_F(MergeBitmaskTest, TestBitmaskAnd)

auto odd_indices =
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; });
auto odd = cudf::test::detail::make_null_mask(odd_indices, odd_indices + input2.num_rows());
auto odd =
std::get<0>(cudf::test::detail::make_null_mask(odd_indices, odd_indices + input2.num_rows()));

EXPECT_EQ(nullptr, result1_mask.data());
CUDF_TEST_EXPECT_EQUAL_BUFFERS(
Expand Down Expand Up @@ -735,8 +741,8 @@ TEST_F(MergeBitmaskTest, TestBitmaskOr)

auto all_but_index3 =
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; });
auto null3 =
cudf::test::detail::make_null_mask(all_but_index3, all_but_index3 + input2.num_rows());
auto null3 = std::get<0>(
cudf::test::detail::make_null_mask(all_but_index3, all_but_index3 + input2.num_rows()));

EXPECT_EQ(nullptr, result1_mask.data());
CUDF_TEST_EXPECT_EQUAL_BUFFERS(
Expand Down
9 changes: 6 additions & 3 deletions cpp/tests/bitmask/valid_if_tests.cu
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,9 @@ TEST_F(ValidIfTest, OddsValid)
odds_valid{},
cudf::get_default_stream(),
rmm::mr::get_current_device_resource());
CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.data(), actual.first.data(), expected.size());
CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.first.data(), actual.first.data(), expected.first.size());
EXPECT_EQ(5000, actual.second);
EXPECT_EQ(expected.second, actual.second);
}

TEST_F(ValidIfTest, AllValid)
Expand All @@ -83,8 +84,9 @@ TEST_F(ValidIfTest, AllValid)
all_valid{},
cudf::get_default_stream(),
rmm::mr::get_current_device_resource());
CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.data(), actual.first.data(), expected.size());
CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.first.data(), actual.first.data(), expected.first.size());
EXPECT_EQ(0, actual.second);
EXPECT_EQ(expected.second, actual.second);
}

TEST_F(ValidIfTest, AllNull)
Expand All @@ -96,6 +98,7 @@ TEST_F(ValidIfTest, AllNull)
all_null{},
cudf::get_default_stream(),
rmm::mr::get_current_device_resource());
CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.data(), actual.first.data(), expected.size());
CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.first.data(), actual.first.data(), expected.first.size());
EXPECT_EQ(10000, actual.second);
EXPECT_EQ(expected.second, actual.second);
}
Loading

0 comments on commit 4481142

Please sign in to comment.