Skip to content

Commit

Permalink
Fix has_nonempty_nulls ignoring column offset (#13647)
Browse files Browse the repository at this point in the history
This fixes `has_nonempty_nulls` that always access the column offsets without considering the starting offset. As such, it may give wrong answer if the input column is sliced.

Closes #13638.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Raza Jafri (https://github.com/razajafri)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: #13647
  • Loading branch information
ttnghia authored Jun 29, 2023
1 parent 086ee94 commit a17ea9c
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 8 deletions.
7 changes: 3 additions & 4 deletions cpp/src/copying/purge_nonempty_nulls.cu
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,10 @@ bool has_nonempty_null_rows(cudf::column_view const& input, rmm::cuda_stream_vie

// Cross-reference nullmask and offsets.
auto const type = input.type().id();
auto const offsets = (type == type_id::STRING) ? (strings_column_view{input}).offsets()
: (lists_column_view{input}).offsets();
auto const offsets = (type == type_id::STRING) ? (strings_column_view{input}).offsets_begin()
: (lists_column_view{input}).offsets_begin();
auto const d_input = cudf::column_device_view::create(input, stream);
auto const is_dirty_row = [d_input = *d_input, offsets = offsets.begin<size_type>()] __device__(
size_type const& row_idx) {
auto const is_dirty_row = [d_input = *d_input, offsets] __device__(size_type const& row_idx) {
return d_input.is_null_nocheck(row_idx) && (offsets[row_idx] != offsets[row_idx + 1]);
};

Expand Down
39 changes: 35 additions & 4 deletions cpp/tests/copying/purge_nonempty_nulls_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,41 @@ using gather_map_t = cudf::test::fixed_width_column_wrapper<cudf::size_type>;
template <typename T>
using LCW = cudf::test::lists_column_wrapper<T, int32_t>;

struct HasNonEmptyNullsTest : public cudf::test::BaseFixture {};

TEST_F(HasNonEmptyNullsTest, TrivialTest)
{
auto const input = LCW<T>{{{{1, 2, 3, 4}, null_at(2)},
{5},
{6, 7}, // <--- Will be set to NULL. Unsanitized row.
{8, 9, 10}},
no_nulls()}
.release();
EXPECT_FALSE(cudf::may_have_nonempty_nulls(*input));
EXPECT_FALSE(cudf::has_nonempty_nulls(*input));

// Set nullmask, post construction.
cudf::detail::set_null_mask(
input->mutable_view().null_mask(), 2, 3, false, cudf::get_default_stream());
input->set_null_count(1);
EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input));
EXPECT_TRUE(cudf::has_nonempty_nulls(*input));
}

TEST_F(HasNonEmptyNullsTest, SlicedInputTest)
{
auto const input = cudf::test::strings_column_wrapper{
{"" /*NULL*/, "111", "222", "333", "444", "" /*NULL*/, "", "777", "888", "" /*NULL*/, "101010"},
cudf::test::iterators::nulls_at({0, 5, 9})};

// Split into 2 columns from rows [0, 2) and [2, 10).
auto const result = cudf::split(input, {2});
for (auto const& col : result) {
EXPECT_TRUE(cudf::may_have_nonempty_nulls(col));
EXPECT_FALSE(cudf::has_nonempty_nulls(col));
}
}

struct PurgeNonEmptyNullsTest : public cudf::test::BaseFixture {
/// Helper to run gather() on a single column, and extract the single column from the result.
std::unique_ptr<cudf::column> gather(cudf::column_view const& input,
Expand Down Expand Up @@ -69,15 +104,11 @@ TEST_F(PurgeNonEmptyNullsTest, SingleLevelList)
{8, 9, 10}},
no_nulls()}
.release();
EXPECT_FALSE(cudf::may_have_nonempty_nulls(*input));
EXPECT_FALSE(cudf::has_nonempty_nulls(*input));

// Set nullmask, post construction.
cudf::detail::set_null_mask(
input->mutable_view().null_mask(), 2, 3, false, cudf::get_default_stream());
input->set_null_count(1);
EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input));
EXPECT_TRUE(cudf::has_nonempty_nulls(*input));

test_purge(*input);

Expand Down

0 comments on commit a17ea9c

Please sign in to comment.