From 8e42e738e8ce82b832903ae9f1ddb592a3082ac2 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 19 Apr 2021 23:45:14 -0400 Subject: [PATCH] Fix incorrect explode_outer_position column null values in some cases (#7843) The null values in the position column didn't match up to expectations exactly. It can't be directly copied from the exploded column as the exploded column may contain null values that shouldn't be null in the position column. Fixes #7787 Authors: - Mike Wilson (https://github.com/hyperbolic2346) Approvers: - Mark Harris (https://github.com/harrism) - Jason Lowe (https://github.com/jlowe) - Nghia Truong (https://github.com/ttnghia) - Jake Hemstad (https://github.com/jrhemstad) URL: https://github.com/rapidsai/cudf/pull/7843 --- cpp/src/lists/explode.cu | 44 +++-- cpp/tests/lists/explode_tests.cpp | 179 +++++++++++++++++- .../test/java/ai/rapids/cudf/TableTest.java | 2 +- 3 files changed, 204 insertions(+), 21 deletions(-) diff --git a/cpp/src/lists/explode.cu b/cpp/src/lists/explode.cu index 5f6f1c308ac..1fe750e69e0 100644 --- a/cpp/src/lists/explode.cu +++ b/cpp/src/lists/explode.cu @@ -36,6 +36,11 @@ namespace cudf { namespace detail { + +// explode column gather map uses cudf::out_of_bounds_policy::NULLIFY to +// fill nulls where there are invalid indices +constexpr size_type InvalidIndex = -1; + namespace { std::unique_ptr build_table( @@ -62,27 +67,35 @@ std::unique_ptr
build_table( std::vector> columns = gathered_table.release()->release(); - auto inserted = columns.insert(columns.begin() + explode_column_idx, - explode_col_gather_map - ? std::move(detail::gather(table_view({sliced_child}), - explode_col_gather_map->begin(), - explode_col_gather_map->end(), - cudf::out_of_bounds_policy::NULLIFY, - stream, - mr) - ->release()[0]) - : std::make_unique(sliced_child, stream, mr)); + columns.insert(columns.begin() + explode_column_idx, + explode_col_gather_map + ? std::move(detail::gather(table_view({sliced_child}), + explode_col_gather_map->begin(), + explode_col_gather_map->end(), + cudf::out_of_bounds_policy::NULLIFY, + stream, + mr) + ->release()[0]) + : std::make_unique(sliced_child, stream, mr)); if (position_array) { size_type position_size = position_array->size(); - // the null mask for position matches the exploded column's gather map, so copy it over - rmm::device_buffer nullmask = - explode_col_gather_map ? copy_bitmask(*inserted->get()) : rmm::device_buffer(0, stream); + // build the null mask for position based on invalid entries in gather map + auto nullmask = explode_col_gather_map ? valid_if( + explode_col_gather_map->begin(), + explode_col_gather_map->end(), + [] __device__(auto i) { return i != InvalidIndex; }, + stream, + mr) + : std::pair{ + rmm::device_buffer(0, stream), size_type{0}}; + columns.insert(columns.begin() + explode_column_idx, std::make_unique(data_type(type_to_id()), position_size, position_array->release(), - std::move(nullmask))); + nullmask.first, + nullmask.second)); } return std::make_unique
(std::move(columns)); @@ -243,8 +256,7 @@ std::unique_ptr
explode_outer(table_view const& input_table, : offsets[idx] + null_or_empty_offset_p[idx] - 1; gather_map_p[invalid_index] = idx; - // negative one to indicate a null value - explode_col_gather_map_p[invalid_index] = -1; + explode_col_gather_map_p[invalid_index] = InvalidIndex; if (include_position) { position_array[invalid_index] = 0; } } }; diff --git a/cpp/tests/lists/explode_tests.cpp b/cpp/tests/lists/explode_tests.cpp index ded3d2b9193..1685f2793ce 100644 --- a/cpp/tests/lists/explode_tests.cpp +++ b/cpp/tests/lists/explode_tests.cpp @@ -370,6 +370,91 @@ TEST_F(ExplodeTest, NestedStructs) CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); } +TEST_F(ExplodeTest, ListOfStructsWithEmpties) +{ + // a b + // [{1}}] "a" + // [{null}] "b" + // [null] "c" + // [] "d" + // null "e" + + constexpr auto null = 0; + + // row 0. 1 struct that contains a single int + cudf::test::fixed_width_column_wrapper i0{1}; + std::vector> s0_cols; + s0_cols.push_back(i0.release()); + cudf::test::structs_column_wrapper s0(std::move(s0_cols)); + cudf::test::fixed_width_column_wrapper off0{0, 1}; + auto row0 = cudf::make_lists_column(1, off0.release(), s0.release(), 0, rmm::device_buffer{}); + + // row 1. 1 struct that contains a null value + cudf::test::fixed_width_column_wrapper i1{{1}, {false}}; + std::vector> s1_cols; + s1_cols.push_back(i1.release()); + cudf::test::structs_column_wrapper s1(std::move(s1_cols)); + cudf::test::fixed_width_column_wrapper off1{0, 1}; + auto row1 = cudf::make_lists_column(1, off1.release(), s1.release(), 0, rmm::device_buffer{}); + + // row 2. 1 null struct + cudf::test::fixed_width_column_wrapper i2{0}; + std::vector> s2_cols; + s2_cols.push_back(i2.release()); + std::vector r2_valids{false}; + auto s2 = cudf::make_structs_column( + 1, + std::move(s2_cols), + 1, + cudf::test::detail::make_null_mask(r2_valids.begin(), r2_valids.end())); + cudf::test::fixed_width_column_wrapper off2{0, 1}; + auto row2 = cudf::make_lists_column(1, off2.release(), std::move(s2), 0, rmm::device_buffer{}); + + // row 3. empty list. + cudf::test::fixed_width_column_wrapper i3{}; + std::vector> s3_cols; + s3_cols.push_back(i3.release()); + auto s3 = cudf::make_structs_column(0, std::move(s3_cols), 0, rmm::device_buffer{}); + cudf::test::fixed_width_column_wrapper off3{0, 0}; + auto row3 = cudf::make_lists_column(1, off3.release(), std::move(s3), 0, rmm::device_buffer{}); + + // row 4. null list + cudf::test::fixed_width_column_wrapper i4{}; + std::vector> s4_cols; + s4_cols.push_back(i4.release()); + auto s4 = cudf::make_structs_column(0, std::move(s4_cols), 0, rmm::device_buffer{}); + cudf::test::fixed_width_column_wrapper off4{0, 0}; + std::vector r4_valids{false}; + auto row4 = + cudf::make_lists_column(1, + off4.release(), + std::move(s4), + 1, + cudf::test::detail::make_null_mask(r4_valids.begin(), r4_valids.end())); + + // concatenated + auto final_col = + cudf::concatenate(std::vector({*row0, *row1, *row2, *row3, *row4})); + auto s = strings_column_wrapper({"a", "b", "c", "d", "e"}).release(); + + cudf::table_view t({final_col->view(), s->view()}); + + auto ret = cudf::explode(t, 0); + auto expected_numeric_col = fixed_width_column_wrapper{{1, null, null}, {1, 0, 0}}; + + auto expected_a = structs_column_wrapper{{expected_numeric_col}, {1, 1, 0}}.release(); + auto expected_b = strings_column_wrapper({"a", "b", "c"}).release(); + + cudf::table_view expected({expected_a->view(), expected_b->view()}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + FCW expected_pos_col{0, 0, 0}; + cudf::table_view pos_expected({expected_pos_col, expected_a->view(), expected_b->view()}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + TYPED_TEST(ExplodeTypedTest, ListOfStructs) { // a b @@ -532,7 +617,6 @@ TEST_F(ExplodeOuterTest, SingleNull) FCW expected_pos_col{{0, 0, 1, 0, 0, 1}, {0, 1, 1, 0, 1, 1}}; cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - auto pos_ret = cudf::explode_outer_position(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); } @@ -624,7 +708,7 @@ TEST_F(ExplodeOuterTest, SequentialNulls) auto ret = cudf::explode_outer(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - FCW expected_pos_col{{0, 1, 2, 0, 1, 0, 0, 0, 1, 2}, {1, 1, 0, 1, 1, 0, 0, 1, 1, 1}}; + FCW expected_pos_col{{0, 1, 2, 0, 1, 0, 0, 0, 1, 2}, {1, 1, 1, 1, 1, 0, 0, 1, 1, 1}}; cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); auto pos_ret = cudf::explode_outer_position(t, 0); @@ -753,7 +837,7 @@ TEST_F(ExplodeOuterTest, NullsInList) CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - FCW expected_pos_col{{0, 1, 2, 0, 1, 2, 3, 0, 0, 1, 2}, {1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1}}; + FCW expected_pos_col{{0, 1, 2, 0, 1, 2, 3, 0, 0, 1, 2}, {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); auto pos_ret = cudf::explode_outer_position(t, 0); @@ -885,7 +969,7 @@ TEST_F(ExplodeOuterTest, NullsInNestedDoubleExplode) CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); FCW expected_pos_col{{0, 1, 0, 0, 1, 2, 0, 1, 0, 1, 0, 0, 1}, - {1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}}; + {1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}; cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); auto pos_ret = cudf::explode_outer_position(first_explode_ret->view(), 0); @@ -931,6 +1015,93 @@ TEST_F(ExplodeOuterTest, NestedStructs) CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); } +TEST_F(ExplodeOuterTest, ListOfStructsWithEmpties) +{ + // a b + // [{1}}] "a" + // [{null}] "b" + // [null] "c" + // [] "d" + // null "e" + + constexpr auto null = 0; + + // row 0. 1 struct that contains a single int + cudf::test::fixed_width_column_wrapper i0{1}; + std::vector> s0_cols; + s0_cols.push_back(i0.release()); + cudf::test::structs_column_wrapper s0(std::move(s0_cols)); + cudf::test::fixed_width_column_wrapper off0{0, 1}; + auto row0 = cudf::make_lists_column(1, off0.release(), s0.release(), 0, rmm::device_buffer{}); + + // row 1. 1 struct that contains a null value + cudf::test::fixed_width_column_wrapper i1{{1}, {false}}; + std::vector> s1_cols; + s1_cols.push_back(i1.release()); + cudf::test::structs_column_wrapper s1(std::move(s1_cols)); + cudf::test::fixed_width_column_wrapper off1{0, 1}; + auto row1 = cudf::make_lists_column(1, off1.release(), s1.release(), 0, rmm::device_buffer{}); + + // row 2. 1 null struct + cudf::test::fixed_width_column_wrapper i2{0}; + std::vector> s2_cols; + s2_cols.push_back(i2.release()); + std::vector r2_valids{false}; + auto s2 = cudf::make_structs_column( + 1, + std::move(s2_cols), + 1, + cudf::test::detail::make_null_mask(r2_valids.begin(), r2_valids.end())); + cudf::test::fixed_width_column_wrapper off2{0, 1}; + auto row2 = cudf::make_lists_column(1, off2.release(), std::move(s2), 0, rmm::device_buffer{}); + + // row 3. empty list. + cudf::test::fixed_width_column_wrapper i3{}; + std::vector> s3_cols; + s3_cols.push_back(i3.release()); + auto s3 = cudf::make_structs_column(0, std::move(s3_cols), 0, rmm::device_buffer{}); + cudf::test::fixed_width_column_wrapper off3{0, 0}; + auto row3 = cudf::make_lists_column(1, off3.release(), std::move(s3), 0, rmm::device_buffer{}); + + // row 4. null list + cudf::test::fixed_width_column_wrapper i4{}; + std::vector> s4_cols; + s4_cols.push_back(i4.release()); + auto s4 = cudf::make_structs_column(0, std::move(s4_cols), 0, rmm::device_buffer{}); + cudf::test::fixed_width_column_wrapper off4{0, 0}; + std::vector r4_valids{false}; + auto row4 = + cudf::make_lists_column(1, + off4.release(), + std::move(s4), + 1, + cudf::test::detail::make_null_mask(r4_valids.begin(), r4_valids.end())); + + // concatenated + auto final_col = + cudf::concatenate(std::vector({*row0, *row1, *row2, *row3, *row4})); + auto s = strings_column_wrapper({"a", "b", "c", "d", "e"}).release(); + + cudf::table_view t({final_col->view(), s->view()}); + + auto ret = cudf::explode_outer(t, 0); + + auto expected_numeric_col = + fixed_width_column_wrapper{{1, null, null, null, null}, {1, 0, 0, 0, 0}}; + + auto expected_a = structs_column_wrapper{{expected_numeric_col}, {1, 1, 0, 0, 0}}.release(); + auto expected_b = strings_column_wrapper({"a", "b", "c", "d", "e"}).release(); + + cudf::table_view expected({expected_a->view(), expected_b->view()}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + FCW expected_pos_col{{0, 0, 0, null, null}, {1, 1, 1, 0, 0}}; + cudf::table_view pos_expected({expected_pos_col, expected_a->view(), expected_b->view()}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + TYPED_TEST(ExplodeOuterTypedTest, ListOfStructs) { // a b diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index d7321a665c5..670ec585da3 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -5169,7 +5169,7 @@ private Table[] buildExplodeTestTableWithNestedTypes(boolean pos, boolean outer) Table.TestBuilder expectedBuilder = new Table.TestBuilder(); if (pos) { if (outer) { - expectedBuilder.column(0, 1, 2, 0, 1, 0, null, null); + expectedBuilder.column(0, 1, 2, 0, 1, 0, 0, null); } else { expectedBuilder.column(0, 1, 2, 0, 1, 0, 0); }