From db21232968e91ae2374990b11783bed6b3a0dc23 Mon Sep 17 00:00:00 2001 From: Nghia Truong <ttnghia@users.noreply.github.com> Date: Fri, 7 May 2021 08:17:12 -0600 Subject: [PATCH] Fix struct scatter to correctly cascade null_mask to children columns (#8176) This fixes a bug in `scatter.cuh` where the struct column failed to cascade its null_mask to its children. Typically, this is automatically done during struct construction. Unfortunately, the current scatter function constructs a structs column first, then updates the parent null_mask later thus it fails to update its children's null_mask. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - MithunR (https://github.com/mythrocks) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/8176 --- cpp/include/cudf/detail/scatter.cuh | 29 ++- cpp/tests/copying/scatter_struct_tests.cpp | 249 ++++++++------------- 2 files changed, 113 insertions(+), 165 deletions(-) diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh index 68f4adfadc9..410cd213618 100644 --- a/cpp/include/cudf/detail/scatter.cuh +++ b/cpp/include/cudf/detail/scatter.cuh @@ -296,7 +296,7 @@ struct column_scatterer_impl<struct_view> { // We still need to call `gather_bitmask` even when the source's children are not nullable, // as if the target's children have null_masks, those null_masks need to be updated after - // being scattered onto + // being scattered onto. auto const child_nullable = std::any_of(structs_src.child_begin(), structs_src.child_end(), [](auto const& col) { return col.nullable(); }) or @@ -315,7 +315,7 @@ struct column_scatterer_impl<struct_view> { mr); } - // Need to put the result column in a vector to call `gather_bitmask` + // Need to put the result column in a vector to call `gather_bitmask`. std::vector<std::unique_ptr<column>> result; result.emplace_back(cudf::make_structs_column(target.size(), std::move(output_struct_members), @@ -325,7 +325,7 @@ struct column_scatterer_impl<struct_view> { mr)); // Only gather bitmask from the target column for the rows that have not been scattered onto - // The bitmask from the source column will be gathered at the top level `scatter()` call + // The bitmask from the source column will be gathered at the top level `scatter()` call. if (target.nullable()) { auto const gather_map = scatter_to_gather_complement(scatter_map_begin, scatter_map_end, target.size(), stream); @@ -402,7 +402,7 @@ std::unique_ptr<table> scatter( CUDF_EXPECTS(std::distance(scatter_map_begin, scatter_map_end) <= source.num_rows(), "scatter map size should be <= to number of rows in source"); - // Transform negative indices to index + target size + // Transform negative indices to index + target size. auto updated_scatter_map_begin = thrust::make_transform_iterator(scatter_map_begin, index_converter<MapType>{target.num_rows()}); auto updated_scatter_map_end = @@ -425,7 +425,7 @@ std::unique_ptr<table> scatter( }); // We still need to call `gather_bitmask` even when the source columns are not nullable, - // as if the target has null_mask, that null_mask needs to be updated after scattering + // as if the target has null_mask, that null_mask needs to be updated after scattering. auto const nullable = std::any_of(source.begin(), source.end(), [](auto const& col) { return col.nullable(); }) or std::any_of(target.begin(), target.end(), [](auto const& col) { return col.nullable(); }); @@ -433,6 +433,25 @@ std::unique_ptr<table> scatter( auto const gather_map = scatter_to_gather( updated_scatter_map_begin, updated_scatter_map_end, target.num_rows(), stream); gather_bitmask(source, gather_map.begin(), result, gather_bitmask_op::PASSTHROUGH, stream, mr); + + // For struct columns, we need to superimpose the null_mask of the parent over the null_mask of + // the children. + std::for_each(result.begin(), result.end(), [=](auto& col) { + auto const col_view = col->view(); + if (col_view.type().id() == type_id::STRUCT and col_view.nullable()) { + auto const num_rows = col_view.size(); + auto const null_count = col_view.null_count(); + auto contents = col->release(); + + // Children null_mask will be superimposed during structs column construction. + col = cudf::make_structs_column(num_rows, + std::move(contents.children), + null_count, + std::move(*contents.null_mask), + stream, + mr); + } + }); } return std::make_unique<table>(std::move(result)); } diff --git a/cpp/tests/copying/scatter_struct_tests.cpp b/cpp/tests/copying/scatter_struct_tests.cpp index 3993664168b..4ca805f2c18 100644 --- a/cpp/tests/copying/scatter_struct_tests.cpp +++ b/cpp/tests/copying/scatter_struct_tests.cpp @@ -17,23 +17,21 @@ #include <cudf_test/base_fixture.hpp> #include <cudf_test/column_utilities.hpp> #include <cudf_test/column_wrapper.hpp> +#include <cudf_test/iterator_utilities.hpp> #include <cudf_test/type_lists.hpp> #include <cudf/copying.hpp> -#include <cudf/detail/iterator.cuh> #include <cudf/lists/lists_column_view.hpp> #include <cudf/table/table_view.hpp> -#include <cudf/utilities/error.hpp> - -#include <memory> using bools_col = cudf::test::fixed_width_column_wrapper<bool>; using int32s_col = cudf::test::fixed_width_column_wrapper<int32_t>; using structs_col = cudf::test::structs_column_wrapper; using strings_col = cudf::test::strings_column_wrapper; -constexpr int32_t null{0}; // Mark for null child elements -constexpr int32_t XXX{0}; // Mark for null struct elements +constexpr bool print_all{false}; // For debugging +constexpr int32_t null{0}; // Mark for null child elements +constexpr int32_t XXX{0}; // Mark for null struct elements template <typename T> struct TypedStructScatterTest : public cudf::test::BaseFixture { @@ -47,15 +45,18 @@ using TestTypes = cudf::test::Concat<cudf::test::IntegralTypes, TYPED_TEST_CASE(TypedStructScatterTest, TestTypes); namespace { -void test_scatter(std::unique_ptr<cudf::column> const& structs_src, - std::unique_ptr<cudf::column> const& structs_tgt, - std::unique_ptr<cudf::column> const& structs_expected, - std::unique_ptr<cudf::column> const& scatter_map) +auto no_null() { return cudf::test::iterator_no_null(); } + +auto null_at(cudf::size_type idx) { return cudf::test::iterator_with_null_at(idx); } + +auto scatter_structs(std::unique_ptr<cudf::column> const& structs_src, + std::unique_ptr<cudf::column> const& structs_tgt, + std::unique_ptr<cudf::column> const& scatter_map) { auto const source = cudf::table_view{std::vector<cudf::column_view>{structs_src->view()}}; auto const target = cudf::table_view{std::vector<cudf::column_view>{structs_tgt->view()}}; auto const result = cudf::scatter(source, scatter_map->view(), target); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(structs_expected->view(), result->get_column(0)); + return result->get_column(0); } } // namespace @@ -65,14 +66,14 @@ TYPED_TEST(TypedStructScatterTest, EmptyInputTest) using col_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>; auto child_col_src = col_wrapper{}; - auto const structs_src = structs_col{{child_col_src}, std::vector<bool>{}}.release(); + auto const structs_src = structs_col{{child_col_src}}.release(); auto child_col_tgt = col_wrapper{}; - auto const structs_tgt = structs_col{{child_col_tgt}, std::vector<bool>{}}.release(); + auto const structs_tgt = structs_col{{child_col_tgt}}.release(); auto const scatter_map = int32s_col{}.release(); - test_scatter(structs_src, structs_tgt, structs_src, scatter_map); - test_scatter(structs_src, structs_tgt, structs_tgt, scatter_map); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *structs_src, scatter_structs(structs_src, structs_tgt, scatter_map), print_all); } // Test case when only the scatter map is empty @@ -80,81 +81,49 @@ TYPED_TEST(TypedStructScatterTest, EmptyScatterMapTest) { using col_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>; - auto child_col_src = - col_wrapper{{0, 1, 2, 3, null, XXX}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; })}; - auto const structs_src = structs_col{ - {child_col_src}, cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return i != 5; - })}.release(); - - auto child_col_tgt = - col_wrapper{{50, null, 70, XXX, 90, 100}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}; - auto const structs_tgt = structs_col{ - {child_col_tgt}, cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return i != 3; - })}.release(); + auto child_col_src = col_wrapper{{0, 1, 2, 3, null, XXX}, null_at(4)}; + auto const structs_src = structs_col{{child_col_src}, null_at(5)}.release(); + + auto child_col_tgt = col_wrapper{{50, null, 70, XXX, 90, 100}, null_at(1)}; + auto const structs_tgt = structs_col{{child_col_tgt}, null_at(3)}.release(); auto const scatter_map = int32s_col{}.release(); - test_scatter(structs_src, structs_tgt, structs_tgt, scatter_map); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *structs_tgt, scatter_structs(structs_src, structs_tgt, scatter_map), print_all); } TYPED_TEST(TypedStructScatterTest, ScatterAsCopyTest) { using col_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>; - auto child_col_src = - col_wrapper{{0, 1, 2, 3, null, XXX}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; })}; - auto const structs_src = structs_col{ - {child_col_src}, cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return i != 5; - })}.release(); - - auto child_col_tgt = - col_wrapper{{50, null, 70, XXX, 90, 100}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}; - auto const structs_tgt = structs_col{ - {child_col_tgt}, cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return i != 3; - })}.release(); + auto child_col_src = col_wrapper{{0, 1, 2, 3, null, XXX}, null_at(4)}; + auto const structs_src = structs_col{{child_col_src}, null_at(5)}.release(); + + auto child_col_tgt = col_wrapper{{50, null, 70, XXX, 90, 100}, null_at(1)}; + auto const structs_tgt = structs_col{{child_col_tgt}, null_at(3)}.release(); // Scatter as copy: the target should be the same as source auto const scatter_map = int32s_col{0, 1, 2, 3, 4, 5}.release(); - test_scatter(structs_src, structs_tgt, structs_src, scatter_map); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *structs_src, scatter_structs(structs_src, structs_tgt, scatter_map), print_all); } TYPED_TEST(TypedStructScatterTest, ScatterAsLeftShiftTest) { using col_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>; - auto child_col_src = - col_wrapper{{0, 1, 2, 3, null, XXX}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; })}; - auto const structs_src = structs_col{ - {child_col_src}, cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return i != 5; - })}.release(); - - auto child_col_tgt = - col_wrapper{{50, null, 70, XXX, 90, 100}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}; - auto const structs_tgt = structs_col{ - {child_col_tgt}, cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return i != 3; - })}.release(); - - auto child_col_expected = - col_wrapper{{2, 3, null, XXX, 0, 1}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2; })}; - auto structs_expected = structs_col{ - {child_col_expected}, cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return i != 3; - })}.release(); + auto child_col_src = col_wrapper{{0, 1, 2, 3, null, XXX}, null_at(4)}; + auto const structs_src = structs_col{{child_col_src}, null_at(5)}.release(); + + auto child_col_tgt = col_wrapper{{50, null, 70, XXX, 90, 100}, null_at(1)}; + auto const structs_tgt = structs_col{{child_col_tgt}, null_at(3)}.release(); + + auto child_col_expected = col_wrapper{{2, 3, null, XXX, 0, 1}, null_at(2)}; + auto structs_expected = structs_col{{child_col_expected}, null_at(3)}.release(); auto const scatter_map = int32s_col{-2, -1, 0, 1, 2, 3}.release(); - test_scatter(structs_src, structs_tgt, structs_expected, scatter_map); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *structs_expected, scatter_structs(structs_src, structs_tgt, scatter_map), print_all); } TYPED_TEST(TypedStructScatterTest, SimpleScatterTests) @@ -162,44 +131,26 @@ TYPED_TEST(TypedStructScatterTest, SimpleScatterTests) using col_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>; // Source data - auto child_col_src = - col_wrapper{{0, 1, 2, 3, null, XXX}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; })}; - auto const structs_src = structs_col{ - {child_col_src}, cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return i != 5; - })}.release(); + auto child_col_src = col_wrapper{{0, 1, 2, 3, null, XXX}, null_at(4)}; + auto const structs_src = structs_col{{child_col_src}, null_at(5)}.release(); // Target data - auto child_col_tgt = - col_wrapper{{50, null, 70, XXX, 90, 100}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}; - auto const structs_tgt = structs_col{ - {child_col_tgt}, cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return i != 3; - })}.release(); + auto child_col_tgt = col_wrapper{{50, null, 70, XXX, 90, 100}, null_at(1)}; + auto const structs_tgt = structs_col{{child_col_tgt}, null_at(3)}.release(); // Expected data - auto child_col_expected1 = - col_wrapper{{1, null, 70, XXX, 0, 2}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}; - auto const structs_expected1 = structs_col{ - {child_col_expected1}, cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return i != 3; - })}.release(); - auto const scatter_map1 = int32s_col{-2, 0, 5}.release(); - test_scatter(structs_src, structs_tgt, structs_expected1, scatter_map1); + auto child_col_expected1 = col_wrapper{{1, null, 70, XXX, 0, 2}, null_at(1)}; + auto const structs_expected1 = structs_col{{child_col_expected1}, null_at(3)}.release(); + auto const scatter_map1 = int32s_col{-2, 0, 5}.release(); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *structs_expected1, scatter_structs(structs_src, structs_tgt, scatter_map1), print_all); // Expected data - auto child_col_expected2 = - col_wrapper{{1, null, 70, 3, 0, 2}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}; - auto const structs_expected2 = structs_col{ - {child_col_expected2}, cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return true; - })}.release(); - auto const scatter_map2 = int32s_col{-2, 0, 5, 3}.release(); - test_scatter(structs_src, structs_tgt, structs_expected2, scatter_map2); + auto child_col_expected2 = col_wrapper{{1, null, 70, 3, 0, 2}, null_at(1)}; + auto const structs_expected2 = structs_col{{child_col_expected2}, no_null()}.release(); + auto const scatter_map2 = int32s_col{-2, 0, 5, 3}.release(); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *structs_expected2, scatter_structs(structs_src, structs_tgt, scatter_map2), print_all); } TYPED_TEST(TypedStructScatterTest, ComplexDataScatterTest) @@ -209,61 +160,41 @@ TYPED_TEST(TypedStructScatterTest, ComplexDataScatterTest) // Source data auto names_column_src = - strings_col{{"Newton", "Washington", "Cherry", "Kiwi", "Lemon", "Tomato"}, - cudf::detail::make_counting_transform_iterator(0, [](auto) { return true; })}; - auto ages_column_src = - col_wrapper{{5, 10, 15, 20, 25, 30}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; })}; + strings_col{{"Newton", "Washington", "Cherry", "Kiwi", "Lemon", "Tomato" /*XXX*/}, no_null()}; + auto ages_column_src = col_wrapper{{5, 10, 15, 20, null, XXX}, null_at(4)}; auto is_human_col_src = - bools_col{{true, true, false, false, false, false}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; })}; + bools_col{{true, true, false, false /*null*/, false, false /*XXX*/}, null_at(3)}; + auto const structs_src = + structs_col{{names_column_src, ages_column_src, is_human_col_src}, null_at(5)}.release(); // Target data - auto names_column_tgt = - strings_col{{"String 0", "String 1", "String 2", "String 3", "String 4", "String 5"}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0; })}; - auto ages_column_tgt = - col_wrapper{{50, 60, 70, 80, 90, 100}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}; - auto is_human_col_tgt = - bools_col{{true, true, true, true, true, true}, - cudf::detail::make_counting_transform_iterator(0, [](auto) { return true; })}; + auto names_column_tgt = strings_col{ + {"String 0" /*null*/, "String 1", "String 2" /*XXX*/, "String 3", "String 4", "String 5"}, + null_at(0)}; + auto ages_column_tgt = col_wrapper{{50, null, XXX, 80, 90, 100}, null_at(1)}; + auto is_human_col_tgt = bools_col{{true, true, true /*XXX*/, true, true, true}, no_null()}; + auto const structs_tgt = + structs_col{{names_column_tgt, ages_column_tgt, is_human_col_tgt}, null_at(2)}.release(); // Expected data - auto names_column_expected = - strings_col{{"String 0", "Lemon", "Kiwi", "Cherry", "Washington", "Newton"}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0; })}; - auto ages_column_expected = - col_wrapper{{50, 25, 20, 15, 10, 5}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}; + auto names_column_expected = strings_col{ + {"String 0" /*null*/, "Lemon", "Kiwi", "Cherry", "Washington", "Newton"}, null_at(0)}; + auto ages_column_expected = col_wrapper{{50, null, 20, 15, 10, 5}, null_at(1)}; auto is_human_col_expected = - bools_col{{true, false, false, false, true, true}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2; })}; - - auto const structs_src = structs_col{ - {names_column_src, ages_column_src, is_human_col_src}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return i != 5; - })}.release(); - auto const structs_tgt = structs_col{ - {names_column_tgt, ages_column_tgt, is_human_col_tgt}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return i != 2; - })}.release(); - auto const structs_expected = structs_col{ - {names_column_expected, ages_column_expected, is_human_col_expected}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return true; - })}.release(); + bools_col{{true, false, false /*null*/, false, true, true}, null_at(2)}; + auto const structs_expected = + structs_col{{names_column_expected, ages_column_expected, is_human_col_expected}, no_null()} + .release(); // The first element of the target is not overwritten auto const scatter_map = int32s_col{-1, 4, 3, 2, 1}.release(); - test_scatter(structs_src, structs_tgt, structs_expected, scatter_map); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *structs_expected, scatter_structs(structs_src, structs_tgt, scatter_map), print_all); } TYPED_TEST(TypedStructScatterTest, ScatterStructOfListsTest) { - // Testing gather() on struct<list<numeric>> + // Testing scatter() on struct<list<numeric>> using lists_col = cudf::test::lists_column_wrapper<TypeParam, int32_t>; // Source data @@ -289,26 +220,24 @@ TYPED_TEST(TypedStructScatterTest, ScatterStructOfListsTest) // The first 2 elements of the target is not overwritten auto const scatter_map = int32s_col{-3, -2, -1, 5, 4, 3, 2}.release(); - test_scatter(structs_src, structs_tgt, structs_expected, scatter_map); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *structs_expected, scatter_structs(structs_src, structs_tgt, scatter_map), print_all); } -TYPED_TEST(TypedStructScatterTest, ScatterSourceSmallerThanTarget) +TYPED_TEST(TypedStructScatterTest, SourceSmallerThanTargetScatterTest) { - using namespace cudf; - using namespace cudf::test; + using col_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>; - using fixed_width = fixed_width_column_wrapper<TypeParam, int32_t>; - using structs_col = structs_column_wrapper; - using scatter_map_col = fixed_width_column_wrapper<offset_type, int32_t>; + auto child_col_src = col_wrapper{22, 55}; + auto const structs_src = structs_col{{child_col_src}}.release(); - auto source_child = fixed_width{22, 55}; - auto target_child = fixed_width{0, 1, 2, 3, 4, 5, 6}; - auto expected_child = fixed_width{0, 1, 22, 3, 4, 55, 6}; + auto child_col_tgt = col_wrapper{0, 1, 2, 3, 4, 5, 6}; + auto const structs_tgt = structs_col{{child_col_tgt}}.release(); - auto const source = structs_col{{source_child}}.release(); - auto const target = structs_col{{target_child}}.release(); - auto const scatter_map = scatter_map_col{2, 5}.release(); - auto const expected = structs_col{{expected_child}}.release(); + auto child_col_expected = col_wrapper{0, 1, 22, 3, 4, 55, 6}; + auto const structs_expected = structs_col{{child_col_expected}}.release(); - test_scatter(source, target, expected, scatter_map); + auto const scatter_map = int32s_col{2, 5}.release(); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *structs_expected, scatter_structs(structs_src, structs_tgt, scatter_map), print_all); }