From 3bbec6e1e59971adb87cc20802532ec1626123b4 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 16 Sep 2021 19:27:19 -0600 Subject: [PATCH] Fix `gather` for sliced input structs column (#9218) This PR fixes the `gather` API for structs columns when the input is a sliced column. Previously, `gather` calls `child_begin()` and `child_end()` to access the children column so if the input structs column is sliced then the output is incorrect. This closes #9213, and is blocked by https://github.com/rapidsai/cudf/pull/9194 due to conflict work. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - MithunR (https://github.com/mythrocks) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/9218 --- cpp/include/cudf/detail/gather.cuh | 27 +++++++---- cpp/tests/copying/gather_struct_tests.cpp | 58 +++++++++++++++++++++++ 2 files changed, 76 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 94433975744..06de9ff2716 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -450,16 +450,24 @@ struct column_gatherer_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - structs_column_view structs_column(column); - auto gather_map_size{std::distance(gather_map_begin, gather_map_end)}; + auto const gather_map_size = std::distance(gather_map_begin, gather_map_end); if (gather_map_size == 0) { return empty_like(column); } + // Gathering needs to operate on the sliced children since they need to take into account the + // offset of the parent structs column. + std::vector sliced_children; + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(column.num_children()), + std::back_inserter(sliced_children), + [structs_view = structs_column_view{column}](auto const idx) { + return structs_view.get_sliced_child(idx); + }); + std::vector> output_struct_members; - std::transform(structs_column.child_begin(), - structs_column.child_end(), + std::transform(sliced_children.begin(), + sliced_children.end(), std::back_inserter(output_struct_members), - [&gather_map_begin, &gather_map_end, nullify_out_of_bounds, stream, mr]( - cudf::column_view const& col) { + [&](auto const& col) { return cudf::type_dispatcher(col.type(), column_gatherer{}, col, @@ -471,14 +479,15 @@ struct column_gatherer_impl { }); auto const nullable = - nullify_out_of_bounds or std::any_of(structs_column.child_begin(), - structs_column.child_end(), + nullify_out_of_bounds || std::any_of(sliced_children.begin(), + sliced_children.end(), [](auto const& col) { return col.nullable(); }); + if (nullable) { gather_bitmask( // Table view of struct column. cudf::table_view{ - std::vector{structs_column.child_begin(), structs_column.child_end()}}, + std::vector{sliced_children.begin(), sliced_children.end()}}, gather_map_begin, output_struct_members, nullify_out_of_bounds ? gather_bitmask_op::NULLIFY : gather_bitmask_op::DONT_CHECK, diff --git a/cpp/tests/copying/gather_struct_tests.cpp b/cpp/tests/copying/gather_struct_tests.cpp index f6047cf14af..6927e8ab476 100644 --- a/cpp/tests/copying/gather_struct_tests.cpp +++ b/cpp/tests/copying/gather_struct_tests.cpp @@ -177,6 +177,64 @@ TYPED_TEST(TypedStructGatherTest, TestSimpleStructGather) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(output->view(), expected_output); } +TYPED_TEST(TypedStructGatherTest, TestSlicedStructsColumnGatherNoNulls) +{ + auto const structs_original = [] { + auto child1 = fixed_width_column_wrapper{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + auto child2 = strings_column_wrapper{ + "One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine", "Ten"}; + return structs_column_wrapper{{child1, child2}}; + }(); + + auto const expected = [] { + auto child1 = fixed_width_column_wrapper{6, 10, 8}; + auto child2 = strings_column_wrapper{"Six", "Ten", "Eight"}; + return structs_column_wrapper{{child1, child2}}; + }(); + + auto const structs = cudf::slice(structs_original, {4, 10})[0]; + auto const gather_map = fixed_width_column_wrapper{1, 5, 3}; + auto const result = cudf::gather(cudf::table_view{{structs}}, gather_map)->get_column(0); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.view(), expected); +} + +TYPED_TEST(TypedStructGatherTest, TestSlicedStructsColumnGatherWithNulls) +{ + auto constexpr null = int32_t{0}; // null at child + auto constexpr XXX = int32_t{0}; // null at parent + + auto const structs_original = [] { + auto child1 = fixed_width_column_wrapper{ + {null, XXX, 3, null, null, 6, XXX, null, null, 10}, nulls_at({0, 3, 4, 7, 8})}; + auto child2 = strings_column_wrapper{{"One", + "" /*NULL at both parent and child*/, + "Three", + "" /*NULL*/, + "Five", + "" /*NULL*/, + "" /*NULL at parent*/, + "" /*NULL*/, + "Nine", + "" /*NULL*/}, + nulls_at({1, 3, 5, 7, 9})}; + return structs_column_wrapper{{child1, child2}, nulls_at({1, 6})}; + }(); + + auto const expected = [] { + auto child1 = fixed_width_column_wrapper{{6, 10, null, XXX}, null_at(2)}; + auto child2 = strings_column_wrapper{{ + "" /*NULL*/, "" /*NULL*/, "Nine", "" /*NULL at parent*/ + }, + nulls_at({0, 1})}; + return structs_column_wrapper{{child1, child2}, null_at(3)}; + }(); + + auto const structs = cudf::slice(structs_original, {4, 10})[0]; + auto const gather_map = fixed_width_column_wrapper{1, 5, 4, 2}; + auto const result = cudf::gather(cudf::table_view{{structs}}, gather_map)->get_column(0); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.view(), expected); +} + TYPED_TEST(TypedStructGatherTest, TestNullifyOnNonNullInput) { // Test that the null masks of the struct output (and its children) are set correctly,