diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 91f67fd0420..73373953544 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -359,6 +359,7 @@ add_library( src/join/mixed_join_size_kernel_nulls.cu src/join/mixed_join_size_kernels_semi.cu src/join/semi_join.cu + src/lists/apply_boolean_mask.cu src/lists/contains.cu src/lists/combine/concatenate_list_elements.cu src/lists/combine/concatenate_rows.cu diff --git a/cpp/include/cudf/lists/detail/stream_compaction.hpp b/cpp/include/cudf/lists/detail/stream_compaction.hpp new file mode 100644 index 00000000000..0e9f2ec16c4 --- /dev/null +++ b/cpp/include/cudf/lists/detail/stream_compaction.hpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +namespace cudf::lists::detail { + +/** + * @copydoc cudf::lists::apply_boolean_mask(lists_column_view const&, lists_column_view const&, + * rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches + */ +std::unique_ptr apply_boolean_mask( + lists_column_view const& input, + lists_column_view const& boolean_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace cudf::lists::detail diff --git a/cpp/include/cudf/lists/stream_compaction.hpp b/cpp/include/cudf/lists/stream_compaction.hpp new file mode 100644 index 00000000000..c7a9731eb65 --- /dev/null +++ b/cpp/include/cudf/lists/stream_compaction.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +namespace cudf::lists { + +/** + * @brief Filters elements in each row of `input` LIST column using `boolean_mask` + * LIST of booleans as a mask. + * + * Given an input `LIST` column and a list-of-bools column, the function produces + * a new `LIST` column of the same type as `input`, where each element is copied + * from the input row *only* if the corresponding `boolean_mask` is non-null and `true`. + * + * E.g. + * @code{.pseudo} + * input = { {0,1,2}, {3,4}, {5,6,7}, {8,9} }; + * boolean_mask = { {0,1,1}, {1,0}, {1,1,1}, {0,0} }; + * results = { {1,2}, {3}, {5,6,7}, {} }; + * @endcode + * + * `input` and `boolean_mask` must have the same number of rows. + * The output column has the same number of rows as the input column. + * An element is copied to an output row *only* if the corresponding boolean_mask element is `true`. + * An output row is invalid only if the input row is invalid. + * + * @throws cudf::logic_error if `boolean_mask` is not a "lists of bools" column + * @throws cudf::logic_error if `input` and `boolean_mask` have different number of rows + * + * @param input The input list column view to be filtered + * @param boolean_mask A nullable list of bools column used to filter `input` elements + * @param mr Device memory resource used to allocate the returned table's device memory + * @return List column of the same type as `input`, containing filtered list rows + */ +std::unique_ptr apply_boolean_mask( + lists_column_view const& input, + lists_column_view const& boolean_mask, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace cudf::lists diff --git a/cpp/src/lists/apply_boolean_mask.cu b/cpp/src/lists/apply_boolean_mask.cu new file mode 100644 index 00000000000..670e99dfbc8 --- /dev/null +++ b/cpp/src/lists/apply_boolean_mask.cu @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cudf::lists { +namespace detail { + +std::unique_ptr apply_boolean_mask(lists_column_view const& input, + lists_column_view const& boolean_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(boolean_mask.child().type().id() == type_id::BOOL8, "Mask must be of type BOOL8."); + CUDF_EXPECTS(input.size() == boolean_mask.size(), + "Boolean masks column must have same number of rows as input."); + auto const num_rows = input.size(); + + if (num_rows == 0) { return cudf::empty_like(input.parent()); } + + auto constexpr offset_data_type = data_type{type_id::INT32}; + + auto const boolean_mask_sliced_child = boolean_mask.get_sliced_child(stream); + + auto const make_filtered_child = [&] { + auto filtered = + cudf::detail::apply_boolean_mask( + cudf::table_view{{input.get_sliced_child(stream)}}, boolean_mask_sliced_child, stream, mr) + ->release(); + return std::move(filtered.front()); + }; + + auto const make_output_offsets = [&] { + auto boolean_mask_sliced_offsets = + cudf::detail::slice( + boolean_mask.offsets(), {boolean_mask.offset(), boolean_mask.size() + 1}, stream) + .front(); + auto const sizes = cudf::reduction::segmented_sum(boolean_mask_sliced_child, + boolean_mask_sliced_offsets, + offset_data_type, + null_policy::EXCLUDE, + stream); + auto const d_sizes = column_device_view::create(*sizes, stream); + auto const sizes_begin = cudf::detail::make_null_replacement_iterator(*d_sizes, offset_type{0}); + auto const sizes_end = sizes_begin + sizes->size(); + auto output_offsets = cudf::make_numeric_column( + offset_data_type, num_rows + 1, mask_state::UNALLOCATED, stream, mr); + auto output_offsets_view = output_offsets->mutable_view(); + + // Could have attempted an exclusive_scan(), but it would not compute the last entry. + // Instead, inclusive_scan(), followed by writing `0` to the head of the offsets column. + thrust::inclusive_scan(rmm::exec_policy(stream), + sizes_begin, + sizes_end, + output_offsets_view.begin() + 1); + CUDF_CUDA_TRY(cudaMemsetAsync( + output_offsets_view.begin(), 0, sizeof(offset_type), stream.value())); + return output_offsets; + }; + + return cudf::make_lists_column(input.size(), + make_output_offsets(), + make_filtered_child(), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + stream, + mr); +} +} // namespace detail + +std::unique_ptr apply_boolean_mask(lists_column_view const& input, + lists_column_view const& boolean_mask, + rmm::mr::device_memory_resource* mr) +{ + return detail::apply_boolean_mask(input, boolean_mask, rmm::cuda_stream_default, mr); +} + +} // namespace cudf::lists diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e016f47616b..467b78b2028 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -469,6 +469,7 @@ ConfigureTest(AST_TEST ast/transform_tests.cpp) # * lists tests ---------------------------------------------------------------------------------- ConfigureTest( LISTS_TEST + lists/apply_boolean_mask_test.cpp lists/combine/concatenate_list_elements_tests.cpp lists/combine/concatenate_rows_tests.cpp lists/contains_tests.cpp diff --git a/cpp/tests/lists/apply_boolean_mask_test.cpp b/cpp/tests/lists/apply_boolean_mask_test.cpp new file mode 100644 index 00000000000..a5b036210ba --- /dev/null +++ b/cpp/tests/lists/apply_boolean_mask_test.cpp @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cudf::test { + +using namespace iterators; +using cudf::lists_column_view; +using cudf::lists::apply_boolean_mask; + +template +using lists = lists_column_wrapper; +using filter_t = lists_column_wrapper; + +template +using fwcw = fixed_width_column_wrapper; +using offsets = fwcw; +using strings = strings_column_wrapper; + +auto constexpr X = int32_t{0}; // Placeholder for NULL. + +struct ApplyBooleanMaskTest : public BaseFixture { +}; + +template +struct ApplyBooleanMaskTypedTest : ApplyBooleanMaskTest { +}; + +TYPED_TEST_SUITE(ApplyBooleanMaskTypedTest, cudf::test::NumericTypes); + +TYPED_TEST(ApplyBooleanMaskTypedTest, StraightLine) +{ + using T = TypeParam; + auto input = lists{{0, 1, 2, 3}, {4, 5}, {6, 7, 8, 9}, {0, 1}, {2, 3, 4, 5}, {6, 7}}.release(); + auto filter = filter_t{{1, 0, 1, 0}, {1, 0}, {1, 0, 1, 0}, {1, 0}, {1, 0, 1, 0}, {1, 0}}; + + { + // Unsliced. + auto filtered = apply_boolean_mask(lists_column_view{*input}, lists_column_view{filter}); + auto expected = lists{{0, 2}, {4}, {6, 8}, {0}, {2, 4}, {6}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*filtered, expected); + } + { + // Sliced input: Remove the first row. + auto sliced = cudf::slice(*input, {1, input->size()}).front(); + // == lists_t {{4, 5}, {6, 7, 8, 9}, {0, 1}, {2, 3, 4, 5}, {6, 7}}; + auto filter = filter_t{{0, 1}, {0, 1, 0, 1}, {1, 1}, {0, 1, 0, 1}, {0, 0}}; + auto filtered = apply_boolean_mask(lists_column_view{sliced}, lists_column_view{filter}); + auto expected = lists{{5}, {7, 9}, {0, 1}, {3, 5}, {}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*filtered, expected); + } +} + +TYPED_TEST(ApplyBooleanMaskTypedTest, NullElementsInTheListRows) +{ + using T = TypeParam; + auto input = + lists{ + {0, 1, 2, 3}, + lists{{X, 5}, null_at(0)}, + {6, 7, 8, 9}, + {0, 1}, + lists{{X, 3, 4, X}, nulls_at({0, 3})}, + lists{{X, X}, nulls_at({0, 1})}, + } + .release(); + auto filter = filter_t{{1, 0, 1, 0}, {1, 0}, {1, 0, 1, 0}, {1, 0}, {1, 0, 1, 0}, {1, 0}}; + + { + // Unsliced. + auto filtered = apply_boolean_mask(lists_column_view{*input}, lists_column_view{filter}); + auto expected = lists{{0, 2}, + lists{{X}, null_at(0)}, + {6, 8}, + {0}, + lists{{X, 4}, null_at(0)}, + lists{{X}, null_at(0)}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*filtered, expected); + } + { + // Sliced input: Remove the first row. + auto sliced = cudf::slice(*input, {1, input->size()}).front(); + // == lists_t {{X, 5}, {6, 7, 8, 9}, {0, 1}, {X, 3, 4, X}, {X, X}}; + auto filter = filter_t{{0, 1}, {0, 1, 0, 1}, {1, 1}, {0, 1, 0, 1}, {0, 0}}; + auto filtered = apply_boolean_mask(lists_column_view{sliced}, lists_column_view{filter}); + auto expected = lists{{5}, {7, 9}, {0, 1}, lists{{3, X}, null_at(1)}, {}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*filtered, expected); + } +} + +TYPED_TEST(ApplyBooleanMaskTypedTest, NullListRowsInTheInputColumn) +{ + using T = TypeParam; + auto input = + lists{{{0, 1, 2, 3}, {}, {6, 7, 8, 9}, {}, {2, 3, 4, 5}, {6, 7}}, nulls_at({1, 3})} + .release(); + auto filter = filter_t{{1, 0, 1, 0}, {}, {1, 0, 1, 0}, {}, {1, 0, 1, 0}, {1, 0}}; + + { + // Unsliced. + auto filtered = apply_boolean_mask(lists_column_view{*input}, lists_column_view{filter}); + auto expected = lists{{{0, 2}, {}, {6, 8}, {}, {2, 4}, {6}}, nulls_at({1, 3})}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*filtered, expected); + } + { + // Sliced input: Remove the first row. + auto sliced = cudf::slice(*input, {1, input->size()}).front(); + // == lists_t{{{}, {6, 7, 8, 9}, {}, {2, 3, 4, 5}, {6, 7}}, nulls_at({0,2})}; + auto filter = filter_t{{}, {0, 1, 0, 1}, {}, {0, 1, 0, 1}, {0, 0}}; + auto filtered = apply_boolean_mask(lists_column_view{sliced}, lists_column_view{filter}); + auto expected = lists{{{}, {7, 9}, {}, {3, 5}, {}}, nulls_at({0, 2})}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*filtered, expected); + } + { + // Sliced input: Remove the first two rows. + auto sliced = cudf::slice(*input, {2, input->size()}).front(); + // == lists_t{{{6, 7, 8, 9}, {}, {2, 3, 4, 5}, {6, 7}}, null_at(1)}; + auto filter = filter_t{{0, 1, 0, 1}, {}, {0, 1, 0, 1}, {0, 0}}; + auto filtered = apply_boolean_mask(lists_column_view{sliced}, lists_column_view{filter}); + auto expected = lists{{{7, 9}, {}, {3, 5}, {}}, null_at(1)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*filtered, expected); + } +} + +TYPED_TEST(ApplyBooleanMaskTypedTest, StructInput) +{ + using T = TypeParam; + using fwcw = fwcw; + + auto constexpr num_input_rows = 7; + auto const input = [] { + auto child_num = fwcw{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto child_str = strings{"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}; + auto const null_mask_begin = null_at(5); + auto const null_mask_end = null_mask_begin + num_input_rows; + return cudf::make_lists_column(num_input_rows, + offsets{0, 2, 3, 6, 6, 8, 8, 10}.release(), + structs_column_wrapper{{child_num, child_str}}.release(), + 1, + detail::make_null_mask(null_mask_begin, null_mask_end)); + }(); + { + // Unsliced. + // The input should now look as follows: (String child dropped for brevity.) + // Input: {[0, 1], [2], [3, 4, 5], [], [6, 7], [], [8, 9]} + auto const filter = filter_t{{1, 1}, {0}, {0, 1, 0}, {}, {1, 0}, {}, {0, 1}}; + auto const result = apply_boolean_mask(lists_column_view{*input}, lists_column_view{filter}); + auto const expected = [] { + auto child_num = fwcw{0, 1, 4, 6, 9}; + auto child_str = strings{"0", "1", "4", "6", "9"}; + auto const null_mask_begin = null_at(5); + auto const null_mask_end = null_mask_begin + num_input_rows; + return cudf::make_lists_column(num_input_rows, + offsets{0, 2, 2, 3, 3, 4, 4, 5}.release(), + structs_column_wrapper{{child_num, child_str}}.release(), + 1, + detail::make_null_mask(null_mask_begin, null_mask_end)); + }(); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); + } + { + // Sliced. Remove the first row. + auto const sliced_input = cudf::slice(*input, {1, input->size()}).front(); + // The input should now look as follows: (String child dropped for brevity.) + // Input: {[2], [3, 4, 5], [], [6, 7], [], [8, 9]} + auto const filter = filter_t{{0}, {0, 1, 0}, {}, {1, 0}, {}, {0, 1}}; + auto const result = + apply_boolean_mask(lists_column_view{sliced_input}, lists_column_view{filter}); + auto const expected = [] { + auto child_num = fwcw{4, 6, 9}; + auto child_str = strings{"4", "6", "9"}; + auto const null_mask_begin = null_at(4); + auto const null_mask_end = null_mask_begin + num_input_rows; + return cudf::make_lists_column(num_input_rows - 1, + offsets{0, 0, 1, 1, 2, 2, 3}.release(), + structs_column_wrapper{{child_num, child_str}}.release(), + 1, + detail::make_null_mask(null_mask_begin, null_mask_end)); + }(); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); + } +} + +TEST_F(ApplyBooleanMaskTest, Trivial) +{ + auto const input = lists{}; + auto const filter = filter_t{}; + auto const result = apply_boolean_mask(lists_column_view{input}, lists_column_view{filter}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, lists{}); +} + +TEST_F(ApplyBooleanMaskTest, Failure) +{ + { + // Invalid mask type. + auto const input = lists{{1, 2, 3}, {4, 5, 6}}; + auto const filter = lists{{0, 0, 0}}; + CUDF_EXPECT_THROW_MESSAGE( + apply_boolean_mask(lists_column_view{input}, lists_column_view{filter}), + "Mask must be of type BOOL8."); + } + { + // Mismatched number of rows. + auto const input = lists{{1, 2, 3}, {4, 5, 6}}; + auto const filter = filter_t{{0, 0, 0}}; + CUDF_EXPECT_THROW_MESSAGE( + apply_boolean_mask(lists_column_view{input}, lists_column_view{filter}), + "Boolean masks column must have same number of rows as input."); + } +} +} // namespace cudf::test