From e0ec1a579a44a2a19e4e77149b9fcfe24d301c59 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 3 Mar 2021 10:35:22 -0700 Subject: [PATCH 01/25] Add drop_list_duplicates implementation --- cpp/CMakeLists.txt | 1 + .../cudf/lists/drop_list_duplicates.hpp | 64 ++++++++ cpp/include/doxygen_groups.h | 1 + cpp/src/lists/drop_list_duplicates.cu | 144 ++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 + 5 files changed, 211 insertions(+) create mode 100644 cpp/include/cudf/lists/drop_list_duplicates.hpp create mode 100644 cpp/src/lists/drop_list_duplicates.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d9d4c6bfd79..c6162846e94 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -253,6 +253,7 @@ add_library(cudf src/lists/copying/segmented_gather.cu src/lists/count_elements.cu src/lists/extract.cu + src/lists/drop_list_duplicates.cu src/lists/lists_column_factories.cu src/lists/lists_column_view.cu src/lists/segmented_sort.cu diff --git a/cpp/include/cudf/lists/drop_list_duplicates.hpp b/cpp/include/cudf/lists/drop_list_duplicates.hpp new file mode 100644 index 00000000000..69639d351b0 --- /dev/null +++ b/cpp/include/cudf/lists/drop_list_duplicates.hpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +namespace cudf { +namespace lists { +/** + * @addtogroup lists_drop_duplicates + * @{ + * @file + */ + +/** + * @brief Create a new column by removing duplicated elements from each list in the given + * lists_column + * + * Given an `input` list_column_view, the list elements in the column are copied to an output column + * such that their duplicated entries are dropped. Adopted from stream compaction, the definition of + * uniqueness depends on the value of @p keep: + * - KEEP_FIRST: only the first of a sequence of duplicate entries is copied + * - KEEP_LAST: only the last of a sequence of duplicate entries is copied + * - KEEP_NONE: all duplicate entries are dropped out + * + * @param[in] lists_column the input lists_column_view + * @param[in] keep keep first entry, last entry, or no entries if duplicates found + * @param[in] nulls_equal flag to denote nulls are considered equal + * @param[in] mr Device memory resource used to allocate the returned column + * + * * @code{.pseudo} + * input = { {1, 1, 2, 1, 3}, {4}, {5, 6, 6, 6, 5} } + * if keep is KEEP_FIRST, the output will be { {1, 2, 3}, {4}, {5, 6} } + * if keep is KEEP_LAST, the output will be { {2, 1, 3}, {4}, {6, 5} } + * if keep is KEEP_NONE, the output will be { {2, 3}, {4}, {} } + * @endcode + * + * @return A list column with lists having unique entries as specified by the `keep` policy. + */ +std::unique_ptr drop_list_duplicates( + lists_column_view const& lists_column, + duplicate_keep_option keep, + null_equality nulls_equal = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of group +} // namespace lists +} // namespace cudf diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index e732a13e67c..c99b60e38c9 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -146,6 +146,7 @@ * @defgroup lists_contains Searching * @defgroup lists_gather Gathering * @defgroup lists_elements Counting + * @defgroup lists_drop_duplicates Dropping duplicated elements * @} * @defgroup nvtext_apis NVText * @{ diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu new file mode 100644 index 00000000000..90cf4a1af89 --- /dev/null +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include + +#include + +#include + +namespace cudf { +namespace lists { +namespace detail { + +/** + * @copydoc cudf::lists::drop_list_duplicates + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr drop_list_duplicates(lists_column_view const& lists_column, + duplicate_keep_option keep, + null_equality nulls_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (lists_column.is_empty()) return cudf::empty_like(lists_column.parent()); + + /* + * Given input = { {1, 1, 2, 1, 3}, {4}, {5, 6, 6, 6, 5} } + * - if keep is KEEP_FIRST, the output will be { {1, 2, 3}, {4}, {5, 6} } + * - if keep is KEEP_LAST, the output will be { {2, 1, 3}, {4}, {6, 5} } + * - if keep is KEEP_NONE, the output will be { {2, 3}, {4}, {} } + * + * 1. Generate ordered indices for each list element + * ordered_indices = { {0, 1, 2, 3, 4}, {5}, {6, 7, 8, 9, 10} } + * + * 2. Call sort_list on the lists and indices using the list entries as keys + * sorted_lists = { {1, 1, 1, 2, 3}, {4}, {5, 5, 6, 6, 6} }, and + * sorted_indices = { {0, 1, 3, 2, 4}, {5}, {6, 10, 7, 8, 9} } + * + * 3. Remove list indices if the list entries are duplicated + * - with keep is KEEP_FIRST: sorted_unique_indices = { {0, 2, 4}, {5}, {6, 7} } + * - with keep is KEEP_LAST: sorted_unique_indices = { {3, 2, 4}, {5}, {10, 9} } + * - with keep is KEEP_NONE: sorted_unique_indices = { {2, 4}, {5}, {} } + * + * 4. Call sort_lists on the sorted_unique_indices to obtain the final list indices + * - with keep is KEEP_FIRST: sorted_unique_indices = { {0, 2, 4}, {5}, {6, 7} } + * - with keep is KEEP_LAST: sorted_unique_indices = { {2, 3, 4}, {5}, {9, 10} } + * - with keep is KEEP_NONE: sorted_unique_indices = { {2, 4}, {5}, {} } + * + * 5. Gather list entries using the sorted_unique_indices as gather map + * (remember to deal with null elements) + * + * Corner cases: + * - null entries in a list: depending on the nulls_equal policy, if it is set to EQUAL then + * only one null entry is kept. Which null entry to keep---it is specified by the keep policy. + * - null rows: just return null rows, nothing changes. + * - NaN entries in a list: NaNs should be treated as equal, thus only one NaN value is kept. + * Again, which value to keep depends on the keep policy. + * - Nested types are not supported---the function should throw logic_error. + */ + +#if 0 + auto const offsets_column = lists_column.offsets(); + + // create a column_view with attributes of the parent and data from the offsets + column_view annotated_offsets(data_type{type_id::INT32}, + lists_column.size() + 1, + offsets_column.data(), + lists_column.null_mask(), + lists_column.null_count(), + lists_column.offset()); + + // create a gather map for extracting elements from the child column + auto gather_map = make_fixed_width_column( + data_type{type_id::INT32}, annotated_offsets.size() - 1, mask_state::UNALLOCATED, stream); + auto d_gather_map = gather_map->mutable_view().data(); + auto const child_column = lists_column.child(); + + // build the gather map using the offsets and the provided index + auto const d_column = column_device_view::create(annotated_offsets, stream); + if (index < 0) + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(gather_map->size()), + d_gather_map, + map_index_fn{*d_column, index, child_column.size()}); + else + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(gather_map->size()), + d_gather_map, + map_index_fn{*d_column, index, child_column.size()}); + + // Gather only the unique entries + auto result = cudf::detail::gather(table_view({child_column}), + d_gather_map, + d_gather_map + gather_map->size(), + out_of_bounds_policy::NULLIFY, // nullify-out-of-bounds + stream, + mr) + ->release(); + + // Set zero size for the null_mask if there is no null element + if (result.front()->null_count() == 0) + result.front()->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); + + return std::unique_ptr(std::move(result.front())); +#endif + + return empty_like(lists_column.parent()); +} + +} // namespace detail + +/** + * @copydoc cudf::lists::drop_list_duplicates + */ +std::unique_ptr drop_list_duplicates(lists_column_view const& lists_column, + duplicate_keep_option keep, + null_equality nulls_equal, + rmm::mr::device_memory_resource* mr) +{ + return detail::drop_list_duplicates( + lists_column, keep, nulls_equal, rmm::cuda_stream_default, mr); +} + +} // namespace lists +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index ea3dd8560fd..60137900273 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -432,6 +432,7 @@ ConfigureTest(AST_TEST ast/transform_tests.cpp) ConfigureTest(LISTS_TEST lists/contains_tests.cpp lists/count_elements_tests.cpp + lists/drop_list_duplicates_tests.cpp lists/extract_tests.cpp lists/sort_lists_tests.cpp) From cae114bcde5f10f22604c0a58c60ce5499d25804 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 3 Mar 2021 10:35:42 -0700 Subject: [PATCH 02/25] Add tests for drop_list_duplicates --- .../lists/drop_list_duplicates_tests.cpp | 162 ++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 cpp/tests/lists/drop_list_duplicates_tests.cpp diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp new file mode 100644 index 00000000000..c5ec21a98eb --- /dev/null +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//#include +#include + +#include +#include +#include + +//#include +//#include + +struct DropListDuplicatesTest : public cudf::test::BaseFixture { +}; + +TEST_F(DropListDuplicatesTest, NonNullTable) +{ + cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 5}}; + cudf::test::fixed_width_column_wrapper col2{{4, 5, 3, 4, 9, 4}}; + cudf::test::fixed_width_column_wrapper col1_key{{20, 20, 20, 19, 21, 9}}; + cudf::test::fixed_width_column_wrapper col2_key{{19, 19, 20, 20, 9, 21}}; + + cudf::table_view input{{col1, col2, col1_key, col2_key}}; + std::vector keys{2, 3}; + + // Keep first of duplicate + // The expected table would be sorted in ascending order with respect to keys + cudf::test::fixed_width_column_wrapper exp_col1_first{{5, 5, 5, 3, 8}}; + cudf::test::fixed_width_column_wrapper exp_col2_first{{4, 4, 4, 3, 9}}; + cudf::test::fixed_width_column_wrapper exp_col1_key_first{{9, 19, 20, 20, 21}}; + cudf::test::fixed_width_column_wrapper exp_col2_key_first{{21, 20, 19, 20, 9}}; + cudf::table_view expected_first{ + {exp_col1_first, exp_col2_first, exp_col1_key_first, exp_col2_key_first}}; + + auto got_first = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); + + // keep last of duplicate + cudf::test::fixed_width_column_wrapper exp_col1_last{{5, 5, 4, 3, 8}}; + cudf::test::fixed_width_column_wrapper exp_col2_last{{4, 4, 5, 3, 9}}; + cudf::test::fixed_width_column_wrapper exp_col1_key_last{{9, 19, 20, 20, 21}}; + cudf::test::fixed_width_column_wrapper exp_col2_key_last{{21, 20, 19, 20, 9}}; + cudf::table_view expected_last{ + {exp_col1_last, exp_col2_last, exp_col1_key_last, exp_col2_key_last}}; + + auto got_last = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); + + // Keep unique + cudf::test::fixed_width_column_wrapper exp_col1_unique{{5, 5, 3, 8}}; + cudf::test::fixed_width_column_wrapper exp_col2_unique{{4, 4, 3, 9}}; + cudf::test::fixed_width_column_wrapper exp_col1_key_unique{{9, 19, 20, 21}}; + cudf::test::fixed_width_column_wrapper exp_col2_key_unique{{21, 20, 20, 9}}; + cudf::table_view expected_unique{ + {exp_col1_unique, exp_col2_unique, exp_col1_key_unique, exp_col2_key_unique}}; + + auto got_unique = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); +} + +TEST_F(DropListDuplicatesTest, WithNull) +{ + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; + cudf::table_view input{{col, key}}; + std::vector keys{1}; + + // Keep first of duplicate + cudf::test::fixed_width_column_wrapper exp_col_first{{4, 5, 5, 8}, {0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_first{{20, 19, 20, 21}, {0, 1, 1, 1}}; + cudf::table_view expected_first{{exp_col_first, exp_key_col_first}}; + auto got_first = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); + + // Keep last of duplicate + cudf::test::fixed_width_column_wrapper exp_col_last{{3, 1, 5, 8}, {1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_last{{20, 19, 20, 21}, {0, 1, 1, 1}}; + cudf::table_view expected_last{{exp_col_last, exp_key_col_last}}; + auto got_last = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); + + // Keep unique of duplicate + cudf::test::fixed_width_column_wrapper exp_col_unique{{5, 8}, {1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_unique{{20, 21}, {1, 1}}; + cudf::table_view expected_unique{{exp_col_unique, exp_key_col_unique}}; + auto got_unique = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); +} + +TEST_F(DropListDuplicatesTest, StringKeyColumn) +{ + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::strings_column_wrapper key_col{{"all", "new", "all", "new", "the", "strings"}, + {1, 1, 1, 0, 1, 1}}; + cudf::table_view input{{col, key_col}}; + std::vector keys{1}; + cudf::test::fixed_width_column_wrapper exp_col_last{{5, 3, 4, 1, 8}, {1, 1, 0, 1, 1}}; + cudf::test::strings_column_wrapper exp_key_col_last{{"new", "all", "new", "strings", "the"}, + {0, 1, 1, 1, 1}}; + cudf::table_view expected_last{{exp_col_last, exp_key_col_last}}; + + auto got_last = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); +} + +TEST_F(DropListDuplicatesTest, EmptyInputTable) +{ + cudf::test::fixed_width_column_wrapper col(std::initializer_list{}); + cudf::table_view input{{col}}; + std::vector keys{1, 2}; + + auto got = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); +} + +TEST_F(DropListDuplicatesTest, NoColumnInputTable) +{ + cudf::table_view input{std::vector()}; + std::vector keys{1, 2}; + + auto got = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); +} + +TEST_F(DropListDuplicatesTest, EmptyKeys) +{ + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper empty_col{}; + cudf::table_view input{{col}}; + std::vector keys{}; + + auto got = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); +} From d07265f350233b1d781cc34905e3cdc335cd2923 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 3 Mar 2021 13:47:25 -0700 Subject: [PATCH 03/25] Draft implementation for drop_list_duplicates --- .../cudf/lists/drop_list_duplicates.hpp | 28 +- cpp/src/lists/drop_list_duplicates.cu | 522 +++++++++++++++--- 2 files changed, 448 insertions(+), 102 deletions(-) diff --git a/cpp/include/cudf/lists/drop_list_duplicates.hpp b/cpp/include/cudf/lists/drop_list_duplicates.hpp index 69639d351b0..1da69b09d81 100644 --- a/cpp/include/cudf/lists/drop_list_duplicates.hpp +++ b/cpp/include/cudf/lists/drop_list_duplicates.hpp @@ -16,7 +16,6 @@ #pragma once #include -#include #include #include @@ -29,33 +28,30 @@ namespace lists { */ /** - * @brief Create a new column by removing duplicated elements from each list in the given + * @brief Create a new column by removing duplicated entries from each list in the given * lists_column * - * Given an `input` list_column_view, the list elements in the column are copied to an output column - * such that their duplicated entries are dropped. Adopted from stream compaction, the definition of - * uniqueness depends on the value of @p keep: - * - KEEP_FIRST: only the first of a sequence of duplicate entries is copied - * - KEEP_LAST: only the last of a sequence of duplicate entries is copied - * - KEEP_NONE: all duplicate entries are dropped out + * Given an `input` list_column_view, the list rows in the column are copied to an output column + * such that their duplicated entries are dropped out to keep only unique entries. The order of + * those entries are not guaranteed to be preserved as in the input column. + * + * If any row in the input column contains nested types, cudf::logic_error will be thrown. * * @param[in] lists_column the input lists_column_view - * @param[in] keep keep first entry, last entry, or no entries if duplicates found * @param[in] nulls_equal flag to denote nulls are considered equal * @param[in] mr Device memory resource used to allocate the returned column * - * * @code{.pseudo} - * input = { {1, 1, 2, 1, 3}, {4}, {5, 6, 6, 6, 5} } - * if keep is KEEP_FIRST, the output will be { {1, 2, 3}, {4}, {5, 6} } - * if keep is KEEP_LAST, the output will be { {2, 1, 3}, {4}, {6, 5} } - * if keep is KEEP_NONE, the output will be { {2, 3}, {4}, {} } + * @code{.pseudo} + * If the input is { {1, 1, 2, 1, 3}, {4}, {5, 6, 6, 6, 5} } + * Then a valid output can be { {1, 2, 3}, {4}, {5, 6} } + * Note that permuting the entries of each sublist in this output also produces another valid + * output. * @endcode * - * @return A list column with lists having unique entries as specified by the `keep` policy. + * @return A list column with list elements having unique entries. */ std::unique_ptr drop_list_duplicates( lists_column_view const& lists_column, - duplicate_keep_option keep, null_equality nulls_equal = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 90cf4a1af89..107ebc5f51c 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -16,16 +16,305 @@ #include #include #include +#include #include +#include +#include +#include +#include #include +#include +#include #include +#include +#include +#include +#include #include namespace cudf { namespace lists { namespace detail { +struct SortPairs { + template + void SortPairsAscending(KeyT const* keys_in, + KeyT* keys_out, + ValueT const* values_in, + ValueT* values_out, + int num_items, + int num_segments, + OffsetIteratorT begin_offsets, + OffsetIteratorT end_offsets, + rmm::cuda_stream_view stream) + { + rmm::device_buffer d_temp_storage; + size_t temp_storage_bytes = 0; + cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage.data(), + temp_storage_bytes, + keys_in, + keys_out, + values_in, + values_out, + num_items, + num_segments, + begin_offsets, + end_offsets, + 0, + sizeof(KeyT) * 8, + stream.value()); + d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream}; + + cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage.data(), + temp_storage_bytes, + keys_in, + keys_out, + values_in, + values_out, + num_items, + num_segments, + begin_offsets, + end_offsets, + 0, + sizeof(KeyT) * 8, + stream.value()); + } + + template + std::enable_if_t(), std::unique_ptr> operator()( + column_view const& child, + column_view const& segment_offsets, + null_order null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto child_table = segmented_sort_by_key(table_view{{child}}, + table_view{{child}}, + segment_offsets, + {order::ASCENDING}, + {null_precedence}, + stream, + mr); + return std::move(child_table->release().front()); + } + + template + std::enable_if_t(), std::unique_ptr> operator()( + column_view const& child, + column_view const& offsets, + null_order null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto output = + cudf::detail::allocate_like(child, child.size(), mask_allocation_policy::NEVER, stream, mr); + mutable_column_view mutable_output_view = output->mutable_view(); + + auto keys = [&]() { + if (child.nullable()) { + rmm::device_uvector keys(child.size(), stream); + auto const null_replace_T = null_precedence == null_order::AFTER + ? std::numeric_limits::max() + : std::numeric_limits::min(); + auto device_child = column_device_view::create(child, stream); + auto keys_in = + cudf::detail::make_null_replacement_iterator(*device_child, null_replace_T); + thrust::copy_n(rmm::exec_policy(stream), keys_in, child.size(), keys.begin()); + return keys; + } + return rmm::device_uvector{0, stream}; + }(); + + std::unique_ptr sorted_indices = cudf::make_numeric_column( + data_type(type_to_id()), child.size(), mask_state::UNALLOCATED, stream, mr); + mutable_column_view mutable_indices_view = sorted_indices->mutable_view(); + thrust::sequence(rmm::exec_policy(stream), + mutable_indices_view.begin(), + mutable_indices_view.end(), + 0); + + SortPairsAscending(child.nullable() ? keys.data() : child.begin(), + mutable_output_view.begin(), + mutable_indices_view.begin(), + mutable_indices_view.begin(), + child.size(), + offsets.size() - 1, + offsets.begin(), + offsets.begin() + 1, + stream); + + std::vector> output_cols; + output_cols.push_back(std::move(output)); + // rearrange the null_mask. + cudf::detail::gather_bitmask(cudf::table_view{{child}}, + mutable_indices_view.begin(), + output_cols, + cudf::detail::gather_bitmask_op::DONT_CHECK, + stream, + mr); + return std::move(output_cols.front()); + } +}; +#if 1 +namespace { +template +struct unique_copy_fn { + /** + * @brief Functor for unique_copy() + * + * The logic here is equivalent to: + * @code + * ((keep == duplicate_keep_option::KEEP_LAST) || + * (i == 0 || !comp(iter[i], iter[i - 1]))) && + * ((keep == duplicate_keep_option::KEEP_FIRST) || + * (i == last_index || !comp(iter[i], iter[i + 1]))) + * @endcode + * + * It is written this way so that the `comp` comparator + * function appears only once minimizing the inlining + * required and reducing the compile time. + */ + __device__ bool operator()(size_type i) + { + size_type boundary = 0; + size_type offset = 1; + auto keep_option = duplicate_keep_option::KEEP_LAST; + do { + if ((keep_option != duplicate_keep_option::KEEP_FIRST) && (i != boundary) && + comp(iter[i], iter[i - offset])) { + return false; + } + keep_option = duplicate_keep_option::KEEP_FIRST; + boundary = last_index; + offset = -offset; + } while (offset < 0); + return true; + } + + InputIterator iter; + BinaryPredicate comp; + size_type const last_index; +}; +} // anonymous namespace + +/** + * @brief Copies unique elements from the range [first, last) to output iterator `output`. + * + * In a consecutive group of duplicate elements, depending on parameter `keep`, + * only the first element is copied, or the last element is copied or neither is copied. + * + * @return End of the range to which the elements are copied. + */ +template +OutputIterator unique_copy(InputIterator first, + InputIterator last, + OutputIterator output, + BinaryPredicate comp, + rmm::cuda_stream_view stream) +{ + size_type const last_index = thrust::distance(first, last) - 1; + return thrust::copy_if(rmm::exec_policy(stream), + first, + last, + thrust::counting_iterator(0), + output, + unique_copy_fn{first, comp, last_index}); +} + +/** + * @brief Create a column_view of index values which represents the row values of a given column + * without duplicated elements + * + * Given an `input` column_view, the output column `unique_indices` is generated such that it + * copies row indices of the input column at rows without duplicate elements + * + * @param[in] input The input column_view + * @param[out] unique_indices Column to store the indices of rows with unique elements + * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, + * nulls are not equal if null_equality::UNEQUAL + * @param[in] stream CUDA stream used for device memory operations and kernel launches + * + * @return column_view of unique row indices + */ +std::unique_ptr get_unique_entries(column_view const& input, + column_view const& entry_offsets, + + null_equality nulls_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // sort only indices + + // extract unique indices + auto device_input_table = cudf::table_device_view::create(table_view{{input}}, stream); + + auto unique_indices = cudf::make_numeric_column( + data_type{type_id::INT32}, input.size(), mask_state::UNALLOCATED, stream); + auto mutable_unique_indices_view = unique_indices->mutable_view(); + + auto comp = row_equality_comparator( + *device_input_table, *device_input_table, nulls_equal == null_equality::EQUAL); + + auto begin = thrust::make_counting_iterator(0); + auto end = begin + input.size(); + auto result_end = + unique_copy(begin, end, mutable_unique_indices_view.begin(), comp, stream); + + auto indices = cudf::detail::slice( + unique_indices->view(), + 0, + thrust::distance(mutable_unique_indices_view.begin(), result_end)); + + return cudf::detail::gather(table_view{{input, entry_offsets}}, + indices, + cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + // return std::move(tbl->release().front()); +} + +#endif + +/** + * @brief Populate offsets for all corresponding indices of the entries + * + * Given an `offsets` column_view and a number of entries, generate an array of corresponding + * offsets for all the entry indices + * + * @code{.pseudo} + * size = 9, offsets = { 0, 4, 6, 10 } + * output = { 0, 0, 0, 0, 1, 1, 2, 2, 2, 2 } + * @endcode + * + * @param[in] size The number of entries to populate offsets + * @param[out] offsets Column view to the offsets + * @param[in] stream CUDA stream used for device memory operations and kernel launches + * + * @return a device vector of entry offsets + */ +std::unique_ptr get_entry_offsets(size_type size, + column_view const& offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto entry_offsets = make_numeric_column( + data_type(type_to_id()), size, mask_state::UNALLOCATED, stream, mr); + auto entry_offsets_view = entry_offsets->mutable_view(); + + auto offset_begin = offsets.begin(); + auto offsets_minus_one = + thrust::make_transform_iterator(offset_begin, [] __device__(auto i) { return i - 1; }); + auto counting_iter = thrust::make_counting_iterator(0); + thrust::lower_bound(rmm::exec_policy(stream), + offsets_minus_one, + offsets_minus_one + offsets.size(), + counting_iter, + counting_iter + size, + entry_offsets_view.begin()); + return entry_offsets; +} /** * @copydoc cudf::lists::drop_list_duplicates @@ -33,95 +322,158 @@ namespace detail { * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_column, - duplicate_keep_option keep, null_equality nulls_equal, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + /* + * Given input = { {1, 1, 2, 1, 3}, {4}, {5, 6, 6, 6, 5} } + * The output can be { {1, 2, 3}, {4}, {5, 6} } + * + * 1. Call segmented sort on the lists, obtaining + * sorted_lists = { {1, 1, 1, 2, 3}, {4}, {5, 5, 6, 6, 6} }, and + * + * 2. Generate ordered indices for the list entries + * indices = { {0, 1, 2, 3, 4}, {5}, {6, 7, 8, 9, 10} } + * + * 3. Remove list indices if the list entries are duplicated, obtaining + * unique_indices = { {0, 3, 4}, {5}, {6, 8} } + * + * 4. Gather list entries using the unique_indices as gather map + * (remember to deal with null elements) + * + * 5. Regenerate list offsets and null mask + * + * Corner cases: + * - null entries in a list: depending on the nulls_equal policy, if it is set to EQUAL then + * only one null entry is kept. + * - null rows: just return null rows, nothing changes. + * - NaN entries in a list: NaNs should be treated as equal, thus only one NaN value is kept. + * - Nested types are not supported---the function should throw logic_error. + */ + if (lists_column.is_empty()) return cudf::empty_like(lists_column.parent()); + if (cudf::is_nested(lists_column.child().type())) + CUDF_FAIL("Nested types are not supported in drop_list_duplicates."); - /* - * Given input = { {1, 1, 2, 1, 3}, {4}, {5, 6, 6, 6, 5} } - * - if keep is KEEP_FIRST, the output will be { {1, 2, 3}, {4}, {5, 6} } - * - if keep is KEEP_LAST, the output will be { {2, 1, 3}, {4}, {6, 5} } - * - if keep is KEEP_NONE, the output will be { {2, 3}, {4}, {} } - * - * 1. Generate ordered indices for each list element - * ordered_indices = { {0, 1, 2, 3, 4}, {5}, {6, 7, 8, 9, 10} } - * - * 2. Call sort_list on the lists and indices using the list entries as keys - * sorted_lists = { {1, 1, 1, 2, 3}, {4}, {5, 5, 6, 6, 6} }, and - * sorted_indices = { {0, 1, 3, 2, 4}, {5}, {6, 10, 7, 8, 9} } - * - * 3. Remove list indices if the list entries are duplicated - * - with keep is KEEP_FIRST: sorted_unique_indices = { {0, 2, 4}, {5}, {6, 7} } - * - with keep is KEEP_LAST: sorted_unique_indices = { {3, 2, 4}, {5}, {10, 9} } - * - with keep is KEEP_NONE: sorted_unique_indices = { {2, 4}, {5}, {} } - * - * 4. Call sort_lists on the sorted_unique_indices to obtain the final list indices - * - with keep is KEEP_FIRST: sorted_unique_indices = { {0, 2, 4}, {5}, {6, 7} } - * - with keep is KEEP_LAST: sorted_unique_indices = { {2, 3, 4}, {5}, {9, 10} } - * - with keep is KEEP_NONE: sorted_unique_indices = { {2, 4}, {5}, {} } - * - * 5. Gather list entries using the sorted_unique_indices as gather map - * (remember to deal with null elements) - * - * Corner cases: - * - null entries in a list: depending on the nulls_equal policy, if it is set to EQUAL then - * only one null entry is kept. Which null entry to keep---it is specified by the keep policy. - * - null rows: just return null rows, nothing changes. - * - NaN entries in a list: NaNs should be treated as equal, thus only one NaN value is kept. - * Again, which value to keep depends on the keep policy. - * - Nested types are not supported---the function should throw logic_error. - */ - -#if 0 - auto const offsets_column = lists_column.offsets(); - - // create a column_view with attributes of the parent and data from the offsets - column_view annotated_offsets(data_type{type_id::INT32}, - lists_column.size() + 1, - offsets_column.data(), - lists_column.null_mask(), - lists_column.null_count(), - lists_column.offset()); - - // create a gather map for extracting elements from the child column - auto gather_map = make_fixed_width_column( - data_type{type_id::INT32}, annotated_offsets.size() - 1, mask_state::UNALLOCATED, stream); - auto d_gather_map = gather_map->mutable_view().data(); - auto const child_column = lists_column.child(); - - // build the gather map using the offsets and the provided index - auto const d_column = column_device_view::create(annotated_offsets, stream); - if (index < 0) - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(gather_map->size()), - d_gather_map, - map_index_fn{*d_column, index, child_column.size()}); - else - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(gather_map->size()), - d_gather_map, - map_index_fn{*d_column, index, child_column.size()}); - - // Gather only the unique entries - auto result = cudf::detail::gather(table_view({child_column}), - d_gather_map, - d_gather_map + gather_map->size(), - out_of_bounds_policy::NULLIFY, // nullify-out-of-bounds - stream, - mr) - ->release(); - - // Set zero size for the null_mask if there is no null element - if (result.front()->null_count() == 0) - result.front()->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); - - return std::unique_ptr(std::move(result.front())); -#endif + /* + * 1. Call segmented sort on the lists + */ + // std::unique_ptr sorted_lists; + // if (nulls_equal == null_equality::EQUAL) + // sorted_lists = cudf::lists::sort_lists(lists_column, order::ASCENDING, null_order::AFTER); + // else + // sorted_lists = cudf::lists::sort_lists(lists_column, order::ASCENDING, null_order::BEFORE); + + auto print = [](const cudf::column_view& arr) { + thrust::host_vector host_data(arr.size()); + CUDA_TRY(cudaMemcpy( + host_data.data(), arr.data(), arr.size() * sizeof(int), cudaMemcpyDeviceToHost)); + for (int i = 0; i < arr.size(); ++i) { printf("%d\n", host_data[i]); } + printf("\n\n\n"); + }; + + // Get the original offsets, which may not start from 0 + auto segment_offsets = cudf::detail::slice( + lists_column.offsets(), {lists_column.offset(), lists_column.offsets().size()}, stream)[0]; + // Make 0-based offsets so we can create a new column using those offsets + auto output_offset = allocate_like(segment_offsets, mask_allocation_policy::RETAIN, mr); + thrust::transform(rmm::exec_policy(stream), + segment_offsets.begin(), + segment_offsets.end(), + output_offset->mutable_view().begin(), + [first = segment_offsets.begin()] __device__(auto offset_index) { + return offset_index - *first; + }); + + // return sorted_lists; + + /* + * 2,3. Generate ordered indices for the list entries and remove list indices if the list entries + * are duplicated + */ + auto const child = lists_column.get_sliced_child(stream); + auto const entry_offsets = get_entry_offsets(child.size(), output_offset->view(), stream, mr); + + // if (nulls_equal == null_equality::EQUAL) + // sorted_lists = cudf::lists::sort_lists(lists_column, order::ASCENDING, null_order::AFTER); + // else + // sorted_lists = cudf::lists::sort_lists(lists_column, order::ASCENDING, null_order::BEFORE); + + auto output_child = type_dispatcher( + child.type(), SortPairs{}, child, output_offset->view(), null_order::AFTER, stream, mr); + // output_child = type_dispatcher( + // child.type(), MakeUniquePairs{}, child, entry_offsets, null_precedence, stream, mr); + + printf("line %d\n", __LINE__); + + printf("line %d\n", __LINE__); + + auto output_child_and_entry_offsets = get_unique_entries(output_child->view(), + entry_offsets->view(), + + nulls_equal, + stream, + mr) + ->release(); + + printf("line %d\n", __LINE__); + + auto null_mask = cudf::detail::copy_bitmask(lists_column.parent(), stream, mr); + + printf("line %d\n", __LINE__); + + print(output_child->view()); + + print(output_offset->view()); + + print(output_child_and_entry_offsets.back()->view()); + /* + * 0 4 5 8 10 + */ + + auto unique_entry_offsets = output_child_and_entry_offsets.back()->view().data(); + + auto counting_iter = thrust::make_counting_iterator(0); + // auto result_end = + thrust::copy_if(rmm::exec_policy(stream), + counting_iter, + counting_iter + child.size(), + output_offset->mutable_view().begin(), + [size = child.size(), unique_entry_offsets] __device__(size_type i) -> bool { + return i == 0 || i == size || + unique_entry_offsets[i] != unique_entry_offsets[i - 1]; + }); + + print(output_offset->view()); + + // Assemble list column & return + return make_lists_column(lists_column.size(), + std::move(output_offset), + std::move(output_child_and_entry_offsets.front()), + lists_column.null_count(), + std::move(null_mask)); + + /* + * 4. Gather list entries using the unique_indices as gather map + */ + // return cudf::detail::gather(lists_data, + // unique_indices, + // cudf::out_of_bounds_policy::DONT_CHECK, + // cudf::detail::negative_index_policy::NOT_ALLOWED, + // stream, + // mr); + + /* + * 5. Regenerate list offsets and null mask + */ + + // cudf::detail::gather_bitmask(cudf::table_view{{child}}, + // mutable_indices_view.begin(), + // output_cols, + // cudf::detail::gather_bitmask_op::DONT_CHECK, + // stream, + // mr); return empty_like(lists_column.parent()); } @@ -132,12 +484,10 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu * @copydoc cudf::lists::drop_list_duplicates */ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_column, - duplicate_keep_option keep, null_equality nulls_equal, rmm::mr::device_memory_resource* mr) { - return detail::drop_list_duplicates( - lists_column, keep, nulls_equal, rmm::cuda_stream_default, mr); + return detail::drop_list_duplicates(lists_column, nulls_equal, rmm::cuda_stream_default, mr); } } // namespace lists From 1fdb2a43c6d820e7546fbfa2bdf214332ba59d10 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 3 Mar 2021 13:49:06 -0700 Subject: [PATCH 04/25] Draft test for drop_list_duplicates --- .../lists/drop_list_duplicates_tests.cpp | 57 ++++++++++++++++--- 1 file changed, 49 insertions(+), 8 deletions(-) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index c5ec21a98eb..aa25800028e 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -20,6 +20,7 @@ #include #include #include +#include //#include //#include @@ -27,6 +28,44 @@ struct DropListDuplicatesTest : public cudf::test::BaseFixture { }; +TEST_F(DropListDuplicatesTest, NonNullTable) +{ + using LCW = cudf::test::lists_column_wrapper; + + LCW l1{{1, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; + + auto sliced_list = cudf::slice(l1, {1, 5})[0]; + + // Ascending + + LCW expected{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + auto results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{sliced_list}, + cudf::null_equality::EQUAL); + printf("line %d\n", __LINE__); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, true); +} + +TEST_F(DropListDuplicatesTest, NonNullStringTable) +{ + // using LCW = cudf::test::lists_column_wrapper; + + using LCW = cudf::test::lists_column_wrapper; + // LCW l1{{"hello", "apple"}, "world", "world", "my"}; + + LCW l1{{1, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; + + auto sliced_list = cudf::slice(l1, {1, 5})[0]; + + // Ascending + + LCW expected{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + auto results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{sliced_list}, + cudf::null_equality::EQUAL); + printf("line %d\n", __LINE__); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, true); +} + +#if 0 TEST_F(DropListDuplicatesTest, NonNullTable) { cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 5}}; @@ -86,8 +125,8 @@ TEST_F(DropListDuplicatesTest, WithNull) cudf::test::fixed_width_column_wrapper exp_col_first{{4, 5, 5, 8}, {0, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper exp_key_col_first{{20, 19, 20, 21}, {0, 1, 1, 1}}; cudf::table_view expected_first{{exp_col_first, exp_key_col_first}}; - auto got_first = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + auto got_first = drop_duplicates( + input, keys, cudf::duplicate_keep_option::KEEP_FIRST, cudf::null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); @@ -131,8 +170,8 @@ TEST_F(DropListDuplicatesTest, EmptyInputTable) cudf::table_view input{{col}}; std::vector keys{1, 2}; - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + auto got = drop_duplicates( + input, keys, cudf::duplicate_keep_option::KEEP_FIRST, cudf::null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); } @@ -142,8 +181,8 @@ TEST_F(DropListDuplicatesTest, NoColumnInputTable) cudf::table_view input{std::vector()}; std::vector keys{1, 2}; - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + auto got = drop_duplicates( + input, keys, cudf::duplicate_keep_option::KEEP_FIRST, cudf::null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); } @@ -155,8 +194,10 @@ TEST_F(DropListDuplicatesTest, EmptyKeys) cudf::table_view input{{col}}; std::vector keys{}; - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + auto got = drop_duplicates( + input, keys, cudf::duplicate_keep_option::KEEP_FIRST, cudf::null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); } + +#endif From 4e9231e28651750cf263ddd47a65c62a5641da3a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 5 Mar 2021 10:16:29 -0700 Subject: [PATCH 05/25] Complete implementation - fix bugs and handle empty lists --- cpp/src/lists/drop_list_duplicates.cu | 595 +++++++++----------------- 1 file changed, 197 insertions(+), 398 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 107ebc5f51c..b12c957ce17 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -13,469 +13,268 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include + #include -#include #include -#include #include #include #include -#include #include +#include #include -#include #include -#include #include #include -#include #include namespace cudf { namespace lists { namespace detail { -struct SortPairs { - template - void SortPairsAscending(KeyT const* keys_in, - KeyT* keys_out, - ValueT const* values_in, - ValueT* values_out, - int num_items, - int num_segments, - OffsetIteratorT begin_offsets, - OffsetIteratorT end_offsets, - rmm::cuda_stream_view stream) - { - rmm::device_buffer d_temp_storage; - size_t temp_storage_bytes = 0; - cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage.data(), - temp_storage_bytes, - keys_in, - keys_out, - values_in, - values_out, - num_items, - num_segments, - begin_offsets, - end_offsets, - 0, - sizeof(KeyT) * 8, - stream.value()); - d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream}; - - cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage.data(), - temp_storage_bytes, - keys_in, - keys_out, - values_in, - values_out, - num_items, - num_segments, - begin_offsets, - end_offsets, - 0, - sizeof(KeyT) * 8, - stream.value()); - } - - template - std::enable_if_t(), std::unique_ptr> operator()( - column_view const& child, - column_view const& segment_offsets, - null_order null_precedence, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - auto child_table = segmented_sort_by_key(table_view{{child}}, - table_view{{child}}, - segment_offsets, - {order::ASCENDING}, - {null_precedence}, - stream, - mr); - return std::move(child_table->release().front()); - } - - template - std::enable_if_t(), std::unique_ptr> operator()( - column_view const& child, - column_view const& offsets, - null_order null_precedence, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - auto output = - cudf::detail::allocate_like(child, child.size(), mask_allocation_policy::NEVER, stream, mr); - mutable_column_view mutable_output_view = output->mutable_view(); - - auto keys = [&]() { - if (child.nullable()) { - rmm::device_uvector keys(child.size(), stream); - auto const null_replace_T = null_precedence == null_order::AFTER - ? std::numeric_limits::max() - : std::numeric_limits::min(); - auto device_child = column_device_view::create(child, stream); - auto keys_in = - cudf::detail::make_null_replacement_iterator(*device_child, null_replace_T); - thrust::copy_n(rmm::exec_policy(stream), keys_in, child.size(), keys.begin()); - return keys; - } - return rmm::device_uvector{0, stream}; - }(); - - std::unique_ptr sorted_indices = cudf::make_numeric_column( - data_type(type_to_id()), child.size(), mask_state::UNALLOCATED, stream, mr); - mutable_column_view mutable_indices_view = sorted_indices->mutable_view(); - thrust::sequence(rmm::exec_policy(stream), - mutable_indices_view.begin(), - mutable_indices_view.end(), - 0); - - SortPairsAscending(child.nullable() ? keys.data() : child.begin(), - mutable_output_view.begin(), - mutable_indices_view.begin(), - mutable_indices_view.begin(), - child.size(), - offsets.size() - 1, - offsets.begin(), - offsets.begin() + 1, - stream); - - std::vector> output_cols; - output_cols.push_back(std::move(output)); - // rearrange the null_mask. - cudf::detail::gather_bitmask(cudf::table_view{{child}}, - mutable_indices_view.begin(), - output_cols, - cudf::detail::gather_bitmask_op::DONT_CHECK, - stream, - mr); - return std::move(output_cols.front()); - } -}; -#if 1 -namespace { -template -struct unique_copy_fn { - /** - * @brief Functor for unique_copy() - * - * The logic here is equivalent to: - * @code - * ((keep == duplicate_keep_option::KEEP_LAST) || - * (i == 0 || !comp(iter[i], iter[i - 1]))) && - * ((keep == duplicate_keep_option::KEEP_FIRST) || - * (i == last_index || !comp(iter[i], iter[i + 1]))) - * @endcode - * - * It is written this way so that the `comp` comparator - * function appears only once minimizing the inlining - * required and reducing the compile time. - */ - __device__ bool operator()(size_type i) - { - size_type boundary = 0; - size_type offset = 1; - auto keep_option = duplicate_keep_option::KEEP_LAST; - do { - if ((keep_option != duplicate_keep_option::KEEP_FIRST) && (i != boundary) && - comp(iter[i], iter[i - offset])) { - return false; - } - keep_option = duplicate_keep_option::KEEP_FIRST; - boundary = last_index; - offset = -offset; - } while (offset < 0); - return true; - } - - InputIterator iter; - BinaryPredicate comp; - size_type const last_index; -}; -} // anonymous namespace - /** - * @brief Copies unique elements from the range [first, last) to output iterator `output`. + * @brief Copy list entries and entry list offsets ignoring duplicates * - * In a consecutive group of duplicate elements, depending on parameter `keep`, - * only the first element is copied, or the last element is copied or neither is copied. + * Given an array of all entries flattened from a list column and an array that maps each entry to + * the offset of the list containing that entry, those entries and list offsets are copied into new + * arrays such that the duplicated entries within each list will be ignored. * - * @return End of the range to which the elements are copied. - */ -template -OutputIterator unique_copy(InputIterator first, - InputIterator last, - OutputIterator output, - BinaryPredicate comp, - rmm::cuda_stream_view stream) -{ - size_type const last_index = thrust::distance(first, last) - 1; - return thrust::copy_if(rmm::exec_policy(stream), - first, - last, - thrust::counting_iterator(0), - output, - unique_copy_fn{first, comp, last_index}); -} - -/** - * @brief Create a column_view of index values which represents the row values of a given column - * without duplicated elements + * @param all_lists_entries The input array containing all list entries + * @param entries_list_offsets A map from list entries to their corresponding list offsets + * @param nulls_equal Flag to specify whether null entries should be considered equal + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device resource used to allocate memory * - * Given an `input` column_view, the output column `unique_indices` is generated such that it - * copies row indices of the input column at rows without duplicate elements - * - * @param[in] input The input column_view - * @param[out] unique_indices Column to store the indices of rows with unique elements - * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, - * nulls are not equal if null_equality::UNEQUAL - * @param[in] stream CUDA stream used for device memory operations and kernel launches - * - * @return column_view of unique row indices + * @return A pair of columns, the first one contains unique list entries and the second one contains + * their corresponding list offsets */ -std::unique_ptr
get_unique_entries(column_view const& input, - column_view const& entry_offsets, - - null_equality nulls_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +template +std::vector> get_unique_entries_and_list_offsets( + column_view const& all_lists_entries, + column_view const& entries_list_offsets, + null_equality nulls_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - // sort only indices - - // extract unique indices - auto device_input_table = cudf::table_device_view::create(table_view{{input}}, stream); - - auto unique_indices = cudf::make_numeric_column( - data_type{type_id::INT32}, input.size(), mask_state::UNALLOCATED, stream); - auto mutable_unique_indices_view = unique_indices->mutable_view(); - - auto comp = row_equality_comparator( + /* Create an intermediate table, since the comparator only work on tables */ + auto const device_input_table = + cudf::table_device_view::create(table_view{{all_lists_entries}}, stream); + auto const comp = row_equality_comparator( *device_input_table, *device_input_table, nulls_equal == null_equality::EQUAL); - auto begin = thrust::make_counting_iterator(0); - auto end = begin + input.size(); - auto result_end = - unique_copy(begin, end, mutable_unique_indices_view.begin(), comp, stream); - - auto indices = cudf::detail::slice( - unique_indices->view(), - 0, - thrust::distance(mutable_unique_indices_view.begin(), result_end)); + auto const num_entries = all_lists_entries.size(); + /* Allocate memory to store the indices of the unique entries */ + auto const unique_indices = cudf::make_numeric_column( + entries_list_offsets.type(), num_entries, mask_state::UNALLOCATED, stream); + auto const unique_indices_begin = unique_indices->mutable_view().begin(); + + auto const copy_end = + thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + thrust::counting_iterator(0), + unique_indices_begin, + [last_index = num_entries - 1, comp] __device__(size_type i) { + return i == last_index || !comp(i, i + 1); + }); + auto const indices = cudf::detail::slice( + unique_indices->view(), 0, thrust::distance(unique_indices_begin, copy_end)); - return cudf::detail::gather(table_view{{input, entry_offsets}}, + /* Collect unique entries and entry list offsets */ + return cudf::detail::gather(table_view{{all_lists_entries, entries_list_offsets}}, indices, cudf::out_of_bounds_policy::DONT_CHECK, cudf::detail::negative_index_policy::NOT_ALLOWED, stream, - mr); - // return std::move(tbl->release().front()); + mr) + ->release(); } -#endif - /** - * @brief Populate offsets for all corresponding indices of the entries + * @brief Generate a 0-based offset column for a lists column * - * Given an `offsets` column_view and a number of entries, generate an array of corresponding - * offsets for all the entry indices + * Given a lists_column_view, which may have a non-zero offset, generate a new column containing + * 0-based list offsets. This is done by subtracting each of the input list offset by the first + * offset. * * @code{.pseudo} - * size = 9, offsets = { 0, 4, 6, 10 } - * output = { 0, 0, 0, 0, 1, 1, 2, 2, 2, 2 } + * Given a list column having offsets = { 3, 7, 9, 13 }, + * then output_offsets = { 0, 4, 6, 10 } * @endcode * - * @param[in] size The number of entries to populate offsets - * @param[out] offsets Column view to the offsets - * @param[in] stream CUDA stream used for device memory operations and kernel launches + * @param lists_column The input lists column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device resource used to allocate memory * - * @return a device vector of entry offsets + * @return A column containing 0-based list offsets */ -std::unique_ptr get_entry_offsets(size_type size, - column_view const& offsets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr generate_clean_offsets(lists_column_view const& lists_column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - auto entry_offsets = make_numeric_column( - data_type(type_to_id()), size, mask_state::UNALLOCATED, stream, mr); - auto entry_offsets_view = entry_offsets->mutable_view(); + auto const list_offsets = + cudf::detail::slice( + lists_column.offsets(), {lists_column.offset(), lists_column.offsets().size()}, stream) + .front(); + auto output_offsets = allocate_like(list_offsets, mask_allocation_policy::NEVER, mr); + thrust::transform( + rmm::exec_policy(stream), + list_offsets.begin(), + list_offsets.end(), + output_offsets->mutable_view().begin(), + [first = list_offsets.begin()] __device__(auto offset) { return offset - *first; }); + return output_offsets; +} - auto offset_begin = offsets.begin(); - auto offsets_minus_one = - thrust::make_transform_iterator(offset_begin, [] __device__(auto i) { return i - 1; }); - auto counting_iter = thrust::make_counting_iterator(0); - thrust::lower_bound(rmm::exec_policy(stream), - offsets_minus_one, - offsets_minus_one + offsets.size(), - counting_iter, - counting_iter + size, - entry_offsets_view.begin()); - return entry_offsets; +/** + * @brief Populate list offsets for all list entries + * + * Given an `offsets` column_view containing offsets of a lists column and a number of all list + * entries in the column, generate an array that maps from each list entry to the offset of the list + * containing that entry. + * + * @code{.pseudo} + * num_entries = 10, offsets = { 0, 4, 6, 10 } + * output = { 1, 1, 1, 1, 2, 2, 3, 3, 3, 3 } + * @endcode + * + * @param num_entries The number of list entries + * @param offsets Column view to the list offsets + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device resource used to allocate memory + * + * @return A column containing entry list offsets + */ +std::unique_ptr generate_entry_list_offsets(size_type num_entries, + column_view const& offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto entry_list_offsets = + make_numeric_column(offsets.type(), num_entries, mask_state::UNALLOCATED, stream, mr); + thrust::upper_bound(rmm::exec_policy(stream), + offsets.begin(), + offsets.begin() + offsets.size(), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + entry_list_offsets->mutable_view().begin()); + return entry_list_offsets; } /** - * @copydoc cudf::lists::drop_list_duplicates + * @brief Generate list offsets from entry offsets * - * @param stream CUDA stream used for device memory operations and kernel launches. + * Generate an array of list offsets for the final result lists column. The list + * offsets of the original lists column are also taken into account to make sure the result lists + * column will have the same empty list rows (if any) as in the original lists column. + * + * @param[in] num_entries The number of unique entries after removing duplicates + * @param[in] entries_list_offsets The mapping from list entries to their list offsets + * @param[out] original_offsets The list offsets of the original lists column, which + * will also be used to store the new list offsets + * @param[in] stream CUDA stream used for device memory operations and kernel launches + * @param[in] mr Device resource used to allocate memory */ -std::unique_ptr drop_list_duplicates(lists_column_view const& lists_column, - null_equality nulls_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +void generate_offsets(size_type num_entries, + column_view const& entries_list_offsets, + mutable_column_view const& original_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { + auto const new_offsets = allocate_like(original_offsets, mask_allocation_policy::NEVER, mr); /* - * Given input = { {1, 1, 2, 1, 3}, {4}, {5, 6, 6, 6, 5} } - * The output can be { {1, 2, 3}, {4}, {5, 6} } - * - * 1. Call segmented sort on the lists, obtaining - * sorted_lists = { {1, 1, 1, 2, 3}, {4}, {5, 5, 6, 6, 6} }, and - * - * 2. Generate ordered indices for the list entries - * indices = { {0, 1, 2, 3, 4}, {5}, {6, 7, 8, 9, 10} } - * - * 3. Remove list indices if the list entries are duplicated, obtaining - * unique_indices = { {0, 3, 4}, {5}, {6, 8} } - * - * 4. Gather list entries using the unique_indices as gather map - * (remember to deal with null elements) - * - * 5. Regenerate list offsets and null mask - * - * Corner cases: - * - null entries in a list: depending on the nulls_equal policy, if it is set to EQUAL then - * only one null entry is kept. - * - null rows: just return null rows, nothing changes. - * - NaN entries in a list: NaNs should be treated as equal, thus only one NaN value is kept. - * - Nested types are not supported---the function should throw logic_error. + * Firstly, generate list offsets for the unique entries, ignoring empty lists (if any) + * If entries_list_offsets = {1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 3 }, num_entries = 11, + * then output = { 0, 4, 5, 8, 10 } */ - - if (lists_column.is_empty()) return cudf::empty_like(lists_column.parent()); - if (cudf::is_nested(lists_column.child().type())) - CUDF_FAIL("Nested types are not supported in drop_list_duplicates."); + auto const end_copy = thrust::copy_if( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries + 1), + new_offsets->mutable_view().begin(), + [num_entries, offsets_ptr = entries_list_offsets.begin()] __device__(size_type i) + -> bool { return i == 0 || i == num_entries || offsets_ptr[i] != offsets_ptr[i - 1]; }); /* - * 1. Call segmented sort on the lists + * Generate a prefix sum of number of empty lists, storing inplace to the original lists + * offsets + * If the original list offsets is { 0, 0, 5, 5, 6, 6 } (there are 2 empty lists), + * and new_offsets = { 0, 4, 6 }, + * then output = { 0, 1, 1, 2, 2, 3} */ - // std::unique_ptr sorted_lists; - // if (nulls_equal == null_equality::EQUAL) - // sorted_lists = cudf::lists::sort_lists(lists_column, order::ASCENDING, null_order::AFTER); - // else - // sorted_lists = cudf::lists::sort_lists(lists_column, order::ASCENDING, null_order::BEFORE); - - auto print = [](const cudf::column_view& arr) { - thrust::host_vector host_data(arr.size()); - CUDA_TRY(cudaMemcpy( - host_data.data(), arr.data(), arr.size() * sizeof(int), cudaMemcpyDeviceToHost)); - for (int i = 0; i < arr.size(); ++i) { printf("%d\n", host_data[i]); } - printf("\n\n\n"); - }; - - // Get the original offsets, which may not start from 0 - auto segment_offsets = cudf::detail::slice( - lists_column.offsets(), {lists_column.offset(), lists_column.offsets().size()}, stream)[0]; - // Make 0-based offsets so we can create a new column using those offsets - auto output_offset = allocate_like(segment_offsets, mask_allocation_policy::RETAIN, mr); - thrust::transform(rmm::exec_policy(stream), - segment_offsets.begin(), - segment_offsets.end(), - output_offset->mutable_view().begin(), - [first = segment_offsets.begin()] __device__(auto offset_index) { - return offset_index - *first; - }); - - // return sorted_lists; + auto const iter_trans_begin = cudf::detail::make_counting_transform_iterator( + 0, [offsets = original_offsets.begin()] __device__(size_type i) -> size_type { + return (i > 0 && offsets[i] == offsets[i - 1]) ? 1 : 0; + }); + thrust::inclusive_scan(rmm::exec_policy(stream), + iter_trans_begin, + iter_trans_begin + original_offsets.size(), + original_offsets.begin()); /* - * 2,3. Generate ordered indices for the list entries and remove list indices if the list entries - * are duplicated + * Generate the final list offsets + * If the original list offsets are { 0, 0, 5, 5, 6, 6 }, the new offsets are { 0, 4, 6 }, + * and the prefix sums of empty lists are { 0, 1, 1, 2, 2, 3 }, + * then output = { 0, 0, 4, 4, 5, 5 } */ - auto const child = lists_column.get_sliced_child(stream); - auto const entry_offsets = get_entry_offsets(child.size(), output_offset->view(), stream, mr); - - // if (nulls_equal == null_equality::EQUAL) - // sorted_lists = cudf::lists::sort_lists(lists_column, order::ASCENDING, null_order::AFTER); - // else - // sorted_lists = cudf::lists::sort_lists(lists_column, order::ASCENDING, null_order::BEFORE); - - auto output_child = type_dispatcher( - child.type(), SortPairs{}, child, output_offset->view(), null_order::AFTER, stream, mr); - // output_child = type_dispatcher( - // child.type(), MakeUniquePairs{}, child, entry_offsets, null_precedence, stream, mr); - - printf("line %d\n", __LINE__); - - printf("line %d\n", __LINE__); - - auto output_child_and_entry_offsets = get_unique_entries(output_child->view(), - entry_offsets->view(), - - nulls_equal, - stream, - mr) - ->release(); - - printf("line %d\n", __LINE__); - - auto null_mask = cudf::detail::copy_bitmask(lists_column.parent(), stream, mr); - - printf("line %d\n", __LINE__); + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(original_offsets.size()), + original_offsets.begin(), + [prefix_sum_empty_lists = original_offsets.begin(), + offsets = new_offsets->view().begin()] __device__(size_type i) -> size_type { + return offsets[i - prefix_sum_empty_lists[i]]; + }); +} +/** + * @copydoc cudf::lists::drop_list_duplicates + * + * @param stream CUDA stream used for device memory operations and kernel launches + */ +std::unique_ptr drop_list_duplicates(lists_column_view const& lists_column, + null_equality nulls_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (lists_column.is_empty()) return cudf::empty_like(lists_column.parent()); + if (cudf::is_nested(lists_column.child().type())) + CUDF_FAIL("Nested types are not supported in drop_list_duplicates."); - print(output_child->view()); + /* Call segmented sort on the list elements */ + auto const sorted_lists = lists::sort_lists(lists_column, order::ASCENDING, null_order::AFTER); - print(output_offset->view()); + /* Flatten all entries (depth = 1) of the lists column */ + auto const all_lists_entries = lists_column_view(sorted_lists->view()).get_sliced_child(stream); - print(output_child_and_entry_offsets.back()->view()); - /* - * 0 4 5 8 10 - */ + /* Generate a 0-based offset column */ + auto lists_offsets = detail::generate_clean_offsets(lists_column, stream, mr); - auto unique_entry_offsets = output_child_and_entry_offsets.back()->view().data(); + /* Generate a mapping from list entries to offsets of the lists containing those entries */ + auto const entries_list_offsets = detail::generate_entry_list_offsets( + all_lists_entries.size(), lists_offsets->view(), stream, mr); - auto counting_iter = thrust::make_counting_iterator(0); - // auto result_end = - thrust::copy_if(rmm::exec_policy(stream), - counting_iter, - counting_iter + child.size(), - output_offset->mutable_view().begin(), - [size = child.size(), unique_entry_offsets] __device__(size_type i) -> bool { - return i == 0 || i == size || - unique_entry_offsets[i] != unique_entry_offsets[i - 1]; - }); + /* Copy non-duplicated entries (along with their list offsets) to new arrays */ + auto unique_entries_and_list_offsets = + all_lists_entries.has_nulls() + ? detail::get_unique_entries_and_list_offsets( + all_lists_entries, entries_list_offsets->view(), nulls_equal, stream, mr) + : detail::get_unique_entries_and_list_offsets( + all_lists_entries, entries_list_offsets->view(), nulls_equal, stream, mr); - print(output_offset->view()); + /* Generate offsets for the new lists column */ + detail::generate_offsets(unique_entries_and_list_offsets.front()->size(), + unique_entries_and_list_offsets.back()->view(), + lists_offsets->mutable_view(), + stream, + mr); - // Assemble list column & return + /* Construct a new list column without duplicated entries */ return make_lists_column(lists_column.size(), - std::move(output_offset), - std::move(output_child_and_entry_offsets.front()), + std::move(lists_offsets), + std::move(unique_entries_and_list_offsets.front()), lists_column.null_count(), - std::move(null_mask)); - - /* - * 4. Gather list entries using the unique_indices as gather map - */ - // return cudf::detail::gather(lists_data, - // unique_indices, - // cudf::out_of_bounds_policy::DONT_CHECK, - // cudf::detail::negative_index_policy::NOT_ALLOWED, - // stream, - // mr); - - /* - * 5. Regenerate list offsets and null mask - */ - - // cudf::detail::gather_bitmask(cudf::table_view{{child}}, - // mutable_indices_view.begin(), - // output_cols, - // cudf::detail::gather_bitmask_op::DONT_CHECK, - // stream, - // mr); - - return empty_like(lists_column.parent()); + cudf::detail::copy_bitmask(lists_column.parent(), stream, mr)); } } // namespace detail From 5a9228881dbe6b13d9db139d5c884d668af60996 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 5 Mar 2021 14:20:52 -0700 Subject: [PATCH 06/25] Update documentation for drop_list_duplicates --- .../cudf/lists/drop_list_duplicates.hpp | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/cpp/include/cudf/lists/drop_list_duplicates.hpp b/cpp/include/cudf/lists/drop_list_duplicates.hpp index 1da69b09d81..e5e53bd6035 100644 --- a/cpp/include/cudf/lists/drop_list_duplicates.hpp +++ b/cpp/include/cudf/lists/drop_list_duplicates.hpp @@ -28,27 +28,32 @@ namespace lists { */ /** - * @brief Create a new column by removing duplicated entries from each list in the given - * lists_column + * @brief Create a new lists column by removing duplicated entries from each list element in the + * given lists column * - * Given an `input` list_column_view, the list rows in the column are copied to an output column - * such that their duplicated entries are dropped out to keep only unique entries. The order of - * those entries are not guaranteed to be preserved as in the input column. + * Given an `input` lists_column_view, the list elements in the column are copied to an output lists + * column such that their duplicated entries are dropped out to keep only the unique ones. The + * order of those entries within each list are not guaranteed to be preserved as in the input. * - * If any row in the input column contains nested types, cudf::logic_error will be thrown. + * Notes: + * - If any row (list element) in the input column contains nested types, cudf::logic_error will be + * thrown. + * - By current implementation, entries in the output lists are sorted by ascending order (nulls + * last), but this is not guaranteed in future implementation. * - * @param[in] lists_column the input lists_column_view - * @param[in] nulls_equal flag to denote nulls are considered equal - * @param[in] mr Device memory resource used to allocate the returned column + * @param lists_column The input lists_column_view + * @param nulls_equal Flag to specify whether null entries should be considered equal + * @param mr Device resource used to allocate memory * * @code{.pseudo} - * If the input is { {1, 1, 2, 1, 3}, {4}, {5, 6, 6, 6, 5} } - * Then a valid output can be { {1, 2, 3}, {4}, {5, 6} } - * Note that permuting the entries of each sublist in this output also produces another valid + * lists_column = { {1, 1, 2, 1, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} } + * output = { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL} } + * + * Note that permuting the entries of each list in this output also produces another valid * output. * @endcode * - * @return A list column with list elements having unique entries. + * @return A list column with list elements having unique entries */ std::unique_ptr drop_list_duplicates( lists_column_view const& lists_column, From eaee403f13e6d4f0c53a4cdfd079c15af63bd59b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 5 Mar 2021 19:31:37 -0700 Subject: [PATCH 07/25] Complete unit tests for drop_list_duplicates --- .../lists/drop_list_duplicates_tests.cpp | 265 +++++++----------- 1 file changed, 104 insertions(+), 161 deletions(-) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index aa25800028e..84f5beaf320 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -14,190 +14,133 @@ * limitations under the License. */ -//#include #include #include -#include #include -#include -//#include -//#include - -struct DropListDuplicatesTest : public cudf::test::BaseFixture { -}; - -TEST_F(DropListDuplicatesTest, NonNullTable) +template +void test_once(cudf::column_view const& input, + LCW const& expected, + cudf::null_equality nulls_equal = cudf::null_equality::EQUAL) { - using LCW = cudf::test::lists_column_wrapper; - - LCW l1{{1, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; - - auto sliced_list = cudf::slice(l1, {1, 5})[0]; - - // Ascending - - LCW expected{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; - auto results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{sliced_list}, - cudf::null_equality::EQUAL); - printf("line %d\n", __LINE__); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, true); + auto const results = + cudf::lists::drop_list_duplicates(cudf::lists_column_view{input}, nulls_equal); + if (equal_test) + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, true); + else + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected, true); } -TEST_F(DropListDuplicatesTest, NonNullStringTable) -{ - // using LCW = cudf::test::lists_column_wrapper; - - using LCW = cudf::test::lists_column_wrapper; - // LCW l1{{"hello", "apple"}, "world", "world", "my"}; - - LCW l1{{1, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; - - auto sliced_list = cudf::slice(l1, {1, 5})[0]; - - // Ascending - - LCW expected{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; - auto results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{sliced_list}, - cudf::null_equality::EQUAL); - printf("line %d\n", __LINE__); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, true); -} +struct DropListDuplicatesTest : public cudf::test::BaseFixture { +}; -#if 0 -TEST_F(DropListDuplicatesTest, NonNullTable) +TEST_F(DropListDuplicatesTest, InvalidCasesTests) { - cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 5}}; - cudf::test::fixed_width_column_wrapper col2{{4, 5, 3, 4, 9, 4}}; - cudf::test::fixed_width_column_wrapper col1_key{{20, 20, 20, 19, 21, 9}}; - cudf::test::fixed_width_column_wrapper col2_key{{19, 19, 20, 20, 9, 21}}; - - cudf::table_view input{{col1, col2, col1_key, col2_key}}; - std::vector keys{2, 3}; - - // Keep first of duplicate - // The expected table would be sorted in ascending order with respect to keys - cudf::test::fixed_width_column_wrapper exp_col1_first{{5, 5, 5, 3, 8}}; - cudf::test::fixed_width_column_wrapper exp_col2_first{{4, 4, 4, 3, 9}}; - cudf::test::fixed_width_column_wrapper exp_col1_key_first{{9, 19, 20, 20, 21}}; - cudf::test::fixed_width_column_wrapper exp_col2_key_first{{21, 20, 19, 20, 9}}; - cudf::table_view expected_first{ - {exp_col1_first, exp_col2_first, exp_col1_key_first, exp_col2_key_first}}; - - auto got_first = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); - - // keep last of duplicate - cudf::test::fixed_width_column_wrapper exp_col1_last{{5, 5, 4, 3, 8}}; - cudf::test::fixed_width_column_wrapper exp_col2_last{{4, 4, 5, 3, 9}}; - cudf::test::fixed_width_column_wrapper exp_col1_key_last{{9, 19, 20, 20, 21}}; - cudf::test::fixed_width_column_wrapper exp_col2_key_last{{21, 20, 19, 20, 9}}; - cudf::table_view expected_last{ - {exp_col1_last, exp_col2_last, exp_col1_key_last, exp_col2_key_last}}; - - auto got_last = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); - - // Keep unique - cudf::test::fixed_width_column_wrapper exp_col1_unique{{5, 5, 3, 8}}; - cudf::test::fixed_width_column_wrapper exp_col2_unique{{4, 4, 3, 9}}; - cudf::test::fixed_width_column_wrapper exp_col1_key_unique{{9, 19, 20, 21}}; - cudf::test::fixed_width_column_wrapper exp_col2_key_unique{{21, 20, 20, 9}}; - cudf::table_view expected_unique{ - {exp_col1_unique, exp_col2_unique, exp_col1_key_unique, exp_col2_key_unique}}; - - auto got_unique = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); + using ILCW = cudf::test::lists_column_wrapper; + using FLCW = cudf::test::lists_column_wrapper; + using SLCW = cudf::test::lists_column_wrapper; + + /* Lists of nested types are not supported */ + EXPECT_THROW(cudf::lists::drop_list_duplicates(cudf::lists_column_view{ILCW{ILCW{{1, 2}, {3}}}}), + cudf::logic_error); + EXPECT_THROW(cudf::lists::drop_list_duplicates(cudf::lists_column_view{FLCW{FLCW{{1, 2}, {3}}}}), + cudf::logic_error); + EXPECT_THROW( + cudf::lists::drop_list_duplicates(cudf::lists_column_view{SLCW{SLCW{SLCW{"dummy string"}}}}), + cudf::logic_error); } -TEST_F(DropListDuplicatesTest, WithNull) +TEST_F(DropListDuplicatesTest, FloatingPointTestsNonNull) { - cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; - cudf::table_view input{{col, key}}; - std::vector keys{1}; - - // Keep first of duplicate - cudf::test::fixed_width_column_wrapper exp_col_first{{4, 5, 5, 8}, {0, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_first{{20, 19, 20, 21}, {0, 1, 1, 1}}; - cudf::table_view expected_first{{exp_col_first, exp_key_col_first}}; - auto got_first = drop_duplicates( - input, keys, cudf::duplicate_keep_option::KEEP_FIRST, cudf::null_equality::EQUAL); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); - - // Keep last of duplicate - cudf::test::fixed_width_column_wrapper exp_col_last{{3, 1, 5, 8}, {1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_last{{20, 19, 20, 21}, {0, 1, 1, 1}}; - cudf::table_view expected_last{{exp_col_last, exp_key_col_last}}; - auto got_last = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); - - // Keep unique of duplicate - cudf::test::fixed_width_column_wrapper exp_col_unique{{5, 8}, {1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_unique{{20, 21}, {1, 1}}; - cudf::table_view expected_unique{{exp_col_unique, exp_key_col_unique}}; - auto got_unique = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); + using float_type = double; + using LCW = cudf::test::lists_column_wrapper; + + /* Trivial cases */ + test_once(LCW{{}}, LCW{{}}); + test_once(LCW{{0, 1, 2, 3, 4, 5}, {}}, LCW{{0, 1, 2, 3, 4, 5}, {}}); + + /* Multiple empty lists */ + test_once(LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, + LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); + + /* Lists contain inf/nan */ + auto const inf = std::numeric_limits::infinity(); + auto const nan = std::numeric_limits::quiet_NaN(); + test_once(LCW{0, 1, 2, 0, 1, 2, 0, 1, 2, inf, inf, inf}, LCW{0, 1, 2, inf}); + test_once(LCW{0, 1, 2, 0, 1, 2, 0, 1, 2, nan, nan, nan}, LCW{0, 1, 2, nan}); + test_once(LCW{nan, nan, nan, 0, 1, 2, 0, 1, 2, 0, 1, 2, inf, inf, inf}, + LCW{0, 1, 2, nan, inf}); + test_once(LCW{nan, inf, nan, inf, nan, inf}, LCW{nan, inf}); } -TEST_F(DropListDuplicatesTest, StringKeyColumn) +TEST_F(DropListDuplicatesTest, IntegerTestsNonNull) { - cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::strings_column_wrapper key_col{{"all", "new", "all", "new", "the", "strings"}, - {1, 1, 1, 0, 1, 1}}; - cudf::table_view input{{col, key_col}}; - std::vector keys{1}; - cudf::test::fixed_width_column_wrapper exp_col_last{{5, 3, 4, 1, 8}, {1, 1, 0, 1, 1}}; - cudf::test::strings_column_wrapper exp_key_col_last{{"new", "all", "new", "strings", "the"}, - {0, 1, 1, 1, 1}}; - cudf::table_view expected_last{{exp_col_last, exp_key_col_last}}; - - auto got_last = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); -} + using LCW = cudf::test::lists_column_wrapper; -TEST_F(DropListDuplicatesTest, EmptyInputTable) -{ - cudf::test::fixed_width_column_wrapper col(std::initializer_list{}); - cudf::table_view input{{col}}; - std::vector keys{1, 2}; + /* Trivial cases */ + test_once(LCW{{}}, LCW{{}}); + test_once(LCW{{0, 1, 2, 3, 4, 5}, {}}, LCW{{0, 1, 2, 3, 4, 5}, {}}); - auto got = drop_duplicates( - input, keys, cudf::duplicate_keep_option::KEEP_FIRST, cudf::null_equality::EQUAL); + /* Multiple empty lists */ + test_once(LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, + LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); + /* Sliced list column */ + auto const list0 = LCW{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; + auto const list1 = cudf::slice(list0, {1, 5})[0]; + test_once(list0, LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); + test_once(list1, LCW{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); } -TEST_F(DropListDuplicatesTest, NoColumnInputTable) +TEST_F(DropListDuplicatesTest, IntegerTestsWithNulls) { - cudf::table_view input{std::vector()}; - std::vector keys{1, 2}; - - auto got = drop_duplicates( - input, keys, cudf::duplicate_keep_option::KEEP_FIRST, cudf::null_equality::EQUAL); - - CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); + using LCW = cudf::test::lists_column_wrapper; + auto const null = std::numeric_limits::max(); + + /* null lists */ + test_once( + LCW{{{3, 2, 1, 4, 1}, {5}, {}, {}, {10, 8, 9}, {6, 7}}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2 && i != 3; })}, + LCW{ + {{1, 2, 3, 4}, {5}, {}, {}, {8, 9, 10}, {6, 7}}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2 && i != 3; })}); + + /* null entries are equal */ + test_once( + LCW{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, + LCW{{1, 3, 5, 7, 9, null}, std::initializer_list{true, true, true, true, true, false}}); + + /* nulls entries are not equal */ + test_once( + LCW{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, + LCW{ + {1, 3, 5, 7, 9, null, null, null, null, null}, + std::initializer_list{true, true, true, true, true, false, false, false, false, false}}, + cudf::null_equality::UNEQUAL); } -TEST_F(DropListDuplicatesTest, EmptyKeys) +TEST_F(DropListDuplicatesTest, StringTestsNonNull) { - cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper empty_col{}; - cudf::table_view input{{col}}; - std::vector keys{}; - - auto got = drop_duplicates( - input, keys, cudf::duplicate_keep_option::KEEP_FIRST, cudf::null_equality::EQUAL); - - CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); + using LCW = cudf::test::lists_column_wrapper; + + /* Trivial cases */ + test_once(LCW{{}}, LCW{{}}); + test_once(LCW{"this", "is", "a", "string"}, LCW{"a", "is", "string", "this"}); + + /* One list column */ + test_once(LCW{"this", "is", "is", "is", "a", "string", "string"}, + LCW{"a", "is", "string", "this"}); + + /* Multiple lists column */ + test_once(LCW{LCW{"this", "is", "a", "no duplicate", "string"}, + LCW{"this", "is", "is", "a", "one duplicate", "string"}, + LCW{"this", "is", "is", "is", "a", "two duplicates", "string"}, + LCW{"this", "is", "is", "is", "is", "a", "three duplicates", "string"}}, + LCW{LCW{"a", "is", "no duplicate", "string", "this"}, + LCW{"a", "is", "one duplicate", "string", "this"}, + LCW{"a", "is", "string", "this", "two duplicates"}, + LCW{"a", "is", "string", "this", "three duplicates"}}); } - -#endif From fbb72612505b3d1f76c60037fc3cb89cf418f732 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 7 Mar 2021 14:10:35 -0700 Subject: [PATCH 08/25] Rewrite unit tests for drop_list_duplicates --- .../lists/drop_list_duplicates_tests.cpp | 146 +++++++++++------- 1 file changed, 86 insertions(+), 60 deletions(-) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index 84f5beaf320..ca82da52b17 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -19,6 +19,12 @@ #include #include +using float_type = float; +using int_type = int32_t; +using INT_LCW = cudf::test::lists_column_wrapper; +using FLT_LCW = cudf::test::lists_column_wrapper; +using STR_LCW = cudf::test::lists_column_wrapper; + template void test_once(cudf::column_view const& input, LCW const& expected, @@ -37,86 +43,79 @@ struct DropListDuplicatesTest : public cudf::test::BaseFixture { TEST_F(DropListDuplicatesTest, InvalidCasesTests) { - using ILCW = cudf::test::lists_column_wrapper; - using FLCW = cudf::test::lists_column_wrapper; - using SLCW = cudf::test::lists_column_wrapper; - /* Lists of nested types are not supported */ - EXPECT_THROW(cudf::lists::drop_list_duplicates(cudf::lists_column_view{ILCW{ILCW{{1, 2}, {3}}}}), - cudf::logic_error); - EXPECT_THROW(cudf::lists::drop_list_duplicates(cudf::lists_column_view{FLCW{FLCW{{1, 2}, {3}}}}), - cudf::logic_error); EXPECT_THROW( - cudf::lists::drop_list_duplicates(cudf::lists_column_view{SLCW{SLCW{SLCW{"dummy string"}}}}), + cudf::lists::drop_list_duplicates(cudf::lists_column_view{INT_LCW{INT_LCW{{1, 2}, {3}}}}), + cudf::logic_error); + EXPECT_THROW( + cudf::lists::drop_list_duplicates(cudf::lists_column_view{FLT_LCW{FLT_LCW{{1, 2}, {3}}}}), cudf::logic_error); + EXPECT_THROW(cudf::lists::drop_list_duplicates( + cudf::lists_column_view{STR_LCW{STR_LCW{STR_LCW{"dummy string"}}}}), + cudf::logic_error); } TEST_F(DropListDuplicatesTest, FloatingPointTestsNonNull) { - using float_type = double; - using LCW = cudf::test::lists_column_wrapper; - /* Trivial cases */ - test_once(LCW{{}}, LCW{{}}); - test_once(LCW{{0, 1, 2, 3, 4, 5}, {}}, LCW{{0, 1, 2, 3, 4, 5}, {}}); + test_once(FLT_LCW{{}}, FLT_LCW{{}}); + test_once(FLT_LCW{{0, 1, 2, 3, 4, 5}, {}}, FLT_LCW{{0, 1, 2, 3, 4, 5}, {}}); /* Multiple empty lists */ - test_once(LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, - LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); + test_once(FLT_LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, + FLT_LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); + auto constexpr inf = std::numeric_limits::infinity(); + auto constexpr nan = std::numeric_limits::quiet_NaN(); /* Lists contain inf/nan */ - auto const inf = std::numeric_limits::infinity(); - auto const nan = std::numeric_limits::quiet_NaN(); - test_once(LCW{0, 1, 2, 0, 1, 2, 0, 1, 2, inf, inf, inf}, LCW{0, 1, 2, inf}); - test_once(LCW{0, 1, 2, 0, 1, 2, 0, 1, 2, nan, nan, nan}, LCW{0, 1, 2, nan}); - test_once(LCW{nan, nan, nan, 0, 1, 2, 0, 1, 2, 0, 1, 2, inf, inf, inf}, - LCW{0, 1, 2, nan, inf}); - test_once(LCW{nan, inf, nan, inf, nan, inf}, LCW{nan, inf}); + test_once(FLT_LCW{0, 1, 2, 0, 1, 2, 0, 1, 2, inf, inf, inf}, FLT_LCW{0, 1, 2, inf}); + test_once(FLT_LCW{0, 1, 2, 0, 1, 2, 0, 1, 2, nan, nan, nan}, FLT_LCW{0, 1, 2, nan}); + test_once(FLT_LCW{nan, nan, nan, 0, 1, 2, 0, 1, 2, 0, 1, 2, inf, inf, inf}, + FLT_LCW{0, 1, 2, nan, inf}); + test_once(FLT_LCW{nan, inf, nan, inf, nan, inf}, FLT_LCW{nan, inf}); } TEST_F(DropListDuplicatesTest, IntegerTestsNonNull) { - using LCW = cudf::test::lists_column_wrapper; - /* Trivial cases */ - test_once(LCW{{}}, LCW{{}}); - test_once(LCW{{0, 1, 2, 3, 4, 5}, {}}, LCW{{0, 1, 2, 3, 4, 5}, {}}); + test_once(INT_LCW{{}}, INT_LCW{{}}); + test_once(INT_LCW{{0, 1, 2, 3, 4, 5}, {}}, INT_LCW{{0, 1, 2, 3, 4, 5}, {}}); /* Multiple empty lists */ - test_once(LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, - LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); + test_once(INT_LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, + INT_LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); /* Sliced list column */ - auto const list0 = LCW{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; + auto const list0 = INT_LCW{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; auto const list1 = cudf::slice(list0, {1, 5})[0]; - test_once(list0, LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list1, LCW{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); + test_once(list0, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); + test_once(list1, INT_LCW{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); } TEST_F(DropListDuplicatesTest, IntegerTestsWithNulls) { - using LCW = cudf::test::lists_column_wrapper; - auto const null = std::numeric_limits::max(); + auto constexpr null = std::numeric_limits::max(); /* null lists */ - test_once( - LCW{{{3, 2, 1, 4, 1}, {5}, {}, {}, {10, 8, 9}, {6, 7}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2 && i != 3; })}, - LCW{ - {{1, 2, 3, 4}, {5}, {}, {}, {8, 9, 10}, {6, 7}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2 && i != 3; })}); + test_once(INT_LCW{{{3, 2, 1, 4, 1}, {5}, {}, {}, {10, 8, 9}, {6, 7}}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i != 2 && i != 3; })}, + INT_LCW{{{1, 2, 3, 4}, {5}, {}, {}, {8, 9, 10}, {6, 7}}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i != 2 && i != 3; })}); /* null entries are equal */ test_once( - LCW{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, - LCW{{1, 3, 5, 7, 9, null}, std::initializer_list{true, true, true, true, true, false}}); + INT_LCW{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, + INT_LCW{{1, 3, 5, 7, 9, null}, + std::initializer_list{true, true, true, true, true, false}}); /* nulls entries are not equal */ test_once( - LCW{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, - LCW{ + INT_LCW{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, + INT_LCW{ {1, 3, 5, 7, 9, null, null, null, null, null}, std::initializer_list{true, true, true, true, true, false, false, false, false, false}}, cudf::null_equality::UNEQUAL); @@ -124,23 +123,50 @@ TEST_F(DropListDuplicatesTest, IntegerTestsWithNulls) TEST_F(DropListDuplicatesTest, StringTestsNonNull) { - using LCW = cudf::test::lists_column_wrapper; - /* Trivial cases */ - test_once(LCW{{}}, LCW{{}}); - test_once(LCW{"this", "is", "a", "string"}, LCW{"a", "is", "string", "this"}); + test_once(STR_LCW{{}}, STR_LCW{{}}); + test_once(STR_LCW{"this", "is", "a", "string"}, STR_LCW{"a", "is", "string", "this"}); /* One list column */ - test_once(LCW{"this", "is", "is", "is", "a", "string", "string"}, - LCW{"a", "is", "string", "this"}); + test_once(STR_LCW{"this", "is", "is", "is", "a", "string", "string"}, + STR_LCW{"a", "is", "string", "this"}); /* Multiple lists column */ - test_once(LCW{LCW{"this", "is", "a", "no duplicate", "string"}, - LCW{"this", "is", "is", "a", "one duplicate", "string"}, - LCW{"this", "is", "is", "is", "a", "two duplicates", "string"}, - LCW{"this", "is", "is", "is", "is", "a", "three duplicates", "string"}}, - LCW{LCW{"a", "is", "no duplicate", "string", "this"}, - LCW{"a", "is", "one duplicate", "string", "this"}, - LCW{"a", "is", "string", "this", "two duplicates"}, - LCW{"a", "is", "string", "this", "three duplicates"}}); + test_once( + STR_LCW{STR_LCW{"this", "is", "a", "no duplicate", "string"}, + STR_LCW{"this", "is", "is", "a", "one duplicate", "string"}, + STR_LCW{"this", "is", "is", "is", "a", "two duplicates", "string"}, + STR_LCW{"this", "is", "is", "is", "is", "a", "three duplicates", "string"}}, + STR_LCW{STR_LCW{"a", "is", "no duplicate", "string", "this"}, + STR_LCW{"a", "is", "one duplicate", "string", "this"}, + STR_LCW{"a", "is", "string", "this", "two duplicates"}, + STR_LCW{"a", "is", "string", "this", "three duplicates"}}); +} + +TEST_F(DropListDuplicatesTest, StringTestsWithNulls) +{ + auto const null = std::string(""); + + /* One list column with null entries */ + test_once( + STR_LCW{{"this", null, "is", "is", "is", "a", null, "string", null, "string"}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i != 1 && i != 6 && i != 8; })}, + STR_LCW{{"a", "is", "string", "this", null}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; })}); + + /* Multiple lists column with null lists and null entries */ + test_once( + STR_LCW{{STR_LCW{{"this", null, "is", null, "a", null, "no duplicate", null, "string"}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0; })}, + STR_LCW{}, + STR_LCW{"this", "is", "is", "a", "one duplicate", "string"}}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}, + STR_LCW{ + {STR_LCW{{"a", "is", "no duplicate", "string", "this", null}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i <= 4; })}, + STR_LCW{}, + STR_LCW{"a", "is", "one duplicate", "string", "this"}}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}); } From 23395a678f1fffedb2de0b1758f0b1752c0678f1 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 7 Mar 2021 15:26:29 -0700 Subject: [PATCH 09/25] Update conda recipes --- conda/recipes/libcudf/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 885a22870bb..e0bb80eb4eb 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -132,6 +132,7 @@ test: - test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp - test -f $PREFIX/include/cudf/lists/detail/copying.hpp - test -f $PREFIX/include/cudf/lists/count_elements.hpp + - test -f $PREFIX/include/cudf/lists/drop_list_duplicates.hpp - test -f $PREFIX/include/cudf/lists/extract.hpp - test -f $PREFIX/include/cudf/lists/contains.hpp - test -f $PREFIX/include/cudf/lists/gather.hpp From e865030838475cc2a0cb712509bbb45c65db2d79 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 7 Mar 2021 18:54:22 -0700 Subject: [PATCH 10/25] Remove nan from testing, since comparing with nan produces undefined order. --- .../lists/drop_list_duplicates_tests.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index ca82da52b17..cd4d9271f73 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -65,14 +65,17 @@ TEST_F(DropListDuplicatesTest, FloatingPointTestsNonNull) test_once(FLT_LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, FLT_LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); - auto constexpr inf = std::numeric_limits::infinity(); - auto constexpr nan = std::numeric_limits::quiet_NaN(); - /* Lists contain inf/nan */ - test_once(FLT_LCW{0, 1, 2, 0, 1, 2, 0, 1, 2, inf, inf, inf}, FLT_LCW{0, 1, 2, inf}); - test_once(FLT_LCW{0, 1, 2, 0, 1, 2, 0, 1, 2, nan, nan, nan}, FLT_LCW{0, 1, 2, nan}); - test_once(FLT_LCW{nan, nan, nan, 0, 1, 2, 0, 1, 2, 0, 1, 2, inf, inf, inf}, - FLT_LCW{0, 1, 2, nan, inf}); - test_once(FLT_LCW{nan, inf, nan, inf, nan, inf}, FLT_LCW{nan, inf}); + auto constexpr p_inf = std::numeric_limits::infinity(); + auto constexpr m_inf = -std::numeric_limits::infinity(); + /* + * Lists contain inf + * We can't test for lists containing nan because the order of nan is + * undefined after sorting + */ + test_once(FLT_LCW{0, 1, 2, 0, 1, 2, 0, 1, 2, p_inf, p_inf, p_inf}, + FLT_LCW{0, 1, 2, p_inf}); + test_once(FLT_LCW{p_inf, 0, m_inf, 0, p_inf, 0, m_inf, 0, p_inf, 0, m_inf}, + FLT_LCW{m_inf, 0, p_inf}); } TEST_F(DropListDuplicatesTest, IntegerTestsNonNull) From 81769516babe3ff8e786e31e618742402e3174bf Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 8 Mar 2021 12:50:48 -0700 Subject: [PATCH 11/25] Address review comments: Change doxygen group, change thrust::copy_if to thrust::unique_copy, change comment style, and fix bug (incorrect offset handling for sliced column) --- cpp/include/doxygen_groups.h | 2 +- cpp/src/lists/drop_list_duplicates.cu | 74 ++++++++++++--------------- 2 files changed, 35 insertions(+), 41 deletions(-) diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index c99b60e38c9..3f3efdb7626 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -146,7 +146,7 @@ * @defgroup lists_contains Searching * @defgroup lists_gather Gathering * @defgroup lists_elements Counting - * @defgroup lists_drop_duplicates Dropping duplicated elements + * @defgroup lists_drop_duplicates Filtering * @} * @defgroup nvtext_apis NVText * @{ diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index b12c957ce17..f4588736769 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -56,31 +56,28 @@ std::vector> get_unique_entries_and_list_offsets( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - /* Create an intermediate table, since the comparator only work on tables */ + // Create an intermediate table, since the comparator only work on tables auto const device_input_table = cudf::table_device_view::create(table_view{{all_lists_entries}}, stream); auto const comp = row_equality_comparator( *device_input_table, *device_input_table, nulls_equal == null_equality::EQUAL); auto const num_entries = all_lists_entries.size(); - /* Allocate memory to store the indices of the unique entries */ + // Allocate memory to store the indices of the unique entries auto const unique_indices = cudf::make_numeric_column( entries_list_offsets.type(), num_entries, mask_state::UNALLOCATED, stream); auto const unique_indices_begin = unique_indices->mutable_view().begin(); auto const copy_end = - thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - thrust::counting_iterator(0), - unique_indices_begin, - [last_index = num_entries - 1, comp] __device__(size_type i) { - return i == last_index || !comp(i, i + 1); - }); + thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + unique_indices_begin, + [comp] __device__(size_type i, size_type j) { return comp(i, j); }); + + // Collect unique entries and entry list offsets auto const indices = cudf::detail::slice( unique_indices->view(), 0, thrust::distance(unique_indices_begin, copy_end)); - - /* Collect unique entries and entry list offsets */ return cudf::detail::gather(table_view{{all_lists_entries, entries_list_offsets}}, indices, cudf::out_of_bounds_policy::DONT_CHECK, @@ -113,10 +110,12 @@ std::unique_ptr generate_clean_offsets(lists_column_view const& lists_co rmm::mr::device_memory_resource* mr) { auto const list_offsets = - cudf::detail::slice( - lists_column.offsets(), {lists_column.offset(), lists_column.offsets().size()}, stream) + cudf::detail::slice(lists_column.offsets(), + {lists_column.offset(), lists_column.offset() + lists_column.size() + 1}, + stream) .front(); - auto output_offsets = allocate_like(list_offsets, mask_allocation_policy::NEVER, mr); + auto output_offsets = make_numeric_column( + list_offsets.type(), lists_column.size() + 1, mask_state::UNALLOCATED, stream, mr); thrust::transform( rmm::exec_policy(stream), list_offsets.begin(), @@ -182,11 +181,10 @@ void generate_offsets(size_type num_entries, rmm::mr::device_memory_resource* mr) { auto const new_offsets = allocate_like(original_offsets, mask_allocation_policy::NEVER, mr); - /* - * Firstly, generate list offsets for the unique entries, ignoring empty lists (if any) - * If entries_list_offsets = {1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 3 }, num_entries = 11, - * then output = { 0, 4, 5, 8, 10 } - */ + + // Firstly, generate list offsets for the unique entries, ignoring empty lists (if any) + // If entries_list_offsets = {1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 3 }, num_entries = 11, + // then output = { 0, 4, 5, 8, 10 } auto const end_copy = thrust::copy_if( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -195,13 +193,11 @@ void generate_offsets(size_type num_entries, [num_entries, offsets_ptr = entries_list_offsets.begin()] __device__(size_type i) -> bool { return i == 0 || i == num_entries || offsets_ptr[i] != offsets_ptr[i - 1]; }); - /* - * Generate a prefix sum of number of empty lists, storing inplace to the original lists - * offsets - * If the original list offsets is { 0, 0, 5, 5, 6, 6 } (there are 2 empty lists), - * and new_offsets = { 0, 4, 6 }, - * then output = { 0, 1, 1, 2, 2, 3} - */ + // Generate a prefix sum of number of empty lists, storing inplace to the original lists + // offsets + // If the original list offsets is { 0, 0, 5, 5, 6, 6 } (there are 2 empty lists), + // and new_offsets = { 0, 4, 6 }, + // then output = { 0, 1, 1, 2, 2, 3} auto const iter_trans_begin = cudf::detail::make_counting_transform_iterator( 0, [offsets = original_offsets.begin()] __device__(size_type i) -> size_type { return (i > 0 && offsets[i] == offsets[i - 1]) ? 1 : 0; @@ -211,12 +207,10 @@ void generate_offsets(size_type num_entries, iter_trans_begin + original_offsets.size(), original_offsets.begin()); - /* - * Generate the final list offsets - * If the original list offsets are { 0, 0, 5, 5, 6, 6 }, the new offsets are { 0, 4, 6 }, - * and the prefix sums of empty lists are { 0, 1, 1, 2, 2, 3 }, - * then output = { 0, 0, 4, 4, 5, 5 } - */ + // Generate the final list offsets + // If the original list offsets are { 0, 0, 5, 5, 6, 6 }, the new offsets are { 0, 4, 6 }, + // and the prefix sums of empty lists are { 0, 1, 1, 2, 2, 3 }, + // then output = { 0, 0, 4, 4, 5, 5 } thrust::transform( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -241,20 +235,20 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu if (cudf::is_nested(lists_column.child().type())) CUDF_FAIL("Nested types are not supported in drop_list_duplicates."); - /* Call segmented sort on the list elements */ + // Call segmented sort on the list elements auto const sorted_lists = lists::sort_lists(lists_column, order::ASCENDING, null_order::AFTER); - /* Flatten all entries (depth = 1) of the lists column */ + // Flatten all entries (depth = 1) of the lists column auto const all_lists_entries = lists_column_view(sorted_lists->view()).get_sliced_child(stream); - /* Generate a 0-based offset column */ + // Generate a 0-based offset column auto lists_offsets = detail::generate_clean_offsets(lists_column, stream, mr); - /* Generate a mapping from list entries to offsets of the lists containing those entries */ + // Generate a mapping from list entries to offsets of the lists containing those entries auto const entries_list_offsets = detail::generate_entry_list_offsets( all_lists_entries.size(), lists_offsets->view(), stream, mr); - /* Copy non-duplicated entries (along with their list offsets) to new arrays */ + // Copy non-duplicated entries (along with their list offsets) to new arrays auto unique_entries_and_list_offsets = all_lists_entries.has_nulls() ? detail::get_unique_entries_and_list_offsets( @@ -262,14 +256,14 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu : detail::get_unique_entries_and_list_offsets( all_lists_entries, entries_list_offsets->view(), nulls_equal, stream, mr); - /* Generate offsets for the new lists column */ + // Generate offsets for the new lists column detail::generate_offsets(unique_entries_and_list_offsets.front()->size(), unique_entries_and_list_offsets.back()->view(), lists_offsets->mutable_view(), stream, mr); - /* Construct a new list column without duplicated entries */ + // Construct a new list column without duplicated entries return make_lists_column(lists_column.size(), std::move(lists_offsets), std::move(unique_entries_and_list_offsets.front()), From 985d3c30ef9afe2f855adb6d9f55fcf032a99ede Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 8 Mar 2021 12:59:24 -0700 Subject: [PATCH 12/25] Address review comments: Change comment style, and add an TODO item to the unit tests --- .../lists/drop_list_duplicates_tests.cpp | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index cd4d9271f73..4fa59869b5f 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -43,7 +43,7 @@ struct DropListDuplicatesTest : public cudf::test::BaseFixture { TEST_F(DropListDuplicatesTest, InvalidCasesTests) { - /* Lists of nested types are not supported */ + // Lists of nested types are not supported EXPECT_THROW( cudf::lists::drop_list_duplicates(cudf::lists_column_view{INT_LCW{INT_LCW{{1, 2}, {3}}}}), cudf::logic_error); @@ -57,21 +57,20 @@ TEST_F(DropListDuplicatesTest, InvalidCasesTests) TEST_F(DropListDuplicatesTest, FloatingPointTestsNonNull) { - /* Trivial cases */ + // Trivial cases test_once(FLT_LCW{{}}, FLT_LCW{{}}); test_once(FLT_LCW{{0, 1, 2, 3, 4, 5}, {}}, FLT_LCW{{0, 1, 2, 3, 4, 5}, {}}); - /* Multiple empty lists */ + // Multiple empty lists test_once(FLT_LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, FLT_LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); auto constexpr p_inf = std::numeric_limits::infinity(); auto constexpr m_inf = -std::numeric_limits::infinity(); - /* - * Lists contain inf - * We can't test for lists containing nan because the order of nan is - * undefined after sorting - */ + + // Lists contain inf + // We can't test for lists containing nan because the order of nan is + // undefined after sorting test_once(FLT_LCW{0, 1, 2, 0, 1, 2, 0, 1, 2, p_inf, p_inf, p_inf}, FLT_LCW{0, 1, 2, p_inf}); test_once(FLT_LCW{p_inf, 0, m_inf, 0, p_inf, 0, m_inf, 0, p_inf, 0, m_inf}, @@ -80,17 +79,18 @@ TEST_F(DropListDuplicatesTest, FloatingPointTestsNonNull) TEST_F(DropListDuplicatesTest, IntegerTestsNonNull) { - /* Trivial cases */ + // Trivial cases test_once(INT_LCW{{}}, INT_LCW{{}}); test_once(INT_LCW{{0, 1, 2, 3, 4, 5}, {}}, INT_LCW{{0, 1, 2, 3, 4, 5}, {}}); - /* Multiple empty lists */ + // Multiple empty lists test_once(INT_LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, INT_LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); - /* Sliced list column */ + // Sliced list column auto const list0 = INT_LCW{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; auto const list1 = cudf::slice(list0, {1, 5})[0]; + // TODO: add a test for cudf::slice(list0, {1, 3})[0] after the issue#7530 is fixed test_once(list0, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); test_once(list1, INT_LCW{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); } @@ -99,7 +99,7 @@ TEST_F(DropListDuplicatesTest, IntegerTestsWithNulls) { auto constexpr null = std::numeric_limits::max(); - /* null lists */ + // null lists test_once(INT_LCW{{{3, 2, 1, 4, 1}, {5}, {}, {}, {10, 8, 9}, {6, 7}}, cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i != 2 && i != 3; })}, @@ -107,14 +107,14 @@ TEST_F(DropListDuplicatesTest, IntegerTestsWithNulls) cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i != 2 && i != 3; })}); - /* null entries are equal */ + // null entries are equal test_once( INT_LCW{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, INT_LCW{{1, 3, 5, 7, 9, null}, std::initializer_list{true, true, true, true, true, false}}); - /* nulls entries are not equal */ + // nulls entries are not equal test_once( INT_LCW{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, @@ -126,15 +126,15 @@ TEST_F(DropListDuplicatesTest, IntegerTestsWithNulls) TEST_F(DropListDuplicatesTest, StringTestsNonNull) { - /* Trivial cases */ + // Trivial cases test_once(STR_LCW{{}}, STR_LCW{{}}); test_once(STR_LCW{"this", "is", "a", "string"}, STR_LCW{"a", "is", "string", "this"}); - /* One list column */ + // One list column test_once(STR_LCW{"this", "is", "is", "is", "a", "string", "string"}, STR_LCW{"a", "is", "string", "this"}); - /* Multiple lists column */ + // Multiple lists column test_once( STR_LCW{STR_LCW{"this", "is", "a", "no duplicate", "string"}, STR_LCW{"this", "is", "is", "a", "one duplicate", "string"}, @@ -150,7 +150,7 @@ TEST_F(DropListDuplicatesTest, StringTestsWithNulls) { auto const null = std::string(""); - /* One list column with null entries */ + // One list column with null entries test_once( STR_LCW{{"this", null, "is", "is", "is", "a", null, "string", null, "string"}, cudf::detail::make_counting_transform_iterator( @@ -158,7 +158,7 @@ TEST_F(DropListDuplicatesTest, StringTestsWithNulls) STR_LCW{{"a", "is", "string", "this", null}, cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; })}); - /* Multiple lists column with null lists and null entries */ + // Multiple lists column with null lists and null entries test_once( STR_LCW{{STR_LCW{{"this", null, "is", null, "a", null, "no duplicate", null, "string"}, cudf::detail::make_counting_transform_iterator( From e2357103430b47b639ea1de07c303b59f35eb624 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 8 Mar 2021 13:19:32 -0700 Subject: [PATCH 13/25] Fix a bug in unique copy that forgot to check whether 2 entries are from the same list or not --- cpp/src/lists/drop_list_duplicates.cu | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index f4588736769..1487c51095d 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -68,12 +68,13 @@ std::vector> get_unique_entries_and_list_offsets( entries_list_offsets.type(), num_entries, mask_state::UNALLOCATED, stream); auto const unique_indices_begin = unique_indices->mutable_view().begin(); - auto const copy_end = - thrust::unique_copy(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - unique_indices_begin, - [comp] __device__(size_type i, size_type j) { return comp(i, j); }); + auto const copy_end = thrust::unique_copy( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + unique_indices_begin, + [list_offsets = entries_list_offsets.begin(), comp] __device__( + size_type i, size_type j) { return list_offsets[i] == list_offsets[j] && comp(i, j); }); // Collect unique entries and entry list offsets auto const indices = cudf::detail::slice( From 897adc90404b963a9ee81cef8edd70f7d4a784c2 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 8 Mar 2021 13:30:50 -0700 Subject: [PATCH 14/25] Add a corner test case for drop_list_duplicates --- cpp/tests/lists/drop_list_duplicates_tests.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index 4fa59869b5f..52c0fb166da 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -87,6 +87,11 @@ TEST_F(DropListDuplicatesTest, IntegerTestsNonNull) test_once(INT_LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, INT_LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); + // List containing similar entries + test_once( + INT_LCW{{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 2, 2, 2}, {2, 2, 2, 2, 3, 3, 3, 3}}, + INT_LCW{{1}, {1, 2}, {2, 3}}); + // Sliced list column auto const list0 = INT_LCW{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; auto const list1 = cudf::slice(list0, {1, 5})[0]; From 27bc327e8bf4ee72de5cf755eb347df8c4b49e21 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 9 Mar 2021 09:39:43 -0700 Subject: [PATCH 15/25] Change comments in unit tests of drop_list_duplicates --- cpp/tests/lists/drop_list_duplicates_tests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index 52c0fb166da..c789eb723ac 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -87,7 +87,7 @@ TEST_F(DropListDuplicatesTest, IntegerTestsNonNull) test_once(INT_LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, INT_LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); - // List containing similar entries + // Adjacent lists containing the same entries test_once( INT_LCW{{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 2, 2, 2}, {2, 2, 2, 2, 3, 3, 3, 3}}, INT_LCW{{1}, {1, 2}, {2, 3}}); From 8fe0e98e4af3c32a88c40550a133648925cd5bac Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 9 Mar 2021 10:13:20 -0700 Subject: [PATCH 16/25] Address review comments: Change size_type to offset_type and use default the device resource for allocating temporary memory --- cpp/src/lists/drop_list_duplicates.cu | 97 ++++++++++++++------------- 1 file changed, 51 insertions(+), 46 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 1487c51095d..59ae2b31cea 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -32,12 +32,13 @@ namespace cudf { namespace lists { namespace detail { +using offset_type = lists_column_view::offset_type; /** * @brief Copy list entries and entry list offsets ignoring duplicates * * Given an array of all entries flattened from a list column and an array that maps each entry to - * the offset of the list containing that entry, those entries and list offsets are copied into new - * arrays such that the duplicated entries within each list will be ignored. + * the offset of the list containing that entry, those entries and list offsets are copied into + * new arrays such that the duplicated entries within each list will be ignored. * * @param all_lists_entries The input array containing all list entries * @param entries_list_offsets A map from list entries to their corresponding list offsets @@ -45,8 +46,8 @@ namespace detail { * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device resource used to allocate memory * - * @return A pair of columns, the first one contains unique list entries and the second one contains - * their corresponding list offsets + * @return A pair of columns, the first one contains unique list entries and the second one + * contains their corresponding list offsets */ template std::vector> get_unique_entries_and_list_offsets( @@ -66,15 +67,16 @@ std::vector> get_unique_entries_and_list_offsets( // Allocate memory to store the indices of the unique entries auto const unique_indices = cudf::make_numeric_column( entries_list_offsets.type(), num_entries, mask_state::UNALLOCATED, stream); - auto const unique_indices_begin = unique_indices->mutable_view().begin(); + auto const unique_indices_begin = unique_indices->mutable_view().begin(); auto const copy_end = thrust::unique_copy( rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_entries), unique_indices_begin, - [list_offsets = entries_list_offsets.begin(), comp] __device__( - size_type i, size_type j) { return list_offsets[i] == list_offsets[j] && comp(i, j); }); + [list_offsets = entries_list_offsets.begin(), comp] __device__(auto i, auto j) { + return list_offsets[i] == list_offsets[j] && comp(i, j); + }); // Collect unique entries and entry list offsets auto const indices = cudf::detail::slice( @@ -117,12 +119,13 @@ std::unique_ptr generate_clean_offsets(lists_column_view const& lists_co .front(); auto output_offsets = make_numeric_column( list_offsets.type(), lists_column.size() + 1, mask_state::UNALLOCATED, stream, mr); - thrust::transform( - rmm::exec_policy(stream), - list_offsets.begin(), - list_offsets.end(), - output_offsets->mutable_view().begin(), - [first = list_offsets.begin()] __device__(auto offset) { return offset - *first; }); + thrust::transform(rmm::exec_policy(stream), + list_offsets.begin(), + list_offsets.end(), + output_offsets->mutable_view().begin(), + [first = list_offsets.begin()] __device__(auto offset) { + return offset - *first; + }); return output_offsets; } @@ -147,17 +150,19 @@ std::unique_ptr generate_clean_offsets(lists_column_view const& lists_co */ std::unique_ptr generate_entry_list_offsets(size_type num_entries, column_view const& offsets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::cuda_stream_view stream) { - auto entry_list_offsets = - make_numeric_column(offsets.type(), num_entries, mask_state::UNALLOCATED, stream, mr); + auto entry_list_offsets = make_numeric_column(offsets.type(), + num_entries, + mask_state::UNALLOCATED, + stream, + rmm::mr::get_current_device_resource()); thrust::upper_bound(rmm::exec_policy(stream), - offsets.begin(), - offsets.begin() + offsets.size(), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - entry_list_offsets->mutable_view().begin()); + offsets.begin(), + offsets.end(), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + entry_list_offsets->mutable_view().begin()); return entry_list_offsets; } @@ -181,18 +186,19 @@ void generate_offsets(size_type num_entries, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const new_offsets = allocate_like(original_offsets, mask_allocation_policy::NEVER, mr); - - // Firstly, generate list offsets for the unique entries, ignoring empty lists (if any) + // Firstly, generate temporary list offsets for the unique entries, ignoring empty lists (if any) // If entries_list_offsets = {1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 3 }, num_entries = 11, - // then output = { 0, 4, 5, 8, 10 } - auto const end_copy = thrust::copy_if( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries + 1), - new_offsets->mutable_view().begin(), - [num_entries, offsets_ptr = entries_list_offsets.begin()] __device__(size_type i) - -> bool { return i == 0 || i == num_entries || offsets_ptr[i] != offsets_ptr[i - 1]; }); + // then new_offsets = { 0, 4, 5, 8, 10 } + auto const new_offsets = allocate_like( + original_offsets, mask_allocation_policy::NEVER, rmm::mr::get_current_device_resource()); + thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries + 1), + new_offsets->mutable_view().begin(), + [num_entries, offsets_ptr = entries_list_offsets.begin()] __device__( + auto i) -> bool { + return i == 0 || i == num_entries || offsets_ptr[i] != offsets_ptr[i - 1]; + }); // Generate a prefix sum of number of empty lists, storing inplace to the original lists // offsets @@ -200,27 +206,26 @@ void generate_offsets(size_type num_entries, // and new_offsets = { 0, 4, 6 }, // then output = { 0, 1, 1, 2, 2, 3} auto const iter_trans_begin = cudf::detail::make_counting_transform_iterator( - 0, [offsets = original_offsets.begin()] __device__(size_type i) -> size_type { + 0, [offsets = original_offsets.begin()] __device__(auto i) { return (i > 0 && offsets[i] == offsets[i - 1]) ? 1 : 0; }); thrust::inclusive_scan(rmm::exec_policy(stream), iter_trans_begin, iter_trans_begin + original_offsets.size(), - original_offsets.begin()); + original_offsets.begin()); // Generate the final list offsets // If the original list offsets are { 0, 0, 5, 5, 6, 6 }, the new offsets are { 0, 4, 6 }, // and the prefix sums of empty lists are { 0, 1, 1, 2, 2, 3 }, // then output = { 0, 0, 4, 4, 5, 5 } - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(original_offsets.size()), - original_offsets.begin(), - [prefix_sum_empty_lists = original_offsets.begin(), - offsets = new_offsets->view().begin()] __device__(size_type i) -> size_type { - return offsets[i - prefix_sum_empty_lists[i]]; - }); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(original_offsets.size()), + original_offsets.begin(), + [prefix_sum_empty_lists = original_offsets.begin(), + offsets = new_offsets->view().begin()] __device__(auto i) { + return offsets[i - prefix_sum_empty_lists[i]]; + }); } /** * @copydoc cudf::lists::drop_list_duplicates @@ -246,8 +251,8 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu auto lists_offsets = detail::generate_clean_offsets(lists_column, stream, mr); // Generate a mapping from list entries to offsets of the lists containing those entries - auto const entries_list_offsets = detail::generate_entry_list_offsets( - all_lists_entries.size(), lists_offsets->view(), stream, mr); + auto const entries_list_offsets = + detail::generate_entry_list_offsets(all_lists_entries.size(), lists_offsets->view(), stream); // Copy non-duplicated entries (along with their list offsets) to new arrays auto unique_entries_and_list_offsets = From d879b01fe5019ab341a17fa47570e41ff768ac74 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 9 Mar 2021 19:00:23 -0700 Subject: [PATCH 17/25] Address review comments: Change coding style and clean up --- cpp/src/lists/drop_list_duplicates.cu | 24 +++++++++---------- .../lists/drop_list_duplicates_tests.cpp | 5 ++-- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 59ae2b31cea..bef20503537 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -112,20 +112,17 @@ std::unique_ptr generate_clean_offsets(lists_column_view const& lists_co rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const list_offsets = - cudf::detail::slice(lists_column.offsets(), - {lists_column.offset(), lists_column.offset() + lists_column.size() + 1}, - stream) - .front(); - auto output_offsets = make_numeric_column( - list_offsets.type(), lists_column.size() + 1, mask_state::UNALLOCATED, stream, mr); + auto output_offsets = make_numeric_column(data_type{type_to_id()}, + lists_column.size() + 1, + mask_state::UNALLOCATED, + stream, + mr); + auto const base_offsets = lists_column.offsets().begin() + lists_column.offset(); thrust::transform(rmm::exec_policy(stream), - list_offsets.begin(), - list_offsets.end(), + base_offsets, + base_offsets + lists_column.size() + 1, output_offsets->mutable_view().begin(), - [first = list_offsets.begin()] __device__(auto offset) { - return offset - *first; - }); + [first = base_offsets] __device__(auto offset) { return offset - *first; }); return output_offsets; } @@ -238,8 +235,9 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu rmm::mr::device_memory_resource* mr) { if (lists_column.is_empty()) return cudf::empty_like(lists_column.parent()); - if (cudf::is_nested(lists_column.child().type())) + if (cudf::is_nested(lists_column.child().type())) { CUDF_FAIL("Nested types are not supported in drop_list_duplicates."); + } // Call segmented sort on the list elements auto const sorted_lists = lists::sort_lists(lists_column, order::ASCENDING, null_order::AFTER); diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index c789eb723ac..c792c93ca9e 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -32,10 +32,11 @@ void test_once(cudf::column_view const& input, { auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{input}, nulls_equal); - if (equal_test) + if (equal_test) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, true); - else + } else { CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected, true); + } } struct DropListDuplicatesTest : public cudf::test::BaseFixture { From b22252d128249bfbf7d5ac70645b17f894ba765b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 9 Mar 2021 21:57:31 -0700 Subject: [PATCH 18/25] Change comments --- cpp/include/cudf/lists/drop_list_duplicates.hpp | 10 ++++------ cpp/src/lists/drop_list_duplicates.cu | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/cpp/include/cudf/lists/drop_list_duplicates.hpp b/cpp/include/cudf/lists/drop_list_duplicates.hpp index e5e53bd6035..0939bd7956a 100644 --- a/cpp/include/cudf/lists/drop_list_duplicates.hpp +++ b/cpp/include/cudf/lists/drop_list_duplicates.hpp @@ -31,14 +31,12 @@ namespace lists { * @brief Create a new lists column by removing duplicated entries from each list element in the * given lists column * + * @throw cudf::logic_error if any row (list element) in the input column is a nested type. + * * Given an `input` lists_column_view, the list elements in the column are copied to an output lists * column such that their duplicated entries are dropped out to keep only the unique ones. The - * order of those entries within each list are not guaranteed to be preserved as in the input. - * - * Notes: - * - If any row (list element) in the input column contains nested types, cudf::logic_error will be - * thrown. - * - By current implementation, entries in the output lists are sorted by ascending order (nulls + * order of those entries within each list are not guaranteed to be preserved as in the input. In + * the current implementation, entries in the output lists are sorted by ascending order (nulls * last), but this is not guaranteed in future implementation. * * @param lists_column The input lists_column_view diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index bef20503537..171931b4047 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -267,7 +267,7 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu stream, mr); - // Construct a new list column without duplicated entries + // Construct a new lists column without duplicated entries return make_lists_column(lists_column.size(), std::move(lists_offsets), std::move(unique_entries_and_list_offsets.front()), From 3f7956b424cca9cae2672dac82509937bdefdd47 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 10 Mar 2021 08:59:53 -0700 Subject: [PATCH 19/25] Address review comments: Remove unused variable, shortening the codes, and fix example in the comments --- cpp/src/lists/drop_list_duplicates.cu | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 171931b4047..7cddc149590 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -32,6 +32,7 @@ namespace cudf { namespace lists { namespace detail { +namespace { using offset_type = lists_column_view::offset_type; /** * @brief Copy list entries and entry list offsets ignoring duplicates @@ -112,17 +113,17 @@ std::unique_ptr generate_clean_offsets(lists_column_view const& lists_co rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto output_offsets = make_numeric_column(data_type{type_to_id()}, + auto output_offsets = make_numeric_column(data_type{type_to_id()}, lists_column.size() + 1, mask_state::UNALLOCATED, stream, mr); - auto const base_offsets = lists_column.offsets().begin() + lists_column.offset(); - thrust::transform(rmm::exec_policy(stream), - base_offsets, - base_offsets + lists_column.size() + 1, - output_offsets->mutable_view().begin(), - [first = base_offsets] __device__(auto offset) { return offset - *first; }); + thrust::transform( + rmm::exec_policy(stream), + lists_column.offsets_begin(), + lists_column.offsets_end(), + output_offsets->mutable_view().begin(), + [first = lists_column.offsets_begin()] __device__(auto offset) { return offset - *first; }); return output_offsets; } @@ -180,11 +181,10 @@ std::unique_ptr generate_entry_list_offsets(size_type num_entries, void generate_offsets(size_type num_entries, column_view const& entries_list_offsets, mutable_column_view const& original_offsets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::cuda_stream_view stream) { // Firstly, generate temporary list offsets for the unique entries, ignoring empty lists (if any) - // If entries_list_offsets = {1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 3 }, num_entries = 11, + // If entries_list_offsets = {1, 1, 1, 1, 2, 3, 3, 3, 4, 4 }, num_entries = 10, // then new_offsets = { 0, 4, 5, 8, 10 } auto const new_offsets = allocate_like( original_offsets, mask_allocation_policy::NEVER, rmm::mr::get_current_device_resource()); @@ -264,8 +264,7 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu detail::generate_offsets(unique_entries_and_list_offsets.front()->size(), unique_entries_and_list_offsets.back()->view(), lists_offsets->mutable_view(), - stream, - mr); + stream); // Construct a new lists column without duplicated entries return make_lists_column(lists_column.size(), @@ -275,6 +274,7 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu cudf::detail::copy_bitmask(lists_column.parent(), stream, mr)); } +} // anonymous namespace } // namespace detail /** From c759c9845148aa4702ebd4370cc12be99b84d3f9 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 10 Mar 2021 14:33:59 -0700 Subject: [PATCH 20/25] Add CUDF_FUNC_RANGE() to the public drop_list_duplicate API --- cpp/src/lists/drop_list_duplicates.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 7cddc149590..29ee682ea3c 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -284,6 +285,7 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu null_equality nulls_equal, rmm::mr::device_memory_resource* mr) { + CUDF_FUNC_RANGE(); return detail::drop_list_duplicates(lists_column, nulls_equal, rmm::cuda_stream_default, mr); } From 3bf0df3828fcaca9a55c72e0e65c344afb096c0f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 10 Mar 2021 17:11:51 -0700 Subject: [PATCH 21/25] Address review comments: Using the detail::sort_lists API to sort list elements, which takes into account the current stream value --- cpp/src/lists/drop_list_duplicates.cu | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 29ee682ea3c..5ed3251f031 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -20,8 +20,8 @@ #include #include #include +#include #include -#include #include #include @@ -240,8 +240,12 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu CUDF_FAIL("Nested types are not supported in drop_list_duplicates."); } - // Call segmented sort on the list elements - auto const sorted_lists = lists::sort_lists(lists_column, order::ASCENDING, null_order::AFTER); + // Call segmented sort on the list elements and store them in a temporary column sorted_list + auto const sorted_lists = detail::sort_lists(lists_column, + order::ASCENDING, + null_order::AFTER, + stream, + rmm::mr::get_current_device_resource()); // Flatten all entries (depth = 1) of the lists column auto const all_lists_entries = lists_column_view(sorted_lists->view()).get_sliced_child(stream); From e8b6ed365706d0a736d7551d26315383c13ad704 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 11 Mar 2021 11:32:20 -0700 Subject: [PATCH 22/25] Switch to use the default device resource for detail::sort_lists --- cpp/src/lists/drop_list_duplicates.cu | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 5ed3251f031..1eb105d296d 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include @@ -241,11 +241,8 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu } // Call segmented sort on the list elements and store them in a temporary column sorted_list - auto const sorted_lists = detail::sort_lists(lists_column, - order::ASCENDING, - null_order::AFTER, - stream, - rmm::mr::get_current_device_resource()); + auto const sorted_lists = + detail::sort_lists(lists_column, order::ASCENDING, null_order::AFTER, stream); // Flatten all entries (depth = 1) of the lists column auto const all_lists_entries = lists_column_view(sorted_lists->view()).get_sliced_child(stream); From 6fa87441e68514a7912158d0f5d5787ce41b4fc6 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 11 Mar 2021 12:42:20 -0700 Subject: [PATCH 23/25] Add tests for sliced lists column --- cpp/tests/lists/drop_list_duplicates_tests.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index c792c93ca9e..31a6d93a94d 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -95,10 +95,16 @@ TEST_F(DropListDuplicatesTest, IntegerTestsNonNull) // Sliced list column auto const list0 = INT_LCW{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; - auto const list1 = cudf::slice(list0, {1, 5})[0]; - // TODO: add a test for cudf::slice(list0, {1, 3})[0] after the issue#7530 is fixed + auto const list1 = cudf::slice(list0, {0, 5})[0]; + auto const list2 = cudf::slice(list0, {1, 5})[0]; + auto const list3 = cudf::slice(list0, {1, 3})[0]; + auto const list4 = cudf::slice(list0, {0, 3})[0]; + test_once(list0, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list1, INT_LCW{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); + test_once(list1, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); + test_once(list2, INT_LCW{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); + test_once(list3, INT_LCW{{1, 2, 3, 4}, {5}}); + test_once(list4, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}}); } TEST_F(DropListDuplicatesTest, IntegerTestsWithNulls) From 36d5822760fb9869eb7096e4bb2715bbca8c9e0d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 11 Mar 2021 16:05:01 -0700 Subject: [PATCH 24/25] Update cpp/tests/lists/drop_list_duplicates_tests.cpp Co-authored-by: David <45795991+davidwendt@users.noreply.github.com> --- cpp/tests/lists/drop_list_duplicates_tests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index 31a6d93a94d..a1ffeaa9bee 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -52,7 +52,7 @@ TEST_F(DropListDuplicatesTest, InvalidCasesTests) cudf::lists::drop_list_duplicates(cudf::lists_column_view{FLT_LCW{FLT_LCW{{1, 2}, {3}}}}), cudf::logic_error); EXPECT_THROW(cudf::lists::drop_list_duplicates( - cudf::lists_column_view{STR_LCW{STR_LCW{STR_LCW{"dummy string"}}}}), + cudf::lists_column_view{STR_LCW{STR_LCW{STR_LCW{"string"}}}}), cudf::logic_error); } From ae68606ec4a079dd16af926fed15cfd9457d8891 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 11 Mar 2021 17:18:54 -0700 Subject: [PATCH 25/25] Fix style error in drop_list_duplicates_tests.cpp --- cpp/tests/lists/drop_list_duplicates_tests.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index a1ffeaa9bee..0948ba96f62 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -51,9 +51,9 @@ TEST_F(DropListDuplicatesTest, InvalidCasesTests) EXPECT_THROW( cudf::lists::drop_list_duplicates(cudf::lists_column_view{FLT_LCW{FLT_LCW{{1, 2}, {3}}}}), cudf::logic_error); - EXPECT_THROW(cudf::lists::drop_list_duplicates( - cudf::lists_column_view{STR_LCW{STR_LCW{STR_LCW{"string"}}}}), - cudf::logic_error); + EXPECT_THROW( + cudf::lists::drop_list_duplicates(cudf::lists_column_view{STR_LCW{STR_LCW{STR_LCW{"string"}}}}), + cudf::logic_error); } TEST_F(DropListDuplicatesTest, FloatingPointTestsNonNull)