From 073cbd880bfdd1f57a621b1929ddfa3370eb6642 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 11 Jul 2022 13:54:34 -0700 Subject: [PATCH] Implement `lists::distinct` and `cudf::detail::stable_distinct` (#11149) This adds new APIs: * `lists::distinct` as a stream compaction component of `cudf::lists::`, which extracts the distinct elements of each list in a lists column. The new API does a similar job to `lists::drop_list_duplicate`, but it can operate on arbitrary data types, whereas `lists::drop_list_duplicate` only works on basic data types and flat structs. * `cudf::detail::stable_distinct`, which is implemented in the main stream compaction module. This API is introduced as a `detail::` API only for now (it can be exposed publicly later if needed). It produces the same set of output rows as `cudf::distinct`, but with the input row order preserved. It is used as a building block to implement `lists::distinct`. This PR is a prerequisite for implementing set-like operations (https://github.com/rapidsai/cudf/pull/11043). Note: The new `lists::distinct` API will completely replace `lists::drop_list_duplicate`, which will then be deprecated. That will be done in follow-up work. 
Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Yunsong Wang (https://github.com/PointKernel) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/11149 --- cpp/CMakeLists.txt | 5 +- cpp/include/cudf/detail/stream_compaction.hpp | 30 + .../cudf/lists/detail/stream_compaction.hpp | 12 + cpp/include/cudf/lists/stream_compaction.hpp | 25 + .../apply_boolean_mask.cu | 0 cpp/src/lists/stream_compaction/distinct.cu | 84 ++ cpp/src/lists/utilities.cu | 55 ++ cpp/src/lists/utilities.hpp | 53 ++ cpp/src/stream_compaction/stable_distinct.cu | 66 ++ cpp/tests/CMakeLists.txt | 3 +- .../apply_boolean_mask_tests.cpp} | 0 .../stream_compaction/distinct_tests.cpp | 760 ++++++++++++++++++ 12 files changed, 1091 insertions(+), 2 deletions(-) rename cpp/src/lists/{ => stream_compaction}/apply_boolean_mask.cu (100%) create mode 100644 cpp/src/lists/stream_compaction/distinct.cu create mode 100644 cpp/src/lists/utilities.cu create mode 100644 cpp/src/lists/utilities.hpp create mode 100644 cpp/src/stream_compaction/stable_distinct.cu rename cpp/tests/lists/{apply_boolean_mask_test.cpp => stream_compaction/apply_boolean_mask_tests.cpp} (100%) create mode 100644 cpp/tests/lists/stream_compaction/distinct_tests.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4013f3894eb..3d604fb1030 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -369,7 +369,6 @@ add_library( src/join/mixed_join_size_kernel_nulls.cu src/join/mixed_join_size_kernels_semi.cu src/join/semi_join.cu - src/lists/apply_boolean_mask.cu src/lists/contains.cu src/lists/combine/concatenate_list_elements.cu src/lists/combine/concatenate_rows.cu @@ -387,6 +386,9 @@ add_library( src/lists/lists_column_view.cu src/lists/segmented_sort.cu src/lists/sequences.cu + src/lists/stream_compaction/apply_boolean_mask.cu + src/lists/stream_compaction/distinct.cu + src/lists/utilities.cu src/merge/merge.cu 
src/partitioning/partitioning.cu src/partitioning/round_robin.cu @@ -452,6 +454,7 @@ add_library( src/stream_compaction/distinct_reduce.cu src/stream_compaction/drop_nans.cu src/stream_compaction/drop_nulls.cu + src/stream_compaction/stable_distinct.cu src/stream_compaction/unique.cu src/stream_compaction/unique_count.cu src/strings/attributes.cu diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 8ba7b0cb996..0db929c523c 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -22,6 +22,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -89,6 +90,35 @@ std::unique_ptr distinct( rmm::cuda_stream_view stream = cudf::default_stream_value, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create a new table without duplicate rows. + * + * Given an `input` table_view, each row is copied to the output table to create a set of distinct + * rows. The row order is guaranteed to be preserved as in the input. + * + * If there are duplicate rows, which row is copied depends on the specified value of the `keep` + * parameter. + * + * This API produces exactly the same set of output rows as `cudf::distinct`. + * + * @param input The input table + * @param keys Vector of indices indicating key columns in the `input` table + * @param keep Copy any, first, last, or none of the found duplicates + * @param nulls_equal Flag to specify whether null elements should be considered as equal + * @param nans_equal Flag to specify whether NaN elements should be considered as equal + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table + * @return A table containing the resulting distinct rows + */ +std::unique_ptr
stable_distinct( + table_view const& input, + std::vector const& keys, + duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Create a column of indices of all distinct rows in the input table. * diff --git a/cpp/include/cudf/lists/detail/stream_compaction.hpp b/cpp/include/cudf/lists/detail/stream_compaction.hpp index 0e9f2ec16c4..ba3dbb6594b 100644 --- a/cpp/include/cudf/lists/detail/stream_compaction.hpp +++ b/cpp/include/cudf/lists/detail/stream_compaction.hpp @@ -34,4 +34,16 @@ std::unique_ptr apply_boolean_mask( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::lists::distinct + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr distinct( + lists_column_view const& input, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace cudf::lists::detail diff --git a/cpp/include/cudf/lists/stream_compaction.hpp b/cpp/include/cudf/lists/stream_compaction.hpp index c7a9731eb65..26d3846ab3d 100644 --- a/cpp/include/cudf/lists/stream_compaction.hpp +++ b/cpp/include/cudf/lists/stream_compaction.hpp @@ -55,4 +55,29 @@ std::unique_ptr apply_boolean_mask( lists_column_view const& boolean_mask, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create a new list column without duplicate elements in each list. + * + * Given a lists column `input`, distinct elements of each list are copied to the corresponding + * output list. 
The order of lists is preserved while the order of elements within each list is not + * guaranteed. + * + * Example: + * @code{.pseudo} + * input = { {0, 1, 2, 3, 2}, {3, 1, 2}, null, {4, null, null, 5} } + * result = { {0, 1, 2, 3}, {3, 1, 2}, null, {4, null, 5} } + * @endcode + * + * @param input The input lists column + * @param nulls_equal Flag to specify whether null elements should be considered as equal + * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal + * @param mr Device memory resource used to allocate the returned object + * @return The resulting lists column containing lists without duplicates + */ +std::unique_ptr distinct( + lists_column_view const& input, + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace cudf::lists diff --git a/cpp/src/lists/apply_boolean_mask.cu b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu similarity index 100% rename from cpp/src/lists/apply_boolean_mask.cu rename to cpp/src/lists/stream_compaction/apply_boolean_mask.cu diff --git a/cpp/src/lists/stream_compaction/distinct.cu b/cpp/src/lists/stream_compaction/distinct.cu new file mode 100644 index 00000000000..c88209292de --- /dev/null +++ b/cpp/src/lists/stream_compaction/distinct.cu @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace cudf::lists { +namespace detail { + +std::unique_ptr distinct(lists_column_view const& input, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Algorithm: + // - Generate labels for the child elements. + // - Get distinct rows of the table {labels, child} using `stable_distinct`. + // - Build the output lists column from the output distinct rows above. + + if (input.is_empty()) { return empty_like(input.parent()); } + + auto const child = input.get_sliced_child(stream); + auto const labels = generate_labels(input, child.size(), stream); + + auto const distinct_table = + cudf::detail::stable_distinct(table_view{{labels->view(), child}}, // input table + std::vector{0, 1}, // keys + duplicate_keep_option::KEEP_ANY, + nulls_equal, + nans_equal, + stream, + mr); + + auto out_offsets = + reconstruct_offsets(distinct_table->get_column(0).view(), input.size(), stream, mr); + + return make_lists_column(input.size(), + std::move(out_offsets), + std::move(distinct_table->release().back()), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + stream, + mr); +} + +} // namespace detail + +std::unique_ptr distinct(lists_column_view const& input, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::distinct(input, nulls_equal, nans_equal, cudf::default_stream_value, mr); +} + +} // namespace cudf::lists diff --git a/cpp/src/lists/utilities.cu b/cpp/src/lists/utilities.cu new file mode 100644 index 00000000000..95582ad5715 --- /dev/null +++ b/cpp/src/lists/utilities.cu @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utilities.hpp" + +#include +#include + +namespace cudf::lists::detail { + +std::unique_ptr generate_labels(lists_column_view const& input, + size_type n_elements, + rmm::cuda_stream_view stream) +{ + auto labels = make_numeric_column( + data_type(type_to_id()), n_elements, cudf::mask_state::UNALLOCATED, stream); + auto const labels_begin = labels->mutable_view().template begin(); + cudf::detail::label_segments( + input.offsets_begin(), input.offsets_end(), labels_begin, labels_begin + n_elements, stream); + return labels; +} + +std::unique_ptr reconstruct_offsets(column_view const& labels, + size_type n_lists, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + +{ + auto out_offsets = make_numeric_column( + data_type{type_to_id()}, n_lists + 1, mask_state::UNALLOCATED, stream, mr); + + auto const labels_begin = labels.template begin(); + auto const offsets_begin = out_offsets->mutable_view().template begin(); + cudf::detail::labels_to_offsets(labels_begin, + labels_begin + labels.size(), + offsets_begin, + offsets_begin + out_offsets->size(), + stream); + return out_offsets; +} + +} // namespace cudf::lists::detail diff --git a/cpp/src/lists/utilities.hpp b/cpp/src/lists/utilities.hpp new file mode 100644 index 00000000000..aab7ec1ad81 --- /dev/null +++ b/cpp/src/lists/utilities.hpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace cudf::lists::detail { + +/** + * @brief Generate list labels for elements in the child column of the input lists column. + * + * @param input The input lists column + * @param n_elements The number of elements in the child column of the input lists column + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A column containing list labels corresponding to each element in the child column + */ +std::unique_ptr generate_labels(lists_column_view const& input, + size_type n_elements, + rmm::cuda_stream_view stream); + +/** + * @brief Reconstruct an offsets column from the input list labels column. 
+ * + * @param labels The list labels corresponding to each list element + * @param n_lists The number of lists to build the offsets column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned object + * @return The output offsets column + */ +std::unique_ptr reconstruct_offsets(column_view const& labels, + size_type n_lists, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} // namespace cudf::lists::detail diff --git a/cpp/src/stream_compaction/stable_distinct.cu b/cpp/src/stream_compaction/stable_distinct.cu new file mode 100644 index 00000000000..dc80a454777 --- /dev/null +++ b/cpp/src/stream_compaction/stable_distinct.cu @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cudf::detail { + +std::unique_ptr
stable_distinct(table_view const& input, + std::vector const& keys, + duplicate_keep_option keep, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) { + return empty_like(input); + } + + auto const distinct_indices = + get_distinct_indices(input.select(keys), keep, nulls_equal, nans_equal, stream); + + // Markers to denote which rows to be copied to the output. + auto const output_markers = [&] { + auto markers = rmm::device_uvector(input.num_rows(), stream); + thrust::uninitialized_fill(rmm::exec_policy(stream), markers.begin(), markers.end(), false); + thrust::scatter( + rmm::exec_policy(stream), + thrust::constant_iterator(true, 0), + thrust::constant_iterator(true, static_cast(distinct_indices.size())), + distinct_indices.begin(), + markers.begin()); + return markers; + }(); + + return cudf::detail::copy_if( + input, + [output_markers = output_markers.begin()] __device__(auto const idx) { + return *(output_markers + idx); + }, + stream, + mr); +} + +} // namespace cudf::detail diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 8d8fc3210bb..fd350f26617 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -472,7 +472,6 @@ ConfigureTest(AST_TEST ast/transform_tests.cpp) # * lists tests ---------------------------------------------------------------------------------- ConfigureTest( LISTS_TEST - lists/apply_boolean_mask_test.cpp lists/combine/concatenate_list_elements_tests.cpp lists/combine/concatenate_rows_tests.cpp lists/contains_tests.cpp @@ -482,6 +481,8 @@ ConfigureTest( lists/extract_tests.cpp lists/sequences_tests.cpp lists/sort_lists_tests.cpp + lists/stream_compaction/apply_boolean_mask_tests.cpp + lists/stream_compaction/distinct_tests.cpp ) # ################################################################################################## diff --git 
a/cpp/tests/lists/apply_boolean_mask_test.cpp b/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp similarity index 100% rename from cpp/tests/lists/apply_boolean_mask_test.cpp rename to cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp diff --git a/cpp/tests/lists/stream_compaction/distinct_tests.cpp b/cpp/tests/lists/stream_compaction/distinct_tests.cpp new file mode 100644 index 00000000000..93a72cfb9ce --- /dev/null +++ b/cpp/tests/lists/stream_compaction/distinct_tests.cpp @@ -0,0 +1,760 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +using float_type = double; +using namespace cudf::test::iterators; + +auto constexpr null{0}; // null at current level +auto constexpr XXX{0}; // null pushed down from parent level +auto constexpr neg_NaN = -std::numeric_limits::quiet_NaN(); +auto constexpr neg_Inf = -std::numeric_limits::infinity(); +auto constexpr NaN = std::numeric_limits::quiet_NaN(); +auto constexpr Inf = std::numeric_limits::infinity(); +auto constexpr NULL_EQUAL = cudf::null_equality::EQUAL; +auto constexpr NULL_UNEQUAL = cudf::null_equality::UNEQUAL; +auto constexpr NAN_EQUAL = cudf::nan_equality::ALL_EQUAL; +auto constexpr NAN_UNEQUAL = cudf::nan_equality::UNEQUAL; + +using int32s_col = cudf::test::fixed_width_column_wrapper; +using floats_lists = cudf::test::lists_column_wrapper; +using strings_lists = cudf::test::lists_column_wrapper; +using strings_col = cudf::test::strings_column_wrapper; +using structs_col = cudf::test::structs_column_wrapper; +using lists_cv = cudf::lists_column_view; + +namespace { + +auto distinct_sorted(cudf::column_view const& input, + cudf::null_equality nulls_equal = NULL_EQUAL, + cudf::nan_equality nans_equal = NAN_EQUAL) +{ + auto const results = cudf::lists::distinct(lists_cv{input}, nulls_equal, nans_equal); + + // The sorted result will have nulls first and NaNs last. + // In addition, row equality comparisons in tests just ignore NaN sign thus the expected values + // can be just NaN while the input can be mixed of NaN and neg_NaN. 
+ return cudf::lists::sort_lists( + lists_cv{*results}, cudf::order::ASCENDING, cudf::null_order::BEFORE); +} + +} // namespace + +struct ListDistinctTest : public cudf::test::BaseFixture { +}; + +template +struct ListDistinctTypedTest : public cudf::test::BaseFixture { +}; + +using TestTypes = + cudf::test::Concat; + +TYPED_TEST_SUITE(ListDistinctTypedTest, TestTypes); + +TEST_F(ListDistinctTest, TrivialTest) +{ + auto const input = + floats_lists{{floats_lists{{NaN, 5.0, 0.0, 0.0, 0.0, 0.0, null, 0.0}, null_at(6)}, + floats_lists{{NaN, 5.0, 0.0, 0.0, 0.0, 0.0, null, 1.0}, null_at(6)}, + {} /*NULL*/, + floats_lists{{NaN, 5.0, 0.0, 0.0, 0.0, 0.0, null, 1.0}, null_at(6)}}, + null_at(2)}; + + auto const expected = floats_lists{{floats_lists{{null, 0.0, 5.0, NaN}, null_at(0)}, + floats_lists{{null, 0.0, 1.0, 5.0, NaN}, null_at(0)}, + floats_lists{} /*NULL*/, + floats_lists{{null, 0.0, 1.0, 5.0, NaN}, null_at(0)}}, + null_at(2)}; + + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); +} + +TEST_F(ListDistinctTest, FloatingPointTestsWithSignedZero) +{ + // -0.0 and 0.0 should be considered equal. + auto const input = floats_lists{0.0, 1, 2, -0.0, 1, 2, 0.0, 1, 2, -0.0, -0.0, 0.0, 0.0, 3}; + auto const expected = floats_lists{0, 1, 2, 3}; + + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); +} + +TEST_F(ListDistinctTest, FloatingPointTestsWithInf) +{ + auto const input = floats_lists{Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf}; + auto const expected = floats_lists{neg_Inf, 0, Inf}; + + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); +} + +TEST_F(ListDistinctTest, FloatingPointTestsWithNaNs) +{ + auto const input = + floats_lists{0, -1, 1, NaN, 2, 0, neg_NaN, 1, -2, 2, 0, 1, 2, neg_NaN, NaN, NaN, NaN, neg_NaN}; + + // NaNs are equal. 
+ { + auto const expected = floats_lists{-2, -1, 0, 1, 2, NaN}; + + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + // NaNs are unequal. + { + auto const expected = floats_lists{-2, -1, 0, 1, 2, NaN, NaN, NaN, NaN, NaN, NaN, NaN}; + + auto const results_sorted = distinct_sorted(input, NULL_EQUAL, NAN_UNEQUAL); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } +} + +TEST_F(ListDistinctTest, StringTestsNonNull) +{ + // Trivial cases - empty input. + { + auto const input = strings_lists{{}}; + auto const expected = strings_lists{{}}; + + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + // No duplicate. + { + auto const input = strings_lists{"this", "is", "a", "string"}; + auto const expected = strings_lists{"a", "is", "string", "this"}; + + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + // One list column. + { + auto const input = strings_lists{"this", "is", "is", "is", "a", "string", "string"}; + auto const expected = strings_lists{"a", "is", "string", "this"}; + + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + // Multiple lists column. 
+ { + auto const input = strings_lists{ + strings_lists{"this", "is", "a", "no duplicate", "string"}, + strings_lists{"this", "is", "is", "a", "one duplicate", "string"}, + strings_lists{"this", "is", "is", "is", "a", "two duplicates", "string"}, + strings_lists{"this", "is", "is", "is", "is", "a", "three duplicates", "string"}}; + auto const expected = + strings_lists{strings_lists{"a", "is", "no duplicate", "string", "this"}, + strings_lists{"a", "is", "one duplicate", "string", "this"}, + strings_lists{"a", "is", "string", "this", "two duplicates"}, + strings_lists{"a", "is", "string", "this", "three duplicates"}}; + + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } +} + +TEST_F(ListDistinctTest, StringTestsWithNullsEqual) +{ + auto const null = std::string(""); + + // One list column with null entries. + { + auto const input = strings_lists{ + {"this", null, "is", "is", "is", "a", null, "string", null, "string"}, nulls_at({1, 6, 8})}; + auto const expected = strings_lists{{null, "a", "is", "string", "this"}, null_at(0)}; + + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + // Multiple lists column with null lists and null entries. 
+ { + auto const input = strings_lists{ + {strings_lists{{"this", null, "is", null, "a", null, "no duplicate", null, "string"}, + nulls_at({1, 3, 5, 7})}, + strings_lists{}, /* NULL */ + strings_lists{"this", "is", "is", "a", "one duplicate", "string"}}, + null_at(1)}; + auto const expected = + strings_lists{{strings_lists{{null, "a", "is", "no duplicate", "string", "this"}, null_at(0)}, + strings_lists{}, /* NULL */ + strings_lists{"a", "is", "one duplicate", "string", "this"}}, + null_at(1)}; + + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } +} + +TEST_F(ListDistinctTest, StringTestsWithNullsUnequal) +{ + auto const null = std::string(""); + + // One list column with null entries. + { + auto const input = strings_lists{ + {"this", null, "is", "is", "is", "a", null, "string", null, "string"}, nulls_at({1, 6, 8})}; + auto const expected = + strings_lists{{null, null, null, "a", "is", "string", "this"}, nulls_at({0, 1, 2})}; + + auto const results_sorted = distinct_sorted(input, NULL_UNEQUAL); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + // Multiple lists column with null lists and null entries. 
+ { + auto const input = strings_lists{ + {strings_lists{{"this", null, "is", null, "a", null, "no duplicate", null, "string"}, + nulls_at({1, 3, 5, 7})}, + strings_lists{}, /* NULL */ + strings_lists{"this", "is", "is", "a", "one duplicate", "string"}}, + null_at(1)}; + auto const expected = strings_lists{ + {strings_lists{{null, null, null, null, "a", "is", "no duplicate", "string", "this"}, + nulls_at({0, 1, 2, 3})}, + strings_lists{}, /* NULL */ + strings_lists{"a", "is", "one duplicate", "string", "this"}}, + null_at(1)}; + + auto const results_sorted = distinct_sorted(input, NULL_UNEQUAL); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } +} + +TYPED_TEST(ListDistinctTypedTest, TrivialInputTests) +{ + using lists_col = cudf::test::lists_column_wrapper; + + // Empty input. + { + auto const input = lists_col{}; + auto const expected = lists_col{}; + + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + // All input lists are empty. + { + auto const input = lists_col{lists_col{}, lists_col{}, lists_col{}}; + auto const expected = lists_col{lists_col{}, lists_col{}, lists_col{}}; + + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + // Trivial cases. + { + auto const input = lists_col{0, 1, 2, 3, 4, 5}; + auto const expected = lists_col{0, 1, 2, 3, 4, 5}; + + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + // Multiple empty lists. 
+ { + auto const input = lists_col{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}; + auto const expected = lists_col{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}; + + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } +} + +TYPED_TEST(ListDistinctTypedTest, SlicedNonNullInputTests) +{ + using lists_col = cudf::test::lists_column_wrapper; + + auto const input_original = + lists_col{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; + + { + auto const expected = lists_col{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + + auto const results_sorted = distinct_sorted(input_original); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + { + auto const input = cudf::slice(input_original, {0, 5})[0]; + auto const expected = lists_col{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + { + auto const input = cudf::slice(input_original, {1, 5})[0]; + auto const expected = lists_col{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + { + auto const input = cudf::slice(input_original, {1, 3})[0]; + auto const expected = lists_col{{1, 2, 3, 4}, {5}}; + + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + { + auto const input = cudf::slice(input_original, {0, 3})[0]; + auto const expected = lists_col{{1, 2, 3}, {1, 2, 3, 4}, {5}}; + + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } +} + +TYPED_TEST(ListDistinctTypedTest, InputHaveNullsTests) +{ + using lists_col = cudf::test::lists_column_wrapper; + auto constexpr null = TypeParam{0}; + + // Nullable lists. 
+ { + auto const input = lists_col{ + {{3, 2, 1, 4, 1}, {5}, {} /*NULL*/, {} /*NULL*/, {10, 8, 9}, {6, 7}}, nulls_at({2, 3})}; + auto const expected = lists_col{ + {{1, 2, 3, 4}, {5}, {} /*NULL*/, {} /*NULL*/, {8, 9, 10}, {6, 7}}, nulls_at({2, 3})}; + + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + // Nullable child and nulls are equal. + { + auto const input = + lists_col{{null, 1, null, 3, null, 5, null, 7, null, 9}, nulls_at({0, 2, 4, 6, 8})}; + auto const expected = lists_col{{null, 1, 3, 5, 7, 9}, null_at(0)}; + + auto const results_sorted = distinct_sorted(input, NULL_EQUAL); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + // Nullable child and nulls are unequal. + { + auto const input = lists_col{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, nulls_at({0, 2, 4, 6, 8})}; + auto const expected = + lists_col{{null, null, null, null, null, 1, 3, 5, 7, 9}, nulls_at({0, 1, 2, 3, 4})}; + + auto const results_sorted = distinct_sorted(input, NULL_UNEQUAL); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } +} + +// Lists of (int32, string) structs without any nulls: 3 lists of 8 struct rows each. +// The output of distinct_sorted (helper defined outside this chunk; presumably lists::distinct +// followed by a per-list sort - TODO confirm) is compared with hand-sorted expected rows, once for +// the full column and once for the view returned by cudf::slice(..., {1, 3}). +TEST_F(ListDistinctTest, InputListsOfStructsNoNull) +{ + auto const get_structs = [] { + auto child1 = int32s_col{ + 1, 1, 1, 1, 1, 1, 1, 1, // list1 + 1, 1, 1, 1, 2, 1, 2, 2, // list2 + 2, 2, 2, 2, 3, 2, 3, 3 // list3 + }; + auto child2 = strings_col{ + // begin list1 + "Banana", + "Mango", + "Apple", + "Cherry", + "Kiwi", + "Banana", + "Cherry", + "Kiwi", // end list1 + // begin list2 + "Bear", + "Duck", + "Cat", + "Dog", + "Panda", + "Bear", + "Cat", + "Panda", // end list2 + // begin list3 + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÍÍÍÍÍ", + "ÁBC", + "XYZ", + "ÁÁÁ", + "ÁBC", + "XYZ" // end list3 + }; + return structs_col{{child1, child2}}; + }; + + auto const get_expected = [] { + // 5 + 6 + 6 rows (list1, list2, list3), matching the expected offsets {0, 5, 11, 17} below. + auto child1 = int32s_col{1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3}; + auto child2 = strings_col{ + // begin list1 + "Apple", + "Banana", + "Cherry", + "Kiwi", + "Mango", // end list1 + // begin 
list2 + "Bear", + "Cat", + "Dog", + "Duck", + "Cat", + "Panda", // end list2 + // begin list3 + "ÁBC", + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÍÍÍÍÍ", + "XYZ", + "ÁBC" // end list3 + }; + return structs_col{{child1, child2}}; + }; + + // Test full columns. + { + auto const input = cudf::make_lists_column( + 3, int32s_col{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected = cudf::make_lists_column( + 3, int32s_col{0, 5, 11, 17}.release(), get_expected().release(), 0, {}); + + auto const results_sorted = distinct_sorted(*input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *results_sorted); + } + + // Test sliced columns. + { + auto const input_original = cudf::make_lists_column( + 3, int32s_col{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected_original = cudf::make_lists_column( + 3, int32s_col{0, 5, 11, 17}.release(), get_expected().release(), 0, {}); + auto const input = cudf::slice(*input_original, {1, 3})[0]; + auto const expected = cudf::slice(*expected_original, {1, 3})[0]; + + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } +} + +// Same 3 x 8 layout as the no-null test, but with nulls in both children and with struct rows 3 +// and 4 themselves null. The expected data keeps a single null struct row, placed first in list1 +// (null_at(0)), i.e. the two null struct rows of the input collapse into one. +TEST_F(ListDistinctTest, InputListsOfStructsHaveNull) +{ + auto const get_structs = [] { + auto child1 = int32s_col{{ + 1, 1, null, XXX, XXX, 1, 1, 1, // list1 + 1, 1, 1, 1, 2, 1, null, 2, // list2 + null, null, 2, 2, 3, 2, 3, 3 // list3 + }, + nulls_at({2, 14, 16, 17})}; + auto child2 = strings_col{{ + // begin list1 + "Banana", + "Mango", + "Apple", + "XXX", /*NULL*/ + "XXX", /*NULL*/ + "Banana", + "Cherry", + "Kiwi", // end list1 + // begin list2 + "Bear", + "Duck", + "Cat", + "Dog", + "Panda", + "Bear", + "" /*NULL*/, + "Panda", // end list2 + // begin list3 + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÍÍÍÍÍ", + "ÁBC", + "" /*NULL*/, + "ÁÁÁ", + "ÁBC", + "XYZ" // end list3 + }, + nulls_at({14, 20})}; + // Struct rows 3 and 4 are null; XXX fills the child slots under them (XXX is defined outside + // this chunk - presumably a don't-care placeholder, TODO confirm). + return structs_col{{child1, child2}, nulls_at({3, 4})}; + }; + + auto const get_expected = [] { + auto child1 = int32s_col{{ // begin list1 + 
XXX, // placeholder under the null struct row (see null_at(0) below); list1 continues + null, + 1, + 1, + 1, + 1, // end list1 + // begin list2 + null, + 1, + 1, + 1, + 1, + 2, // end list2 + // begin list3 + null, + null, + 2, + 2, + 2, + 3, + 3, + 3}, // end list3 + nulls_at({1, 6, 12, 13})}; + auto child2 = strings_col{{ // begin list1 + "XXX", /*NULL*/ + "Apple", + "Banana", + "Cherry", + "Kiwi", + "Mango", // end list1 + // begin list2 + "", /*NULL*/ + "Bear", + "Cat", + "Dog", + "Duck", + "Panda", // end list2 + // begin list3 + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÁBC", + "ÁÁÁ", + "ÍÍÍÍÍ", + "", /*NULL*/ + "XYZ", + "ÁBC"}, // end list3 + nulls_at({6, 17})}; + return structs_col{{child1, child2}, null_at(0)}; + }; + + // Test full columns. + { + auto const input = cudf::make_lists_column( + 3, int32s_col{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected = cudf::make_lists_column( + 3, int32s_col{0, 6, 12, 20}.release(), get_expected().release(), 0, {}); + + auto const results_sorted = distinct_sorted(*input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *results_sorted); + } + + // Test sliced columns. 
+ { + auto const input_original = cudf::make_lists_column( + 3, int32s_col{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected_original = cudf::make_lists_column( + 3, int32s_col{0, 6, 12, 20}.release(), get_expected().release(), 0, {}); + auto const input = cudf::slice(*input_original, {1, 3})[0]; + auto const expected = cudf::slice(*expected_original, {1, 3})[0]; + + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } +} + +TEST_F(ListDistinctTest, InputListsOfNestedStructsHaveNull) +{ + auto const get_structs = [] { + auto grandchild1 = int32s_col{{ + 1, XXX, null, XXX, XXX, 1, 1, 1, // list1 + 1, 1, 1, 1, 2, 1, null, 2, // list2 + null, null, 2, 2, 3, 2, 3, 3 // list3 + }, + nulls_at({2, 14, 16, 17})}; + auto grandchild2 = strings_col{{ + // begin list1 + "Banana", + "YYY", /*NULL*/ + "Apple", + "XXX", /*NULL*/ + "YYY", /*NULL*/ + "Banana", + "Cherry", + "Kiwi", // end list1 + // begin list2 + "Bear", + "Duck", + "Cat", + "Dog", + "Panda", + "Bear", + "" /*NULL*/, + "Panda", // end list2 + // begin list3 + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÍÍÍÍÍ", + "ÁBC", + "" /*NULL*/, + "ÁÁÁ", + "ÁBC", + "XYZ" // end list3 + }, + nulls_at({14, 20})}; + // Inner struct rows 1, 3 and 4 are null; the outer struct wraps this single child and is + // built without a null mask of its own. + auto child1 = structs_col{{grandchild1, grandchild2}, nulls_at({1, 3, 4})}; + return structs_col{{child1}}; + }; + + auto const get_expected = [] { + auto grandchild1 = int32s_col{{// begin list1 + XXX, + null, + 1, + 1, + 1, // end list1 + // begin list2 + null, + 1, + 1, + 1, + 1, + 2, // end list2 + // begin list3 + null, + null, + 2, + 2, + 2, + 3, + 3, + 3}, // end list3 + nulls_at({1, 5, 11, 12})}; + auto grandchild2 = strings_col{{ + // begin list1 + "XXX" /*NULL*/, + "Apple", + "Banana", + "Cherry", + "Kiwi", // end list1 + // begin list2 + "" /*NULL*/, + "Bear", + "Cat", + "Dog", + "Duck", + "Panda", // end list2 + // begin list3 + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÁBC", + "ÁÁÁ", + "ÍÍÍÍÍ", + "", /*NULL*/ + "XYZ", + "ÁBC" // end list3 + }, + nulls_at({5, 16})}; + 
auto child1 = structs_col{{grandchild1, grandchild2}, nulls_at({0})}; + return structs_col{{child1}}; + }; + + // Test full columns. + { + auto const input = cudf::make_lists_column( + 3, int32s_col{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected = cudf::make_lists_column( + 3, int32s_col{0, 5, 11, 19}.release(), get_expected().release(), 0, {}); + + auto const results_sorted = distinct_sorted(*input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *results_sorted); + } + + // Test sliced columns. + { + auto const input_original = cudf::make_lists_column( + 3, int32s_col{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected_original = cudf::make_lists_column( + 3, int32s_col{0, 5, 11, 19}.release(), get_expected().release(), 0, {}); + auto const input = cudf::slice(*input_original, {1, 3})[0]; + auto const expected = cudf::slice(*expected_original, {1, 3})[0]; + + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } +} + +// Lists of structs whose second child is itself a lists column (int32, list of floats_lists). +// Every input list holds repeats of one struct row, so each expected list has exactly one row +// (offsets {0, 1, 2, 3, 4}) and the result of cudf::lists::distinct is compared directly, +// without going through distinct_sorted. +TEST_F(ListDistinctTest, InputListsOfStructsOfLists) +{ + auto const input = [] { + auto const get_structs = [] { + auto child1 = int32s_col{// begin list1 + 0, + 0, + 0, // end list1 + // begin list2 + 1, // end list2 + // begin list3 + 2, + 2, // end list3 + // begin list4 + 3, + 3, + 3}; // end list4 + auto child2 = floats_lists{// begin list1 + floats_lists{0, 1}, + floats_lists{0, 1}, + floats_lists{0, 1}, // end list1 + // begin list2 + floats_lists{3, 4, 5}, // end list2 + // begin list3 + floats_lists{}, + floats_lists{}, // end list3 + // begin list4 + floats_lists{6, 7}, + floats_lists{6, 7}, + floats_lists{6, 7}}; // end list4 + return structs_col{{child1, child2}}; + }; + + return cudf::make_lists_column( + 4, int32s_col{0, 3, 4, 6, 9}.release(), get_structs().release(), 0, {}); + }(); + + auto const expected = [] { + auto const get_structs = [] { + auto child1 = int32s_col{0, 1, 2, 3}; + auto child2 = + 
floats_lists{floats_lists{0, 1}, floats_lists{3, 4, 5}, floats_lists{}, floats_lists{6, 7}}; + return structs_col{{child1, child2}}; + }; + + return cudf::make_lists_column( + 4, int32s_col{0, 1, 2, 3, 4}.release(), get_structs().release(), 0, {}); + }(); + + auto const results = cudf::lists::distinct(lists_cv{*input}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *results); +}