From 535b91edd2448565296d4c2603aebcda2bb6da8c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 28 Feb 2022 18:01:12 -0500 Subject: [PATCH 01/17] Update drop_duplicates to work like std::unique --- .../stream_compaction/drop_duplicates.cpp | 2 +- cpp/include/cudf/detail/stream_compaction.hpp | 1 - cpp/include/cudf/stream_compaction.hpp | 13 +-- cpp/src/stream_compaction/drop_duplicates.cu | 106 +++++------------- .../drop_duplicates_tests.cpp | 80 +++++++------ 5 files changed, 77 insertions(+), 125 deletions(-) diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates.cpp b/cpp/benchmarks/stream_compaction/drop_duplicates.cpp index 317db92ae8b..4f38c791a9c 100644 --- a/cpp/benchmarks/stream_compaction/drop_duplicates.cpp +++ b/cpp/benchmarks/stream_compaction/drop_duplicates.cpp @@ -72,7 +72,7 @@ void nvbench_drop_duplicates(nvbench::state& state, state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { rmm::cuda_stream_view stream_view{launch.get_stream()}; auto result = cudf::detail::drop_duplicates( - input_table, {0}, Keep, cudf::null_equality::EQUAL, cudf::null_order::BEFORE, stream_view); + input_table, {0}, Keep, cudf::null_equality::EQUAL, stream_view); }); } diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 3d065556827..3d84e112609 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -71,7 +71,6 @@ std::unique_ptr drop_duplicates( std::vector const& keys, duplicate_keep_option keep, null_equality nulls_equal = null_equality::EQUAL, - null_order null_precedence = null_order::BEFORE, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 94039d81f31..78e67132766 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ 
b/cpp/include/cudf/stream_compaction.hpp @@ -214,13 +214,10 @@ enum class duplicate_keep_option { }; /** - * @brief Create a new table without duplicate rows. + * @brief Create a new table with consecutive duplicate rows removed. * - * The output table is sorted according to the lexicographic ordering of the data in the columns - * indexed by `keys`. - * - * Given an `input` table_view, each row is copied to output table if the corresponding - * row of `keys` columns is unique, where the definition of unique depends on the value of @p keep: + * Given an `input` table_view, one specific row from a group of equivalent elements is copied to + * output table depending on the value of @p keep: * - KEEP_FIRST: only the first of a sequence of duplicate rows is copied * - KEEP_LAST: only the last of a sequence of duplicate rows is copied * - KEEP_NONE: no duplicate rows are copied @@ -232,18 +229,16 @@ enum class duplicate_keep_option { * @param[in] keep keep first row, last row, or no rows of the found duplicates * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not * equal if null_equality::UNEQUAL - * @param[in] null_precedence flag to denote nulls should appear before or after non-null items * @param[in] mr Device memory resource used to allocate the returned table's device * memory * - * @return Table with sorted unique rows as specified by `keep`. + * @return Table with unique rows from each sequence of equivalent rows as specified by `keep`. */ std::unique_ptr
drop_duplicates( table_view const& input, std::vector const& keys, duplicate_keep_option keep, null_equality nulls_equal = null_equality::EQUAL, - null_order null_precedence = null_order::BEFORE, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index 2fd1f530b6d..7c443b5fc63 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -45,89 +45,41 @@ namespace cudf { namespace detail { -namespace { -/** - * @brief Create a column_view of index values which represent the row values - * without duplicates as per @p `keep` - * - * Given a `keys` table_view, each row index is copied to output `unique_indices`, if the - * corresponding row of `keys` table_view is unique, where the definition of unique depends on the - * value of @p keep: - * - KEEP_FIRST: only the first of a sequence of duplicate rows is copied - * - KEEP_LAST: only the last of a sequence of duplicate rows is copied - * - KEEP_NONE: only unique rows are kept - * - * @param[in] keys table_view to identify duplicate rows - * @param[out] unique_indices Column to store the index with unique rows - * @param[in] keep keep first entry, last entry, or no entries if duplicates found - * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, - * @param[in] null_precedence flag to denote nulls should appear before or after non-null items, - * nulls are not equal if null_equality::UNEQUAL - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * - * @return column_view column_view of unique row index as per specified `keep`, this is actually - * slice of `unique_indices`. 
- */ -column_view get_unique_ordered_indices(cudf::table_view const& keys, - cudf::mutable_column_view& unique_indices, - duplicate_keep_option keep, - null_equality nulls_equal, - null_order null_precedence, - rmm::cuda_stream_view stream) -{ - // Sort only the indices. - // Note that stable sort must be used to maintain the order of duplicate elements. - auto sorted_indices = stable_sorted_order( - keys, - std::vector{}, - std::vector{static_cast(keys.num_columns()), null_precedence}, - stream, - rmm::mr::get_current_device_resource()); - - // extract unique indices - auto device_input_table = cudf::table_device_view::create(keys, stream); - - auto comp = row_equality_comparator( - nullate::DYNAMIC{cudf::has_nulls(keys)}, *device_input_table, *device_input_table, nulls_equal); - auto result_end = cudf::detail::unique_copy(sorted_indices->view().begin(), - sorted_indices->view().end(), - unique_indices.begin(), - comp, - keep, - stream); - - return cudf::detail::slice(column_view(unique_indices), - 0, - thrust::distance(unique_indices.begin(), result_end)); -} -} // namespace - std::unique_ptr
drop_duplicates(table_view const& input, std::vector const& keys, duplicate_keep_option keep, null_equality nulls_equal, - null_order null_precedence, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) { - return empty_like(input); - } - - auto keys_view = input.select(keys); - - // The values will be filled into this column - auto unique_indices = cudf::make_numeric_column( - data_type{type_id::INT32}, keys_view.num_rows(), mask_state::UNALLOCATED, stream); - auto mutable_unique_indices_view = unique_indices->mutable_view(); - // This is just slice of `unique_indices` but with different size as per the - // keys_view has been processed in `get_unique_ordered_indices` - auto unique_indices_view = detail::get_unique_ordered_indices( - keys_view, mutable_unique_indices_view, keep, nulls_equal, null_precedence, stream); - - // run gather operation to establish new order + auto const num_rows = input.num_rows(); + if (num_rows == 0 or input.num_columns() == 0 or keys.empty()) { return empty_like(input); } + + auto unique_indices = + make_numeric_column(data_type{type_id::INT32}, num_rows, mask_state::UNALLOCATED, stream, mr); + auto mutable_view = mutable_column_device_view::create(*unique_indices, stream); + auto keys_view = input.select(keys); + auto keys_device_view = cudf::table_device_view::create(keys_view, stream); + auto row_equal = row_equality_comparator(nullate::DYNAMIC{cudf::has_nulls(keys_view)}, + *keys_device_view, + *keys_device_view, + nulls_equal); + + // get indices of unique rows + auto result_end = unique_copy(thrust::counting_iterator(0), + thrust::counting_iterator(num_rows), + mutable_view->begin(), + row_equal, + keep, + stream); + auto indices_view = + cudf::detail::slice(column_view(*unique_indices), + 0, + thrust::distance(mutable_view->begin(), result_end)); + + // gather unique rows and return return detail::gather(input, - unique_indices_view, + 
indices_view, out_of_bounds_policy::DONT_CHECK, detail::negative_index_policy::NOT_ALLOWED, stream, @@ -196,12 +148,10 @@ std::unique_ptr
drop_duplicates(table_view const& input, std::vector const& keys, duplicate_keep_option const keep, null_equality nulls_equal, - null_order null_precedence, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::drop_duplicates( - input, keys, keep, nulls_equal, null_precedence, rmm::cuda_stream_default, mr); + return detail::drop_duplicates(input, keys, keep, nulls_equal, rmm::cuda_stream_default, mr); } std::unique_ptr
unordered_drop_duplicates(table_view const& input, diff --git a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp index d49b8208094..9d60f09643c 100644 --- a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp +++ b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp @@ -45,14 +45,19 @@ TEST_F(DropDuplicatesCommon, StringKeyColumn) cudf::table_view input{{col, key_col}}; std::vector keys{1}; + cudf::test::fixed_width_column_wrapper exp_col{{5, 4, 5, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::strings_column_wrapper exp_key_col{{"all", "new", "all", "new", "the", "strings"}, + {1, 1, 1, 0, 1, 1}}; + cudf::table_view expected{{exp_col, exp_key_col}}; + + auto got = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got->view()); + cudf::test::fixed_width_column_wrapper exp_sort_col{{5, 5, 4, 1, 8}, {1, 1, 0, 1, 1}}; cudf::test::strings_column_wrapper exp_sort_key_col{{"new", "all", "new", "strings", "the"}, {0, 1, 1, 1, 1}}; cudf::table_view expected_sort{{exp_sort_col, exp_sort_key_col}}; - auto got_sort = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, got_sort->view()); - auto got_unordered = unordered_drop_duplicates(input, keys); auto key_view = got_unordered->select(keys.begin(), keys.end()); auto sorted_result = cudf::sort_by_key(got_unordered->view(), key_view); @@ -115,12 +120,12 @@ TEST_F(DropDuplicates, NonNullTable) cudf::table_view input{{col1, col2, col1_key, col2_key}}; std::vector keys{2, 3}; - // Keep the first duplicate row + // Keep the first of duplicate // The expected table would be sorted in ascending order with respect to keys - cudf::test::fixed_width_column_wrapper exp_col1_first{{5, 5, 5, 3, 8}}; - cudf::test::fixed_width_column_wrapper exp_col2_first{{4, 4, 4, 3, 9}}; - cudf::test::fixed_width_column_wrapper exp_col1_key_first{{9, 19, 20, 20, 21}}; 
- cudf::test::fixed_width_column_wrapper exp_col2_key_first{{21, 20, 19, 20, 9}}; + cudf::test::fixed_width_column_wrapper exp_col1_first{{5, 3, 5, 8, 5}}; + cudf::test::fixed_width_column_wrapper exp_col2_first{{4, 3, 4, 9, 4}}; + cudf::test::fixed_width_column_wrapper exp_col1_key_first{{20, 20, 19, 21, 9}}; + cudf::test::fixed_width_column_wrapper exp_col2_key_first{{19, 20, 20, 9, 21}}; cudf::table_view expected_first{ {exp_col1_first, exp_col2_first, exp_col1_key_first, exp_col2_key_first}}; @@ -128,11 +133,11 @@ TEST_F(DropDuplicates, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); - // Keep the last duplicate row - cudf::test::fixed_width_column_wrapper exp_col1_last{{5, 5, 4, 3, 8}}; - cudf::test::fixed_width_column_wrapper exp_col2_last{{4, 4, 5, 3, 9}}; - cudf::test::fixed_width_column_wrapper exp_col1_key_last{{9, 19, 20, 20, 21}}; - cudf::test::fixed_width_column_wrapper exp_col2_key_last{{21, 20, 19, 20, 9}}; + // Keep the last of duplicate + cudf::test::fixed_width_column_wrapper exp_col1_last{{4, 3, 5, 8, 5}}; + cudf::test::fixed_width_column_wrapper exp_col2_last{{5, 3, 4, 9, 4}}; + cudf::test::fixed_width_column_wrapper exp_col1_key_last{{20, 20, 19, 21, 9}}; + cudf::test::fixed_width_column_wrapper exp_col2_key_last{{19, 20, 20, 9, 21}}; cudf::table_view expected_last{ {exp_col1_last, exp_col2_last, exp_col1_key_last, exp_col2_key_last}}; @@ -141,10 +146,10 @@ TEST_F(DropDuplicates, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); // Keep no duplicate rows - cudf::test::fixed_width_column_wrapper exp_col1_unique{{5, 5, 3, 8}}; - cudf::test::fixed_width_column_wrapper exp_col2_unique{{4, 4, 3, 9}}; - cudf::test::fixed_width_column_wrapper exp_col1_key_unique{{9, 19, 20, 21}}; - cudf::test::fixed_width_column_wrapper exp_col2_key_unique{{21, 20, 20, 9}}; + cudf::test::fixed_width_column_wrapper exp_col1_unique{{3, 5, 8, 5}}; + cudf::test::fixed_width_column_wrapper exp_col2_unique{{3, 4, 
9, 4}}; + cudf::test::fixed_width_column_wrapper exp_col1_key_unique{{20, 19, 21, 9}}; + cudf::test::fixed_width_column_wrapper exp_col2_key_unique{{20, 20, 9, 21}}; cudf::table_view expected_unique{ {exp_col1_unique, exp_col2_unique, exp_col1_key_unique, exp_col2_key_unique}}; @@ -162,9 +167,10 @@ TEST_F(DropDuplicates, KeepFirstWithNull) std::vector keys{1}; // nulls are equal - cudf::test::fixed_width_column_wrapper exp_col_first_equal{{3, 5, 5, 8}, {1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_first_equal{{20, 19, 20, 21}, - {0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_col_first_equal{{5, 3, 5, 8, 1}, + {1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_first_equal{{20, 20, 19, 21, 19}, + {1, 0, 1, 1, 1}}; cudf::table_view expected_first_equal{{exp_col_first_equal, exp_key_col_first_equal}}; auto got_first_equal = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); @@ -172,10 +178,10 @@ TEST_F(DropDuplicates, KeepFirstWithNull) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first_equal, got_first_equal->view()); // nulls are unequal - cudf::test::fixed_width_column_wrapper exp_col_first_unequal{{3, 2, 5, 5, 8}, - {1, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_first_unequal{{20, 20, 19, 20, 21}, - {0, 0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_col_first_unequal{{5, 3, 2, 5, 8, 1}, + {1, 1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_first_unequal{ + {20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; cudf::table_view expected_first_unequal{{exp_col_first_unequal, exp_key_col_first_unequal}}; auto got_first_unequal = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::UNEQUAL); @@ -192,9 +198,10 @@ TEST_F(DropDuplicates, KeepLastWithNull) std::vector keys{1}; // nulls are equal - cudf::test::fixed_width_column_wrapper exp_col_last_equal{{2, 1, 4, 8}, {1, 1, 0, 1}}; - 
cudf::test::fixed_width_column_wrapper exp_key_col_last_equal{{20, 19, 20, 21}, - {0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_col_last_equal{{4, 2, 5, 8, 1}, + {0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_last_equal{{20, 20, 19, 21, 19}, + {1, 0, 1, 1, 1}}; cudf::table_view expected_last_equal{{exp_col_last_equal, exp_key_col_last_equal}}; auto got_last_equal = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::EQUAL); @@ -202,10 +209,10 @@ TEST_F(DropDuplicates, KeepLastWithNull) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last_equal, got_last_equal->view()); // nulls are unequal - cudf::test::fixed_width_column_wrapper exp_col_last_unequal{{3, 2, 1, 4, 8}, - {1, 1, 1, 0, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_last_unequal{{20, 20, 19, 20, 21}, - {0, 0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_col_last_unequal{{4, 3, 2, 5, 8, 1}, + {0, 1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_last_unequal{{20, 20, 20, 19, 21, 19}, + {1, 0, 0, 1, 1, 1}}; cudf::table_view expected_last_unequal{{exp_col_last_unequal, exp_key_col_last_unequal}}; auto got_last_unequal = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::UNEQUAL); @@ -222,8 +229,8 @@ TEST_F(DropDuplicates, KeepNoneWithNull) std::vector keys{1}; // nulls are equal - cudf::test::fixed_width_column_wrapper exp_col_unique_equal{{8}, {1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_unique_equal{{21}, {1}}; + cudf::test::fixed_width_column_wrapper exp_col_unique_equal{{5, 8, 1}, {1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_unique_equal{{19, 21, 19}, {1, 1, 1}}; cudf::table_view expected_unique_equal{{exp_col_unique_equal, exp_key_col_unique_equal}}; auto got_unique_equal = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE, null_equality::EQUAL); @@ -231,9 +238,10 @@ TEST_F(DropDuplicates, KeepNoneWithNull) 
CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique_equal, got_unique_equal->view()); // nulls are unequal - cudf::test::fixed_width_column_wrapper exp_col_unique_unequal{{3, 2, 8}, {1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_unique_unequal{{20, 20, 21}, - {0, 0, 1}}; + cudf::test::fixed_width_column_wrapper exp_col_unique_unequal{{3, 2, 5, 8, 1}, + {1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_unique_unequal{{20, 20, 19, 21, 19}, + {0, 0, 1, 1, 1}}; cudf::table_view expected_unique_unequal{{exp_col_unique_unequal, exp_key_col_unique_unequal}}; auto got_unique_unequal = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE, null_equality::UNEQUAL); From d17788e2e6ca71a94d3e98bf4813110f22b621ac Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 1 Mar 2022 17:10:39 -0500 Subject: [PATCH 02/17] Update drop_duplicates cython code --- python/cudf/cudf/_lib/cpp/sorting.pxd | 7 +++- python/cudf/cudf/_lib/stream_compaction.pyx | 37 ++++++++++++++++++++- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/sorting.pxd b/python/cudf/cudf/_lib/cpp/sorting.pxd index d614ef64ee2..3529c42829c 100644 --- a/python/cudf/cudf/_lib/cpp/sorting.pxd +++ b/python/cudf/cudf/_lib/cpp/sorting.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -27,6 +27,11 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: vector[libcudf_types.order] column_order, vector[libcudf_types.null_order] null_precedence) except + + cdef unique_ptr[column] stable_sorted_order( + table_view source_table, + vector[libcudf_types.order] column_order, + vector[libcudf_types.null_order] null_precedence) except + + cdef unique_ptr[column] rank( column_view input_view, rank_method method, diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index c4f885382f3..a863adb2324 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -8,7 +8,12 @@ from libcpp.utility cimport move from libcpp.vector cimport vector from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.copying cimport gather as cpp_gather, out_of_bounds_policy +from cudf._lib.cpp.sorting cimport ( + stable_sorted_order as cpp_stable_sorted_order, +) from cudf._lib.cpp.stream_compaction cimport ( apply_boolean_mask as cpp_apply_boolean_mask, drop_duplicates as cpp_drop_duplicates, @@ -21,7 +26,9 @@ from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport ( nan_policy, null_equality, + null_order, null_policy, + order, size_type, ) from cudf._lib.utils cimport ( @@ -144,13 +151,41 @@ def drop_duplicates(columns: list, if nulls_are_equal else null_equality.UNEQUAL ) + + cdef vector[order] column_order + column_order.reserve(cpp_keys.size()) + cdef vector[null_order] null_precedence + null_precedence.reserve(cpp_keys.size()) + + for _ in range(cpp_keys.size()): + column_order.push_back(order.ASCENDING) + null_precedence.push_back(null_order.BEFORE) + + cdef unique_ptr[column] gather_map + cdef unique_ptr[table] sorted_source_table cdef unique_ptr[table] c_result cdef 
table_view source_table_view = table_view_from_columns(columns) + cdef table_view keys_view = source_table_view.select(cpp_keys) + cdef out_of_bounds_policy policy = out_of_bounds_policy.DONT_CHECK with nogil: + gather_map = move( + cpp_stable_sorted_order( + keys_view, + column_order, + null_precedence + ) + ) + sorted_source_table = move( + cpp_gather( + source_table_view, + gather_map.get().view(), + policy + ) + ) c_result = move( cpp_drop_duplicates( - source_table_view, + sorted_source_table.get().view(), cpp_keys, cpp_keep_option, cpp_nulls_equal From bd9617b3e14ccb56678a90763c0185acb8d5e2eb Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 1 Mar 2022 17:59:46 -0500 Subject: [PATCH 03/17] Update drop_duplicates JNI code --- java/src/main/native/src/TableJni.cpp | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 1cf56da35da..39b425fb94e 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -3005,13 +3005,22 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_dropDuplicates( auto const keys_indices = std::vector(native_keys_indices.begin(), native_keys_indices.end()); - auto result = cudf::drop_duplicates( - *input, keys_indices, - keep_first ? cudf::duplicate_keep_option::KEEP_FIRST : - cudf::duplicate_keep_option::KEEP_LAST, - nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL, - nulls_before ? cudf::null_order::BEFORE : cudf::null_order::AFTER, - rmm::mr::get_current_device_resource()); + std::vector order(keys_indices.size(), cudf::order::ASCENDING); + std::vector null_precedence( + keys_indices.size(), nulls_before ? 
cudf::null_order::BEFORE : cudf::null_order::AFTER); + auto const gather_map = + cudf::stable_sorted_order(input->select(keys_indices), order, null_precedence, + rmm::mr::get_current_device_resource()); + auto const sorted_input = + cudf::gather(*input, gather_map->view(), cudf::out_of_bounds_policy::DONT_CHECK, + rmm::mr::get_current_device_resource()); + + auto result = cudf::drop_duplicates(sorted_input->view(), keys_indices, + keep_first ? cudf::duplicate_keep_option::KEEP_FIRST : + cudf::duplicate_keep_option::KEEP_LAST, + nulls_equal ? cudf::null_equality::EQUAL : + cudf::null_equality::UNEQUAL, + rmm::mr::get_current_device_resource()); return convert_table_for_return(env, result); } CATCH_STD(env, 0); From 616c095c0df9859a075fbdb36f26114058d3edff Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 1 Mar 2022 18:02:52 -0500 Subject: [PATCH 04/17] Minor comment updates --- cpp/tests/stream_compaction/drop_duplicates_tests.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp index 9d60f09643c..cd9a5725db2 100644 --- a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp +++ b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp @@ -120,7 +120,7 @@ TEST_F(DropDuplicates, NonNullTable) cudf::table_view input{{col1, col2, col1_key, col2_key}}; std::vector keys{2, 3}; - // Keep the first of duplicate + // Keep the first duplicate row // The expected table would be sorted in ascending order with respect to keys cudf::test::fixed_width_column_wrapper exp_col1_first{{5, 3, 5, 8, 5}}; cudf::test::fixed_width_column_wrapper exp_col2_first{{4, 3, 4, 9, 4}}; @@ -133,7 +133,7 @@ TEST_F(DropDuplicates, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); - // Keep the last of duplicate + // Keep the last duplicate row cudf::test::fixed_width_column_wrapper exp_col1_last{{4, 3, 5, 8, 5}}; 
cudf::test::fixed_width_column_wrapper exp_col2_last{{5, 3, 4, 9, 4}}; cudf::test::fixed_width_column_wrapper exp_col1_key_last{{20, 20, 19, 21, 9}}; From 6c01469981a02f4deaa00b323a0014bdc19a0603 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 2 Mar 2022 12:51:18 -0500 Subject: [PATCH 05/17] Update python/cudf/cudf/_lib/stream_compaction.pyx Co-authored-by: Bradley Dice --- python/cudf/cudf/_lib/stream_compaction.pyx | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index a863adb2324..892dfd2d6ed 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -152,14 +152,8 @@ def drop_duplicates(columns: list, else null_equality.UNEQUAL ) - cdef vector[order] column_order - column_order.reserve(cpp_keys.size()) - cdef vector[null_order] null_precedence - null_precedence.reserve(cpp_keys.size()) - - for _ in range(cpp_keys.size()): - column_order.push_back(order.ASCENDING) - null_precedence.push_back(null_order.BEFORE) + cdef vector[order] column_order(cpp_keys.size(), order.ASCENDING) + cdef vector[null_order] null_precedence(cpp_keys.size(), null_order.BEFORE) cdef unique_ptr[column] gather_map cdef unique_ptr[table] sorted_source_table From c752247fe56a88f9f484801158783ff6e00d0c52 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 2 Mar 2022 13:00:29 -0500 Subject: [PATCH 06/17] Address review: replace INT32 with type_to_id of size_type --- cpp/src/stream_compaction/drop_duplicates.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index 7c443b5fc63..f3b580749bf 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -55,8 +55,8 @@ std::unique_ptr
drop_duplicates(table_view const& input, auto const num_rows = input.num_rows(); if (num_rows == 0 or input.num_columns() == 0 or keys.empty()) { return empty_like(input); } - auto unique_indices = - make_numeric_column(data_type{type_id::INT32}, num_rows, mask_state::UNALLOCATED, stream, mr); + auto unique_indices = make_numeric_column( + data_type{type_to_id()}, num_rows, mask_state::UNALLOCATED, stream, mr); auto mutable_view = mutable_column_device_view::create(*unique_indices, stream); auto keys_view = input.select(keys); auto keys_device_view = cudf::table_device_view::create(keys_view, stream); From 910f5ac4aea1c2c912e6e03136297dff00924afb Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 2 Mar 2022 14:42:28 -0500 Subject: [PATCH 07/17] Minor cython correction --- python/cudf/cudf/_lib/stream_compaction.pyx | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 892dfd2d6ed..d4a9ab605a8 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -152,8 +152,18 @@ def drop_duplicates(columns: list, else null_equality.UNEQUAL ) - cdef vector[order] column_order(cpp_keys.size(), order.ASCENDING) - cdef vector[null_order] null_precedence(cpp_keys.size(), null_order.BEFORE) + cdef vector[order] column_order = ( + vector[order]( + cpp_keys.size(), + order.ASCENDING + ) + ) + cdef vector[null_order] null_precedence = ( + vector[null_order]( + cpp_keys.size(), + null_order.BEFORE + ) + ) cdef unique_ptr[column] gather_map cdef unique_ptr[table] sorted_source_table From 2563b391adb670eff5646bb218879bbefad45403 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 8 Mar 2022 10:19:09 -0500 Subject: [PATCH 08/17] Update Cython bindings to use stable_sort_by_key --- python/cudf/cudf/_lib/cpp/sorting.pxd | 5 +++-- python/cudf/cudf/_lib/stream_compaction.pyx | 24 ++++++--------------- 2 files 
changed, 9 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/sorting.pxd b/python/cudf/cudf/_lib/cpp/sorting.pxd index 3529c42829c..243b841ce4b 100644 --- a/python/cudf/cudf/_lib/cpp/sorting.pxd +++ b/python/cudf/cudf/_lib/cpp/sorting.pxd @@ -27,8 +27,9 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: vector[libcudf_types.order] column_order, vector[libcudf_types.null_order] null_precedence) except + - cdef unique_ptr[column] stable_sorted_order( - table_view source_table, + cdef unique_ptr[table] stable_sort_by_key( + const table_view& values, + const table_view& keys, vector[libcudf_types.order] column_order, vector[libcudf_types.null_order] null_precedence) except + diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index d4a9ab605a8..0c4b22e1c77 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -8,12 +8,8 @@ from libcpp.utility cimport move from libcpp.vector cimport vector from cudf._lib.column cimport Column -from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.copying cimport gather as cpp_gather, out_of_bounds_policy -from cudf._lib.cpp.sorting cimport ( - stable_sorted_order as cpp_stable_sorted_order, -) +from cudf._lib.cpp.sorting cimport stable_sort_by_key as cpp_stable_sort_by_key from cudf._lib.cpp.stream_compaction cimport ( apply_boolean_mask as cpp_apply_boolean_mask, drop_duplicates as cpp_drop_duplicates, @@ -165,28 +161,20 @@ def drop_duplicates(columns: list, ) ) - cdef unique_ptr[column] gather_map - cdef unique_ptr[table] sorted_source_table - cdef unique_ptr[table] c_result cdef table_view source_table_view = table_view_from_columns(columns) cdef table_view keys_view = source_table_view.select(cpp_keys) - cdef out_of_bounds_policy policy = out_of_bounds_policy.DONT_CHECK + cdef unique_ptr[table] sorted_source_table + cdef 
unique_ptr[table] c_result with nogil: - gather_map = move( - cpp_stable_sorted_order( + sorted_source_table = move( + cpp_stable_sort_by_key( + source_table_view, keys_view, column_order, null_precedence ) ) - sorted_source_table = move( - cpp_gather( - source_table_view, - gather_map.get().view(), - policy - ) - ) c_result = move( cpp_drop_duplicates( sorted_source_table.get().view(), From d5764c82f568c8f18361dc5e4d5bd950fe65902f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 8 Mar 2022 10:25:46 -0500 Subject: [PATCH 09/17] Update JNI bindings to use stable_sort_by_key --- java/src/main/native/src/TableJni.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index c3a5e41fb87..7e1d5487670 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -3047,12 +3047,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_dropDuplicates( std::vector order(keys_indices.size(), cudf::order::ASCENDING); std::vector null_precedence( keys_indices.size(), nulls_before ? cudf::null_order::BEFORE : cudf::null_order::AFTER); - auto const gather_map = - cudf::stable_sorted_order(input->select(keys_indices), order, null_precedence, - rmm::mr::get_current_device_resource()); auto const sorted_input = - cudf::gather(*input, gather_map->view(), cudf::out_of_bounds_policy::DONT_CHECK, - rmm::mr::get_current_device_resource()); + cudf::stable_sort_by_key(*input, input->select(keys_indices), order, null_precedence); auto result = cudf::drop_duplicates(sorted_input->view(), keys_indices, keep_first ? 
cudf::duplicate_keep_option::KEEP_FIRST : From 04d5a475056cd86ab19b24bd286dd87adb600ceb Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 8 Mar 2022 10:44:52 -0500 Subject: [PATCH 10/17] Update comments --- java/src/main/native/src/TableJni.cpp | 2 ++ python/cudf/cudf/_lib/stream_compaction.pyx | 3 +++ 2 files changed, 5 insertions(+) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 7e1d5487670..d426ac3ce62 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -3044,6 +3044,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_dropDuplicates( auto const keys_indices = std::vector(native_keys_indices.begin(), native_keys_indices.end()); + // cudf::drop_duplicates works like std::unique thus does NOT match the behavior of + // pandas.DataFrame.drop_duplicates. Users need to stable sort the input first and then drop. std::vector order(keys_indices.size(), cudf::order::ASCENDING); std::vector null_precedence( keys_indices.size(), nulls_before ? cudf::null_order::BEFORE : cudf::null_order::AFTER); diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 0c4b22e1c77..93c15791840 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -167,6 +167,9 @@ def drop_duplicates(columns: list, cdef unique_ptr[table] c_result with nogil: + # cudf::drop_duplicates works like std::unique thus does NOT match + # the behavior of pandas.DataFrame.drop_duplicates. Users need to + # stable sort the input first and then drop. 
sorted_source_table = move( cpp_stable_sort_by_key( source_table_view, From 0fd9f3ff8f04b021069044acc9bf124c333ff55d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 10 Mar 2022 16:12:43 -0500 Subject: [PATCH 11/17] Rename libcudf stream compaction APIs + update tests and benchmarks --- cpp/CMakeLists.txt | 4 +- cpp/benchmarks/CMakeLists.txt | 3 +- cpp/benchmarks/stream_compaction/distinct.cpp | 63 +++++ .../{drop_duplicates.cpp => unique.cpp} | 40 +-- cpp/include/cudf/detail/stream_compaction.hpp | 44 ++-- cpp/include/cudf/stream_compaction.hpp | 26 +- cpp/src/dictionary/add_keys.cu | 13 +- cpp/src/dictionary/detail/concatenate.cu | 2 +- cpp/src/dictionary/set_keys.cu | 4 +- cpp/src/lists/drop_list_duplicates.cu | 4 +- cpp/src/reductions/reductions.cpp | 2 +- .../{drop_duplicates.cu => distinct.cu} | 72 +----- cpp/src/stream_compaction/distinct_count.cu | 91 +------ cpp/src/stream_compaction/drop_duplicates.cuh | 93 ------- .../stream_compaction_common.cuh | 72 ++++++ cpp/src/stream_compaction/unique.cu | 99 ++++++++ cpp/src/stream_compaction/unique_count.cu | 138 ++++++++++ cpp/src/transform/encode.cu | 4 +- cpp/tests/CMakeLists.txt | 4 +- .../distinct_count_tests.cpp | 104 ++------ .../stream_compaction/distinct_tests.cpp | 140 ++++++++++ .../stream_compaction/unique_count_tests.cpp | 239 ++++++++++++++++++ ..._duplicates_tests.cpp => unique_tests.cpp} | 125 ++------- 23 files changed, 867 insertions(+), 519 deletions(-) create mode 100644 cpp/benchmarks/stream_compaction/distinct.cpp rename cpp/benchmarks/stream_compaction/{drop_duplicates.cpp => unique.cpp} (65%) rename cpp/src/stream_compaction/{drop_duplicates.cu => distinct.cu} (52%) delete mode 100644 cpp/src/stream_compaction/drop_duplicates.cuh create mode 100644 cpp/src/stream_compaction/unique.cu create mode 100644 cpp/src/stream_compaction/unique_count.cu create mode 100644 cpp/tests/stream_compaction/distinct_tests.cpp create mode 100644 cpp/tests/stream_compaction/unique_count_tests.cpp rename 
cpp/tests/stream_compaction/{drop_duplicates_tests.cpp => unique_tests.cpp} (62%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 825ea37c6ac..be89fdf7cf8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -400,10 +400,12 @@ add_library( src/sort/stable_sort_column.cu src/sort/stable_sort.cu src/stream_compaction/apply_boolean_mask.cu + src/stream_compaction/distinct.cu src/stream_compaction/distinct_count.cu - src/stream_compaction/drop_duplicates.cu src/stream_compaction/drop_nans.cu src/stream_compaction/drop_nulls.cu + src/stream_compaction/unique.cu + src/stream_compaction/unique_count.cu src/strings/attributes.cu src/strings/capitalize.cu src/strings/case.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 054410c3265..8e939482e45 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -144,7 +144,8 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- -ConfigureNVBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates.cpp) +ConfigureNVBench( + STREAM_COMPACTION_BENCH stream_compaction/distinct.cpp stream_compaction/unique.cpp) # ################################################################################################## # * join benchmark -------------------------------------------------------------------------------- diff --git a/cpp/benchmarks/stream_compaction/distinct.cpp b/cpp/benchmarks/stream_compaction/distinct.cpp new file mode 100644 index 00000000000..0eb353381e1 --- /dev/null +++ b/cpp/benchmarks/stream_compaction/distinct.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); + +template +void nvbench_distinct(nvbench::state& state, nvbench::type_list) +{ + cudf::rmm_pool_raii pool_raii; + + auto const num_rows = state.get_int64("NumRows"); + + cudf::test::UniformRandomGenerator rand_gen(0, 100); + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [&rand_gen](auto row) { return rand_gen.generate(); }); + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 100 == 0 ? 
false : true; }); + cudf::test::fixed_width_column_wrapper values(elements, elements + num_rows, valids); + + auto input_column = cudf::column_view(values); + auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + auto result = cudf::detail::distinct(input_table, {0}, cudf::null_equality::EQUAL, stream_view); + }); +} + +using data_type = nvbench::type_list; +using keep_option = nvbench::enum_type_list; + +NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type)) + .set_name("distinct") + .set_type_axes_names({"Type"}) + .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates.cpp b/cpp/benchmarks/stream_compaction/unique.cpp similarity index 65% rename from cpp/benchmarks/stream_compaction/drop_duplicates.cpp rename to cpp/benchmarks/stream_compaction/unique.cpp index 4f38c791a9c..5ac90a878e2 100644 --- a/cpp/benchmarks/stream_compaction/drop_duplicates.cpp +++ b/cpp/benchmarks/stream_compaction/unique.cpp @@ -47,8 +47,7 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); template -void nvbench_drop_duplicates(nvbench::state& state, - nvbench::type_list>) +void nvbench_unique(nvbench::state& state, nvbench::type_list>) { if constexpr (not std::is_same_v and Keep != cudf::duplicate_keep_option::KEEP_FIRST) { @@ -71,32 +70,8 @@ void nvbench_drop_duplicates(nvbench::state& state, state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { rmm::cuda_stream_view stream_view{launch.get_stream()}; - auto result = cudf::detail::drop_duplicates( - input_table, {0}, Keep, cudf::null_equality::EQUAL, stream_view); - }); -} - -template -void nvbench_unordered_drop_duplicates(nvbench::state& state, nvbench::type_list) -{ - 
cudf::rmm_pool_raii pool_raii; - - auto const num_rows = state.get_int64("NumRows"); - - cudf::test::UniformRandomGenerator rand_gen(0, 100); - auto elements = cudf::detail::make_counting_transform_iterator( - 0, [&rand_gen](auto row) { return rand_gen.generate(); }); - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 100 == 0 ? false : true; }); - cudf::test::fixed_width_column_wrapper values(elements, elements + num_rows, valids); - - auto input_column = cudf::column_view(values); - auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); - - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - rmm::cuda_stream_view stream_view{launch.get_stream()}; - auto result = cudf::detail::unordered_drop_duplicates( - input_table, {0}, cudf::null_equality::EQUAL, stream_view); + auto result = + cudf::detail::unique(input_table, {0}, Keep, cudf::null_equality::EQUAL, stream_view); }); } @@ -105,12 +80,7 @@ using keep_option = nvbench::enum_type_list; -NVBENCH_BENCH_TYPES(nvbench_drop_duplicates, NVBENCH_TYPE_AXES(data_type, keep_option)) - .set_name("drop_duplicates") +NVBENCH_BENCH_TYPES(nvbench_unique, NVBENCH_TYPE_AXES(data_type, keep_option)) + .set_name("unique") .set_type_axes_names({"Type", "KeepOption"}) .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); - -NVBENCH_BENCH_TYPES(nvbench_unordered_drop_duplicates, NVBENCH_TYPE_AXES(data_type)) - .set_name("unordered_drop_duplicates") - .set_type_axes_names({"Type"}) - .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 3d84e112609..eb7f837b0d0 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -62,11 +62,11 @@ std::unique_ptr
apply_boolean_mask( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::drop_duplicates + * @copydoc cudf::unique * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr
drop_duplicates( +std::unique_ptr
unique( table_view const& input, std::vector const& keys, duplicate_keep_option keep, @@ -75,11 +75,11 @@ std::unique_ptr
drop_duplicates( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::unordered_drop_duplicates + * @copydoc cudf::distinct * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr
unordered_drop_duplicates( +std::unique_ptr
distinct( table_view const& input, std::vector const& keys, null_equality nulls_equal = null_equality::EQUAL, @@ -87,42 +87,42 @@ std::unique_ptr
unordered_drop_duplicates( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy) + * @copydoc cudf::unique_count(column_view const&, null_policy, nan_policy) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -cudf::size_type distinct_count(column_view const& input, - null_policy null_handling, - nan_policy nan_handling, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); +cudf::size_type unique_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** - * @copydoc cudf::distinct_count(table_view const&, null_equality) + * @copydoc cudf::unique_count(table_view const&, null_equality) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -cudf::size_type distinct_count(table_view const& input, - null_equality nulls_equal = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); +cudf::size_type unique_count(table_view const& input, + null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** - * @copydoc cudf::unordered_distinct_count(column_view const&, null_policy, nan_policy) + * @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ -cudf::size_type unordered_distinct_count(column_view const& input, - null_policy null_handling, - nan_policy nan_handling, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); +cudf::size_type distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** - * @copydoc cudf::unordered_distinct_count(table_view const&, null_equality) + * @copydoc cudf::distinct_count(table_view const&, null_equality) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -cudf::size_type unordered_distinct_count(table_view const& input, - null_equality nulls_equal = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); +cudf::size_type distinct_count(table_view const& input, + null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 78e67132766..a8d1bbc28f5 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -234,7 +234,7 @@ enum class duplicate_keep_option { * * @return Table with unique rows from each sequence of equivalent rows as specified by `keep`. */ -std::unique_ptr
drop_duplicates( +std::unique_ptr
unique( table_view const& input, std::vector const& keys, duplicate_keep_option keep, @@ -242,7 +242,7 @@ std::unique_ptr
drop_duplicates( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Create a new table without duplicate rows with hash-based algorithms. + * @brief Create a new table without duplicate rows. * * Given an `input` table_view, each row is copied to output table if the corresponding * row of `keys` columns is unique. If duplicate rows are present, it is unspecified which @@ -259,7 +259,7 @@ std::unique_ptr
drop_duplicates( * * @return Table with unique rows in an unspecified order. */ -std::unique_ptr
unordered_drop_duplicates( +std::unique_ptr
distinct( table_view const& input, std::vector const& keys, null_equality nulls_equal = null_equality::EQUAL, @@ -280,9 +280,9 @@ std::unique_ptr
unordered_drop_duplicates( * * @return number of distinct consecutive groups in the column */ -cudf::size_type distinct_count(column_view const& input, - null_policy null_handling, - nan_policy nan_handling); +cudf::size_type unique_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling); /** * @brief Count the number of consecutive groups of equivalent elements in a table. @@ -293,8 +293,8 @@ cudf::size_type distinct_count(column_view const& input, * * @return number of distinct consecutive groups in the table */ -cudf::size_type distinct_count(table_view const& input, - null_equality nulls_equal = null_equality::EQUAL); +cudf::size_type unique_count(table_view const& input, + null_equality nulls_equal = null_equality::EQUAL); /** * @brief Count the unique elements in the column_view. @@ -316,9 +316,9 @@ cudf::size_type distinct_count(table_view const& input, * * @return number of unique elements */ -cudf::size_type unordered_distinct_count(column_view const& input, - null_policy null_handling, - nan_policy nan_handling); +cudf::size_type distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling); /** * @brief Count the unique rows in a table. @@ -329,8 +329,8 @@ cudf::size_type unordered_distinct_count(column_view const& input, * * @return number of unique rows in the table */ -cudf::size_type unordered_distinct_count(table_view const& input, - null_equality nulls_equal = null_equality::EQUAL); +cudf::size_type distinct_count(table_view const& input, + null_equality nulls_equal = null_equality::EQUAL); /** @} */ } // namespace cudf diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index 96b7fd48dc9..d612cc3f34e 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -60,13 +60,12 @@ std::unique_ptr add_keys( cudf::detail::concatenate(std::vector{old_keys, new_keys}, stream); // Drop duplicates from the combined keys, then sort the result. 
- // sort(unordered_drop_duplicates([a,b,c,d,f,d,b,e])) = [a,b,c,d,e,f] - auto table_keys = - cudf::detail::unordered_drop_duplicates(table_view{{combined_keys->view()}}, - std::vector{0}, // only one key column - null_equality::EQUAL, - stream, - mr); + // sort(distinct([a,b,c,d,f,d,b,e])) = [a,b,c,d,e,f] + auto table_keys = cudf::detail::distinct(table_view{{combined_keys->view()}}, + std::vector{0}, // only one key column + null_equality::EQUAL, + stream, + mr); std::vector column_order{order::ASCENDING}; std::vector null_precedence{null_order::AFTER}; // should be no nulls here auto sorted_keys = diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index 871a36f7d62..055a20e4cfd 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -215,7 +215,7 @@ std::unique_ptr concatenate(host_span columns, // sort keys and remove duplicates; // this becomes the keys child for the output dictionary column - auto table_keys = cudf::detail::unordered_drop_duplicates( + auto table_keys = cudf::detail::distinct( table_view{{all_keys->view()}}, std::vector{0}, null_equality::EQUAL, stream, mr); auto sorted_keys = cudf::detail::sort(table_keys->view(), std::vector{order::ASCENDING}, diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index 7783e5f8daf..357f4ced00e 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -119,9 +119,9 @@ std::unique_ptr set_keys( auto keys = dictionary_column.keys(); CUDF_EXPECTS(keys.type() == new_keys.type(), "keys types must match"); - // copy the keys -- use unordered_drop_duplicates to make sure they are unique, then + // copy the keys -- use cudf::distinct to make sure they are unique, then // sort the results. 
- auto unique_keys = cudf::detail::unordered_drop_duplicates( + auto unique_keys = cudf::detail::distinct( table_view{{new_keys}}, std::vector{0}, null_equality::EQUAL, stream, mr); auto sorted_keys = cudf::detail::sort(unique_keys->view(), std::vector{order::ASCENDING}, diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index b86e028192e..7d391578428 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include #include diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 234eaf51f96..5a676512dad 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -93,7 +93,7 @@ struct reduce_dispatch_functor { case aggregation::NUNIQUE: { auto nunique_agg = dynamic_cast(agg.get()); return make_fixed_width_scalar( - detail::unordered_distinct_count( + detail::distinct_count( col, nunique_agg->_null_handling, nan_policy::NAN_IS_VALID, stream), stream, mr); diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/distinct.cu similarity index 52% rename from cpp/src/stream_compaction/drop_duplicates.cu rename to cpp/src/stream_compaction/distinct.cu index f3b580749bf..1d5d41f3095 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/distinct.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include "drop_duplicates.cuh" #include "stream_compaction_common.cuh" #include "stream_compaction_common.hpp" @@ -45,52 +44,11 @@ namespace cudf { namespace detail { -std::unique_ptr
drop_duplicates(table_view const& input, - std::vector const& keys, - duplicate_keep_option keep, - null_equality nulls_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const num_rows = input.num_rows(); - if (num_rows == 0 or input.num_columns() == 0 or keys.empty()) { return empty_like(input); } - - auto unique_indices = make_numeric_column( - data_type{type_to_id()}, num_rows, mask_state::UNALLOCATED, stream, mr); - auto mutable_view = mutable_column_device_view::create(*unique_indices, stream); - auto keys_view = input.select(keys); - auto keys_device_view = cudf::table_device_view::create(keys_view, stream); - auto row_equal = row_equality_comparator(nullate::DYNAMIC{cudf::has_nulls(keys_view)}, - *keys_device_view, - *keys_device_view, - nulls_equal); - - // get indices of unique rows - auto result_end = unique_copy(thrust::counting_iterator(0), - thrust::counting_iterator(num_rows), - mutable_view->begin(), - row_equal, - keep, - stream); - auto indices_view = - cudf::detail::slice(column_view(*unique_indices), - 0, - thrust::distance(mutable_view->begin(), result_end)); - - // gather unique rows and return - return detail::gather(input, - indices_view, - out_of_bounds_policy::DONT_CHECK, - detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); -} - -std::unique_ptr
unordered_drop_duplicates(table_view const& input, - std::vector const& keys, - null_equality nulls_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr
distinct(table_view const& input, + std::vector const& keys, + null_equality nulls_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) { return empty_like(input); @@ -144,23 +102,13 @@ std::unique_ptr
unordered_drop_duplicates(table_view const& input, } // namespace detail -std::unique_ptr
drop_duplicates(table_view const& input, - std::vector const& keys, - duplicate_keep_option const keep, - null_equality nulls_equal, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::drop_duplicates(input, keys, keep, nulls_equal, rmm::cuda_stream_default, mr); -} - -std::unique_ptr
unordered_drop_duplicates(table_view const& input, - std::vector const& keys, - null_equality nulls_equal, - rmm::mr::device_memory_resource* mr) +std::unique_ptr
distinct(table_view const& input, + std::vector const& keys, + null_equality nulls_equal, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::unordered_drop_duplicates(input, keys, nulls_equal, rmm::cuda_stream_default, mr); + return detail::distinct(input, keys, nulls_equal, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 2c7488084b5..9ff507a15c5 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -118,44 +118,11 @@ struct has_nans { return false; } }; - -/** - * @brief A functor to be used along with device type_dispatcher to check if - * the row `index` of `column_device_view` is `NaN`. - */ -struct check_nan { - // Check if it's `NaN` for floating point type columns - template >* = nullptr> - __device__ inline bool operator()(column_device_view const& input, size_type index) - { - return std::isnan(input.data()[index]); - } - // Non-floating point type columns can never have `NaN`, so it will always return false. 
- template >* = nullptr> - __device__ inline bool operator()(column_device_view const&, size_type) - { - return false; - } -}; } // namespace cudf::size_type distinct_count(table_view const& keys, null_equality nulls_equal, rmm::cuda_stream_view stream) -{ - auto table_ptr = cudf::table_device_view::create(keys, stream); - row_equality_comparator comp( - nullate::DYNAMIC{cudf::has_nulls(keys)}, *table_ptr, *table_ptr, nulls_equal); - return thrust::count_if( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(keys.num_rows()), - [comp] __device__(cudf::size_type i) { return (i == 0 or not comp(i, i - 1)); }); -} - -cudf::size_type unordered_distinct_count(table_view const& keys, - null_equality nulls_equal, - rmm::cuda_stream_view stream) { auto table_ptr = cudf::table_device_view::create(keys, stream); auto const num_rows = table_ptr->num_rows(); @@ -190,51 +157,10 @@ cudf::size_type distinct_count(column_view const& input, null_policy null_handling, nan_policy nan_handling, rmm::cuda_stream_view stream) -{ - auto const num_rows = input.size(); - - if (num_rows == 0 or num_rows == input.null_count()) { return 0; } - - auto const count_nulls = null_handling == null_policy::INCLUDE; - auto const nan_is_null = nan_handling == nan_policy::NAN_IS_NULL; - auto const should_check_nan = cudf::is_floating_point(input.type()); - auto input_device_view = cudf::column_device_view::create(input, stream); - auto device_view = *input_device_view; - auto input_table_view = table_view{{input}}; - auto table_ptr = cudf::table_device_view::create(input_table_view, stream); - row_equality_comparator comp(nullate::DYNAMIC{cudf::has_nulls(input_table_view)}, - *table_ptr, - *table_ptr, - null_equality::EQUAL); - - return thrust::count_if( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(num_rows), - [count_nulls, nan_is_null, should_check_nan, device_view, comp] __device__(cudf::size_type i) { - auto const 
is_null = device_view.is_null(i); - auto const is_nan = nan_is_null and should_check_nan and - cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i); - if (not count_nulls and (is_null or (nan_is_null and is_nan))) { return false; } - if (i == 0) { return true; } - if (count_nulls and nan_is_null and (is_nan or is_null)) { - auto const prev_is_nan = - should_check_nan and - cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i - 1); - return not(prev_is_nan or device_view.is_null(i - 1)); - } - return not comp(i, i - 1); - }); -} - -cudf::size_type unordered_distinct_count(column_view const& input, - null_policy null_handling, - nan_policy nan_handling, - rmm::cuda_stream_view stream) { if (0 == input.size() or input.null_count() == input.size()) { return 0; } - auto count = detail::unordered_distinct_count(table_view{{input}}, null_equality::EQUAL, stream); + auto count = detail::distinct_count(table_view{{input}}, null_equality::EQUAL, stream); // Check for nulls. If the null policy is EXCLUDE and null values were found, // we decrement the count. 
@@ -268,19 +194,4 @@ cudf::size_type distinct_count(table_view const& input, null_equality nulls_equa CUDF_FUNC_RANGE(); return detail::distinct_count(input, nulls_equal); } - -cudf::size_type unordered_distinct_count(column_view const& input, - null_policy null_handling, - nan_policy nan_handling) -{ - CUDF_FUNC_RANGE(); - return detail::unordered_distinct_count(input, null_handling, nan_handling); -} - -cudf::size_type unordered_distinct_count(table_view const& input, null_equality nulls_equal) -{ - CUDF_FUNC_RANGE(); - return detail::unordered_distinct_count(input, nulls_equal); -} - } // namespace cudf diff --git a/cpp/src/stream_compaction/drop_duplicates.cuh b/cpp/src/stream_compaction/drop_duplicates.cuh deleted file mode 100644 index 3f8ae9507c2..00000000000 --- a/cpp/src/stream_compaction/drop_duplicates.cuh +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include - -#include -#include - -namespace cudf { -namespace detail { -template -struct unique_copy_fn { - /** - * @brief Functor for unique_copy() - * - * The logic here is equivalent to: - * @code - * ((keep == duplicate_keep_option::KEEP_LAST) || - * (i == 0 || !comp(iter[i], iter[i - 1]))) && - * ((keep == duplicate_keep_option::KEEP_FIRST) || - * (i == last_index || !comp(iter[i], iter[i + 1]))) - * @endcode - * - * It is written this way so that the `comp` comparator - * function appears only once minimizing the inlining - * required and reducing the compile time. - */ - __device__ bool operator()(size_type i) - { - size_type boundary = 0; - size_type offset = 1; - auto keep_option = duplicate_keep_option::KEEP_LAST; - do { - if ((keep != keep_option) && (i != boundary) && comp(iter[i], iter[i - offset])) { - return false; - } - keep_option = duplicate_keep_option::KEEP_FIRST; - boundary = last_index; - offset = -offset; - } while (offset < 0); - return true; - } - - InputIterator iter; - duplicate_keep_option const keep; - BinaryPredicate comp; - size_type const last_index; -}; - -/** - * @brief Copies unique elements from the range [first, last) to output iterator `output`. - * - * In a consecutive group of duplicate elements, depending on parameter `keep`, - * only the first element is copied, or the last element is copied or neither is copied. - * - * @return End of the range to which the elements are copied. 
- */ -template -OutputIterator unique_copy(InputIterator first, - InputIterator last, - OutputIterator output, - BinaryPredicate comp, - duplicate_keep_option const keep, - rmm::cuda_stream_view stream) -{ - size_type const last_index = thrust::distance(first, last) - 1; - return thrust::copy_if( - rmm::exec_policy(stream), - first, - last, - thrust::counting_iterator(0), - output, - unique_copy_fn{first, keep, comp, last_index}); -} - -} // namespace detail -} // namespace cudf diff --git a/cpp/src/stream_compaction/stream_compaction_common.cuh b/cpp/src/stream_compaction/stream_compaction_common.cuh index 8ba9223a1bc..1b0ef1b9e55 100644 --- a/cpp/src/stream_compaction/stream_compaction_common.cuh +++ b/cpp/src/stream_compaction/stream_compaction_common.cuh @@ -17,6 +17,14 @@ #include "stream_compaction_common.hpp" +#include + +#include +#include + +#include +#include + namespace cudf { namespace detail { @@ -54,5 +62,69 @@ class row_validity { bitmask_type const* _row_bitmask; }; +template +struct unique_copy_fn { + /** + * @brief Functor for unique_copy() + * + * The logic here is equivalent to: + * @code + * ((keep == duplicate_keep_option::KEEP_LAST) || + * (i == 0 || !comp(iter[i], iter[i - 1]))) && + * ((keep == duplicate_keep_option::KEEP_FIRST) || + * (i == last_index || !comp(iter[i], iter[i + 1]))) + * @endcode + * + * It is written this way so that the `comp` comparator + * function appears only once minimizing the inlining + * required and reducing the compile time. 
+ */ + __device__ bool operator()(size_type i) + { + size_type boundary = 0; + size_type offset = 1; + auto keep_option = duplicate_keep_option::KEEP_LAST; + do { + if ((keep != keep_option) && (i != boundary) && comp(iter[i], iter[i - offset])) { + return false; + } + keep_option = duplicate_keep_option::KEEP_FIRST; + boundary = last_index; + offset = -offset; + } while (offset < 0); + return true; + } + + InputIterator iter; + duplicate_keep_option const keep; + BinaryPredicate comp; + size_type const last_index; +}; + +/** + * @brief Copies unique elements from the range [first, last) to output iterator `output`. + * + * In a consecutive group of duplicate elements, depending on parameter `keep`, + * only the first element is copied, or the last element is copied or neither is copied. + * + * @return End of the range to which the elements are copied. + */ +template +OutputIterator unique_copy(InputIterator first, + InputIterator last, + OutputIterator output, + BinaryPredicate comp, + duplicate_keep_option const keep, + rmm::cuda_stream_view stream) +{ + size_type const last_index = thrust::distance(first, last) - 1; + return thrust::copy_if( + rmm::exec_policy(stream), + first, + last, + thrust::counting_iterator(0), + output, + unique_copy_fn{first, keep, comp, last_index}); +} } // namespace detail } // namespace cudf diff --git a/cpp/src/stream_compaction/unique.cu b/cpp/src/stream_compaction/unique.cu new file mode 100644 index 00000000000..e9015afbf61 --- /dev/null +++ b/cpp/src/stream_compaction/unique.cu @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "stream_compaction_common.cuh" +#include "stream_compaction_common.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace detail { +std::unique_ptr
unique(table_view const& input, + std::vector const& keys, + duplicate_keep_option keep, + null_equality nulls_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_rows = input.num_rows(); + if (num_rows == 0 or input.num_columns() == 0 or keys.empty()) { return empty_like(input); } + + auto unique_indices = make_numeric_column( + data_type{type_to_id()}, num_rows, mask_state::UNALLOCATED, stream, mr); + auto mutable_view = mutable_column_device_view::create(*unique_indices, stream); + auto keys_view = input.select(keys); + auto keys_device_view = cudf::table_device_view::create(keys_view, stream); + auto row_equal = row_equality_comparator(nullate::DYNAMIC{cudf::has_nulls(keys_view)}, + *keys_device_view, + *keys_device_view, + nulls_equal); + + // get indices of unique rows + auto result_end = unique_copy(thrust::counting_iterator(0), + thrust::counting_iterator(num_rows), + mutable_view->begin(), + row_equal, + keep, + stream); + auto indices_view = + cudf::detail::slice(column_view(*unique_indices), + 0, + thrust::distance(mutable_view->begin(), result_end)); + + // gather unique rows and return + return detail::gather(input, + indices_view, + out_of_bounds_policy::DONT_CHECK, + detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); +} +} // namespace detail + +std::unique_ptr
unique(table_view const& input, + std::vector const& keys, + duplicate_keep_option const keep, + null_equality nulls_equal, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::unique(input, keys, keep, nulls_equal, rmm::cuda_stream_default, mr); +} + +} // namespace cudf diff --git a/cpp/src/stream_compaction/unique_count.cu b/cpp/src/stream_compaction/unique_count.cu new file mode 100644 index 00000000000..1b0a83ede80 --- /dev/null +++ b/cpp/src/stream_compaction/unique_count.cu @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "stream_compaction_common.cuh" +#include "stream_compaction_common.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cudf { +namespace detail { +namespace { +/** + * @brief A functor to be used along with device type_dispatcher to check if + * the row `index` of `column_device_view` is `NaN`. + */ +struct check_nan { + // Check if it's `NaN` for floating point type columns + template >* = nullptr> + __device__ inline bool operator()(column_device_view const& input, size_type index) + { + return std::isnan(input.data()[index]); + } + // Non-floating point type columns can never have `NaN`, so it will always return false. 
+ template >* = nullptr> + __device__ inline bool operator()(column_device_view const&, size_type) + { + return false; + } +}; +} // namespace + +cudf::size_type unique_count(table_view const& keys, + null_equality nulls_equal, + rmm::cuda_stream_view stream) +{ + auto table_ptr = cudf::table_device_view::create(keys, stream); + row_equality_comparator comp( + nullate::DYNAMIC{cudf::has_nulls(keys)}, *table_ptr, *table_ptr, nulls_equal); + return thrust::count_if( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(keys.num_rows()), + [comp] __device__(cudf::size_type i) { return (i == 0 or not comp(i, i - 1)); }); +} + +cudf::size_type unique_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling, + rmm::cuda_stream_view stream) +{ + auto const num_rows = input.size(); + + if (num_rows == 0 or num_rows == input.null_count()) { return 0; } + + auto const count_nulls = null_handling == null_policy::INCLUDE; + auto const nan_is_null = nan_handling == nan_policy::NAN_IS_NULL; + auto const should_check_nan = cudf::is_floating_point(input.type()); + auto input_device_view = cudf::column_device_view::create(input, stream); + auto device_view = *input_device_view; + auto input_table_view = table_view{{input}}; + auto table_ptr = cudf::table_device_view::create(input_table_view, stream); + row_equality_comparator comp(nullate::DYNAMIC{cudf::has_nulls(input_table_view)}, + *table_ptr, + *table_ptr, + null_equality::EQUAL); + + return thrust::count_if( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(num_rows), + [count_nulls, nan_is_null, should_check_nan, device_view, comp] __device__(cudf::size_type i) { + auto const is_null = device_view.is_null(i); + auto const is_nan = nan_is_null and should_check_nan and + cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i); + if (not count_nulls and (is_null or (nan_is_null and is_nan))) { return false; } + if 
(i == 0) { return true; } + if (count_nulls and nan_is_null and (is_nan or is_null)) { + auto const prev_is_nan = + should_check_nan and + cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i - 1); + return not(prev_is_nan or device_view.is_null(i - 1)); + } + return not comp(i, i - 1); + }); +} +} // namespace detail + +cudf::size_type unique_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling) +{ + CUDF_FUNC_RANGE(); + return detail::unique_count(input, null_handling, nan_handling); +} + +cudf::size_type unique_count(table_view const& input, null_equality nulls_equal) +{ + CUDF_FUNC_RANGE(); + return detail::unique_count(input, nulls_equal); +} + +} // namespace cudf diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index 405c83ab872..7236fe3882c 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -46,8 +46,8 @@ std::pair, std::unique_ptr> encode( std::vector drop_keys(num_cols); std::iota(drop_keys.begin(), drop_keys.end(), 0); - auto unique_keys = cudf::detail::unordered_drop_duplicates( - input_table, drop_keys, null_equality::EQUAL, stream, mr); + auto unique_keys = + cudf::detail::distinct(input_table, drop_keys, null_equality::EQUAL, stream, mr); std::vector column_order(num_cols, order::ASCENDING); std::vector null_precedence(num_cols, null_order::AFTER); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index b1c23749c4b..56cfca486c6 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -316,9 +316,11 @@ ConfigureTest( STREAM_COMPACTION_TEST stream_compaction/apply_boolean_mask_tests.cpp stream_compaction/distinct_count_tests.cpp + stream_compaction/distinct_tests.cpp stream_compaction/drop_nulls_tests.cpp stream_compaction/drop_nans_tests.cpp - stream_compaction/drop_duplicates_tests.cpp + stream_compaction/unique_count_tests.cpp + stream_compaction/unique_tests.cpp ) # 
################################################################################################## diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp index 78b52db5255..0529539c4b2 100644 --- a/cpp/tests/stream_compaction/distinct_count_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -38,12 +38,12 @@ constexpr int32_t XXX{70}; // Mark for null elements constexpr int32_t YYY{3}; // Mark for null elements template -struct DistinctCountCommon : public cudf::test::BaseFixture { +struct TypedDistinctCount : public cudf::test::BaseFixture { }; -TYPED_TEST_SUITE(DistinctCountCommon, cudf::test::NumericTypes); +TYPED_TEST_SUITE(TypedDistinctCount, cudf::test::NumericTypes); -TYPED_TEST(DistinctCountCommon, NoNull) +TYPED_TEST(TypedDistinctCount, NoNull) { using T = TypeParam; @@ -55,19 +55,11 @@ TYPED_TEST(DistinctCountCommon, NoNull) // explicit instantiation to one particular type (`double`) to reduce build time auto const expected = static_cast(std::set(input.begin(), input.end()).size()); - EXPECT_EQ( - expected, - cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); - - // explicit instantiation to one particular type (`double`) to reduce build time - std::vector input_data(input.begin(), input.end()); - auto const new_end = std::unique(input_data.begin(), input_data.end()); - auto const gold_ordered = std::distance(input_data.begin(), new_end); - EXPECT_EQ(gold_ordered, + EXPECT_EQ(expected, cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); } -TYPED_TEST(DistinctCountCommon, TableNoNull) +TYPED_TEST(TypedDistinctCount, TableNoNull) { using T = TypeParam; @@ -88,11 +80,7 @@ TYPED_TEST(DistinctCountCommon, TableNoNull) auto const expected = static_cast( std::set>(pair_input.begin(), pair_input.end()).size()); - EXPECT_EQ(expected, cudf::unordered_distinct_count(input_table, null_equality::EQUAL)); - - auto 
const new_end = std::unique(pair_input.begin(), pair_input.end()); - auto const gold_ordered = std::distance(pair_input.begin(), new_end); - EXPECT_EQ(gold_ordered, cudf::distinct_count(input_table, null_equality::EQUAL)); + EXPECT_EQ(expected, cudf::distinct_count(input_table, null_equality::EQUAL)); } struct DistinctCount : public cudf::test::BaseFixture { @@ -112,14 +100,8 @@ TEST_F(DistinctCount, WithNull) // explicit instantiation to one particular type (`double`) to reduce build time auto const expected = static_cast(std::set(input.begin(), input.end()).size()); - EXPECT_EQ( - expected, - cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); - - auto const new_end = std::unique(input.begin(), input.end()); - auto const gold_ordered = std::distance(input.begin(), new_end) - 3; - EXPECT_EQ(gold_ordered, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); } TEST_F(DistinctCount, IgnoringNull) @@ -136,15 +118,8 @@ TEST_F(DistinctCount, IgnoringNull) auto const expected = static_cast(std::set(input.begin(), input.end()).size()); // Removing 2 from expected to remove count for `XXX` and `YYY` - EXPECT_EQ( - expected - 2, - cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); - - auto const new_end = std::unique(input.begin(), input.end()); - // -1 since `YYY, YYY, XXX` is in the same group of equivalent rows - auto const gold_ordered = std::distance(input.begin(), new_end) - 1; - EXPECT_EQ(gold_ordered, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ(expected - 2, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); } TEST_F(DistinctCount, WithNansAndNull) @@ -160,13 +135,7 @@ TEST_F(DistinctCount, WithNansAndNull) auto const expected = static_cast(std::set(input.begin(), 
input.end()).size()); - EXPECT_EQ( - expected + 1, // +1 since `NAN` is not in std::set - cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); - - auto const new_end = std::unique(input.begin(), input.end()); - auto const gold_ordered = std::distance(input.begin(), new_end); - EXPECT_EQ(gold_ordered, + EXPECT_EQ(expected + 1, // +1 since `NAN` is not in std::set cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); input = {NAN, NAN, XXX}; @@ -174,9 +143,6 @@ TEST_F(DistinctCount, WithNansAndNull) input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; constexpr auto expected_all_nan = 2; - EXPECT_EQ( - expected_all_nan, - cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); EXPECT_EQ(expected_all_nan, cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); } @@ -191,9 +157,6 @@ TEST_F(DistinctCount, WithNansOnly) cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; constexpr auto expected = 5; - EXPECT_EQ( - expected, - cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); EXPECT_EQ(expected, cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); @@ -202,9 +165,6 @@ TEST_F(DistinctCount, WithNansOnly) input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; constexpr auto expected_all_nan = 1; - EXPECT_EQ( - expected_all_nan, - cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); EXPECT_EQ(expected_all_nan, cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); } @@ -219,9 +179,6 @@ TEST_F(DistinctCount, NansAsNullWithNoNull) cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; constexpr auto expected = 5; - EXPECT_EQ( - expected, - 
cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); EXPECT_EQ(expected, cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); @@ -230,9 +187,6 @@ TEST_F(DistinctCount, NansAsNullWithNoNull) input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; constexpr auto expected_all_nan = 1; - EXPECT_EQ( - expected_all_nan, - cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); EXPECT_EQ(expected_all_nan, cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); } @@ -247,9 +201,6 @@ TEST_F(DistinctCount, NansAsNullWithNull) cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; constexpr auto expected = 4; - EXPECT_EQ( - expected, - cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); EXPECT_EQ(expected, cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); @@ -258,9 +209,6 @@ TEST_F(DistinctCount, NansAsNullWithNull) input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; constexpr auto expected_all_null = 1; - EXPECT_EQ( - expected_all_null, - cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); EXPECT_EQ(expected_all_null, cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); } @@ -275,9 +223,6 @@ TEST_F(DistinctCount, NansAsNullWithIgnoreNull) cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; constexpr auto expected = 3; - EXPECT_EQ( - expected, - cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); EXPECT_EQ(expected, cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); @@ -286,9 +231,6 @@ TEST_F(DistinctCount, NansAsNullWithIgnoreNull) input_col = 
cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; constexpr auto expected_all_nan = 0; - EXPECT_EQ( - expected_all_nan, - cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); EXPECT_EQ(expected_all_nan, cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); } @@ -300,9 +242,6 @@ TEST_F(DistinctCount, EmptyColumn) cudf::test::fixed_width_column_wrapper input_col{}; constexpr auto expected = 0; - EXPECT_EQ( - expected, - cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); EXPECT_EQ(expected, cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); } @@ -315,9 +254,8 @@ TEST_F(DistinctCount, StringColumnWithNull) cudf::size_type const expected = (std::vector{"", "this", "is", "This", "a", "column", "of", "strings"}).size(); - EXPECT_EQ( - expected, - cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); } TEST_F(DistinctCount, TableWithNull) @@ -328,8 +266,8 @@ TEST_F(DistinctCount, TableWithNull) {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; cudf::table_view input{{col1, col2}}; - EXPECT_EQ(8, cudf::unordered_distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); + EXPECT_EQ(8, cudf::distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL)); } TEST_F(DistinctCount, EmptyColumnedTable) @@ -338,8 +276,8 @@ TEST_F(DistinctCount, EmptyColumnedTable) cudf::table_view input(cols); - EXPECT_EQ(0, cudf::unordered_distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(0, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); + EXPECT_EQ(0, cudf::distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(0, cudf::distinct_count(input, 
null_equality::UNEQUAL)); } TEST_F(DistinctCount, TableMixedTypes) @@ -352,8 +290,8 @@ TEST_F(DistinctCount, TableMixedTypes) {1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0}}; cudf::table_view input{{col1, col2, col3}}; - EXPECT_EQ(9, cudf::unordered_distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); + EXPECT_EQ(9, cudf::distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL)); } TEST_F(DistinctCount, TableWithStringColumnWithNull) @@ -365,6 +303,6 @@ TEST_F(DistinctCount, TableWithStringColumnWithNull) {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; cudf::table_view input{{col1, col2}}; - EXPECT_EQ(9, cudf::unordered_distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); + EXPECT_EQ(9, cudf::distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL)); } diff --git a/cpp/tests/stream_compaction/distinct_tests.cpp b/cpp/tests/stream_compaction/distinct_tests.cpp new file mode 100644 index 00000000000..866239efc9d --- /dev/null +++ b/cpp/tests/stream_compaction/distinct_tests.cpp @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +using cudf::nan_policy; +using cudf::null_equality; +using cudf::null_policy; + +struct Distinct : public cudf::test::BaseFixture { +}; + +TEST_F(Distinct, StringKeyColumn) +{ + cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 5, 5, 8, 1}, {1, 0, 0, 1, 1, 1, 1}}; + cudf::test::strings_column_wrapper key_col{{"all", "new", "new", "all", "new", "the", "strings"}, + {1, 1, 1, 1, 0, 1, 1}}; + cudf::table_view input{{col, key_col}}; + std::vector keys{1}; + + cudf::test::fixed_width_column_wrapper exp_sort_col{{5, 5, 4, 1, 8}, {1, 1, 0, 1, 1}}; + cudf::test::strings_column_wrapper exp_sort_key_col{{"new", "all", "new", "strings", "the"}, + {0, 1, 1, 1, 1}}; + cudf::table_view expected_sort{{exp_sort_col, exp_sort_key_col}}; + + auto got_unordered = distinct(input, keys); + auto key_view = got_unordered->select(keys.begin(), keys.end()); + auto sorted_result = cudf::sort_by_key(got_unordered->view(), key_view); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, sorted_result->view()); +} + +TEST_F(Distinct, EmptyInputTable) +{ + cudf::test::fixed_width_column_wrapper col(std::initializer_list{}); + cudf::table_view input{{col}}; + std::vector keys{1, 2}; + + auto got = distinct(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); +} + +TEST_F(Distinct, NoColumnInputTable) +{ + cudf::table_view input{std::vector()}; + std::vector keys{1, 2}; + + auto got = distinct(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); +} + +TEST_F(Distinct, EmptyKeys) +{ + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper empty_col{}; + cudf::table_view input{{col}}; + std::vector keys{}; + + auto got = distinct(input, keys, null_equality::EQUAL); + 
CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); +} + +TEST_F(Distinct, NonNullTable) +{ + cudf::test::fixed_width_column_wrapper col1{{6, 6, 3, 5, 8, 5}}; + cudf::test::fixed_width_column_wrapper col2{{6, 6, 3, 4, 9, 4}}; + cudf::test::fixed_width_column_wrapper col1_key{{20, 20, 20, 19, 21, 9}}; + cudf::test::fixed_width_column_wrapper col2_key{{19, 19, 20, 20, 9, 21}}; + + cudf::table_view input{{col1, col2, col1_key, col2_key}}; + std::vector keys{2, 3}; + + // The expected table would be sorted in ascending order with respect to keys + cudf::test::fixed_width_column_wrapper exp_col1{{5, 5, 6, 3, 8}}; + cudf::test::fixed_width_column_wrapper exp_col2{{4, 4, 6, 3, 9}}; + cudf::test::fixed_width_column_wrapper exp_col1_key{{9, 19, 20, 20, 21}}; + cudf::test::fixed_width_column_wrapper exp_col2_key{{21, 20, 19, 20, 9}}; + cudf::table_view expected{{exp_col1, exp_col2, exp_col1_key, exp_col2_key}}; + + auto result = distinct(input, keys); + auto key_view = result->select(keys.begin(), keys.end()); + auto sorted_result = cudf::sort_by_key(result->view(), key_view); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, sorted_result->view()); +} + +TEST_F(Distinct, WithNull) +{ + cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 1, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; + cudf::table_view input{{col, key}}; + std::vector keys{1}; + + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_equal_col{{4, 1, 5, 8}, {0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_equal_key_col{{20, 19, 20, 21}, {0, 1, 1, 1}}; + cudf::table_view expected_equal{{exp_equal_col, exp_equal_key_col}}; + auto res_equal = distinct(input, keys, null_equality::EQUAL); + auto equal_keys = res_equal->select(keys.begin(), keys.end()); + auto sorted_equal = cudf::sort_by_key(res_equal->view(), equal_keys); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_equal, sorted_equal->view()); + + 
// nulls are unequal + cudf::test::fixed_width_column_wrapper exp_unequal_col{{4, 1, 4, 5, 8}, {0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_unequal_key_col{{20, 19, 20, 20, 21}, + {0, 1, 0, 1, 1}}; + cudf::table_view expected_unequal{{exp_unequal_col, exp_unequal_key_col}}; + auto res_unequal = distinct(input, keys, null_equality::UNEQUAL); + auto sorted_unequal = cudf::sort(res_unequal->view()); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unequal, sorted_unequal->view()); +} diff --git a/cpp/tests/stream_compaction/unique_count_tests.cpp b/cpp/tests/stream_compaction/unique_count_tests.cpp new file mode 100644 index 00000000000..3285cd1a711 --- /dev/null +++ b/cpp/tests/stream_compaction/unique_count_tests.cpp @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +using cudf::nan_policy; +using cudf::null_equality; +using cudf::null_policy; + +constexpr int32_t XXX{70}; // Mark for null elements +constexpr int32_t YYY{3}; // Mark for null elements + +template +struct TypedUniqueCount : public cudf::test::BaseFixture { +}; + +TYPED_TEST_SUITE(TypedUniqueCount, cudf::test::NumericTypes); + +TYPED_TEST(TypedUniqueCount, NoNull) +{ + using T = TypeParam; + + auto const input = cudf::test::make_type_param_vector( + {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); + + // explicit instantiation to one particular type (`double`) to reduce build time + std::vector input_data(input.begin(), input.end()); + auto const new_end = std::unique(input_data.begin(), input_data.end()); + auto const gold = std::distance(input_data.begin(), new_end); + EXPECT_EQ(gold, cudf::unique_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TYPED_TEST(TypedUniqueCount, TableNoNull) +{ + using T = TypeParam; + + auto const input1 = cudf::test::make_type_param_vector( + {1, 3, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); + auto const input2 = cudf::test::make_type_param_vector( + {3, 3, 3, 4, 31, 1, 8, 5, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4, 1}); + + std::vector> pair_input; + std::transform( + input1.begin(), input1.end(), input2.begin(), std::back_inserter(pair_input), [](T a, T b) { + return std::make_pair(a, b); + }); + + cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); + cudf::test::fixed_width_column_wrapper input_col2(input2.begin(), input2.end()); + cudf::table_view input_table({input_col1, input_col2}); + + auto const new_end = std::unique(pair_input.begin(), pair_input.end()); + auto const gold = 
std::distance(pair_input.begin(), new_end); + EXPECT_EQ(gold, cudf::unique_count(input_table, null_equality::EQUAL)); +} + +struct UniqueCount : public cudf::test::BaseFixture { +}; + +TEST_F(UniqueCount, WithNull) +{ + using T = int32_t; + + std::vector input = {1, 3, 3, XXX, 31, 1, 8, 2, 0, XXX, XXX, + XXX, 10, 40, 31, 42, 0, 42, 8, 5, XXX}; + std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); + + auto const new_end = std::unique(input.begin(), input.end()); + auto const gold = std::distance(input.begin(), new_end) - 3; + EXPECT_EQ(gold, cudf::unique_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(UniqueCount, IgnoringNull) +{ + using T = int32_t; + + std::vector input = {1, YYY, YYY, XXX, 31, 1, 8, 2, 0, XXX, 1, + XXX, 10, 40, 31, 42, 0, 42, 8, 5, XXX}; + std::vector valid = {1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); + + auto const new_end = std::unique(input.begin(), input.end()); + // -1 since `YYY, YYY, XXX` is in the same group of equivalent rows + auto const gold = std::distance(input.begin(), new_end) - 1; + EXPECT_EQ(gold, cudf::unique_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(UniqueCount, WithNansAndNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, XXX, 31, 1, 8, 2, 0, XXX, 1, + XXX, 10, 40, 31, NAN, 0, NAN, 8, 5, XXX}; + std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + auto const new_end = std::unique(input.begin(), input.end()); + auto const gold = std::distance(input.begin(), new_end); + EXPECT_EQ(gold, cudf::unique_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + 
+ input = {NAN, NAN, XXX}; + valid = {1, 1, 0}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 2; + EXPECT_EQ(expected_all_nan, + cudf::unique_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(UniqueCount, WithNansOnly) +{ + using T = float; + + std::vector input = {1, 3, NAN, 70, 31}; + std::vector valid = {1, 1, 1, 1, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 5; + EXPECT_EQ(expected, + cudf::unique_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + input = {NAN, NAN, NAN}; + valid = {1, 1, 1}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 1; + EXPECT_EQ(expected_all_nan, + cudf::unique_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(UniqueCount, NansAsNullWithNoNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, 70, 31}; + std::vector valid = {1, 1, 1, 1, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 5; + EXPECT_EQ(expected, cudf::unique_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + + input = {NAN, NAN, NAN}; + valid = {1, 1, 1}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 1; + EXPECT_EQ(expected_all_nan, + cudf::unique_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(UniqueCount, NansAsNullWithNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, XXX, 31}; + std::vector valid = {1, 1, 1, 0, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 4; + EXPECT_EQ(expected, cudf::unique_count(input_col, 
null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + + input = {NAN, NAN, XXX}; + valid = {1, 1, 0}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_null = 1; + EXPECT_EQ(expected_all_null, + cudf::unique_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(UniqueCount, NansAsNullWithIgnoreNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, XXX, 31}; + std::vector valid = {1, 1, 1, 0, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 3; + EXPECT_EQ(expected, cudf::unique_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + + input = {NAN, NAN, NAN}; + valid = {1, 1, 1}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 0; + EXPECT_EQ(expected_all_nan, + cudf::unique_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(UniqueCount, EmptyColumn) +{ + using T = float; + + cudf::test::fixed_width_column_wrapper input_col{}; + + constexpr auto expected = 0; + EXPECT_EQ(expected, cudf::unique_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); +} diff --git a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp b/cpp/tests/stream_compaction/unique_tests.cpp similarity index 62% rename from cpp/tests/stream_compaction/drop_duplicates_tests.cpp rename to cpp/tests/stream_compaction/unique_tests.cpp index cd9a5725db2..ddb377c69a1 100644 --- a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp +++ b/cpp/tests/stream_compaction/unique_tests.cpp @@ -34,10 +34,10 @@ using cudf::nan_policy; using cudf::null_equality; using cudf::null_policy; -struct DropDuplicatesCommon : public cudf::test::BaseFixture { +struct Unique : public cudf::test::BaseFixture { }; -TEST_F(DropDuplicatesCommon, StringKeyColumn) +TEST_F(Unique, StringKeyColumn) { 
cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 5, 5, 8, 1}, {1, 0, 0, 1, 1, 1, 1}}; cudf::test::strings_column_wrapper key_col{{"all", "new", "new", "all", "new", "the", "strings"}, @@ -50,67 +50,41 @@ TEST_F(DropDuplicatesCommon, StringKeyColumn) {1, 1, 1, 0, 1, 1}}; cudf::table_view expected{{exp_col, exp_key_col}}; - auto got = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + auto got = unique(input, keys, cudf::duplicate_keep_option::KEEP_LAST); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got->view()); - - cudf::test::fixed_width_column_wrapper exp_sort_col{{5, 5, 4, 1, 8}, {1, 1, 0, 1, 1}}; - cudf::test::strings_column_wrapper exp_sort_key_col{{"new", "all", "new", "strings", "the"}, - {0, 1, 1, 1, 1}}; - cudf::table_view expected_sort{{exp_sort_col, exp_sort_key_col}}; - - auto got_unordered = unordered_drop_duplicates(input, keys); - auto key_view = got_unordered->select(keys.begin(), keys.end()); - auto sorted_result = cudf::sort_by_key(got_unordered->view(), key_view); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, sorted_result->view()); } -TEST_F(DropDuplicatesCommon, EmptyInputTable) +TEST_F(Unique, EmptyInputTable) { cudf::test::fixed_width_column_wrapper col(std::initializer_list{}); cudf::table_view input{{col}}; std::vector keys{1, 2}; - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + auto got = unique(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); - - auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_unordered->view()); } -TEST_F(DropDuplicatesCommon, NoColumnInputTable) +TEST_F(Unique, NoColumnInputTable) { cudf::table_view input{std::vector()}; std::vector keys{1, 2}; - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + auto got = unique(input, keys, 
cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); - - auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_unordered->view()); } -TEST_F(DropDuplicatesCommon, EmptyKeys) +TEST_F(Unique, EmptyKeys) { cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper empty_col{}; cudf::table_view input{{col}}; std::vector keys{}; - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + auto got = unique(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); - - auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got_unordered->view()); } -struct DropDuplicates : public cudf::test::BaseFixture { -}; - -TEST_F(DropDuplicates, NonNullTable) +TEST_F(Unique, NonNullTable) { cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 5}}; cudf::test::fixed_width_column_wrapper col2{{4, 5, 3, 4, 9, 4}}; @@ -129,7 +103,7 @@ TEST_F(DropDuplicates, NonNullTable) cudf::table_view expected_first{ {exp_col1_first, exp_col2_first, exp_col1_key_first, exp_col2_key_first}}; - auto got_first = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST); + auto got_first = unique(input, keys, cudf::duplicate_keep_option::KEEP_FIRST); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); @@ -141,7 +115,7 @@ TEST_F(DropDuplicates, NonNullTable) cudf::table_view expected_last{ {exp_col1_last, exp_col2_last, exp_col1_key_last, exp_col2_key_last}}; - auto got_last = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + auto got_last = unique(input, keys, cudf::duplicate_keep_option::KEEP_LAST); 
CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); @@ -153,12 +127,12 @@ TEST_F(DropDuplicates, NonNullTable) cudf::table_view expected_unique{ {exp_col1_unique, exp_col2_unique, exp_col1_key_unique, exp_col2_key_unique}}; - auto got_unique = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE); + auto got_unique = unique(input, keys, cudf::duplicate_keep_option::KEEP_NONE); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); } -TEST_F(DropDuplicates, KeepFirstWithNull) +TEST_F(Unique, KeepFirstWithNull) { cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, @@ -173,7 +147,7 @@ TEST_F(DropDuplicates, KeepFirstWithNull) {1, 0, 1, 1, 1}}; cudf::table_view expected_first_equal{{exp_col_first_equal, exp_key_col_first_equal}}; auto got_first_equal = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + unique(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first_equal, got_first_equal->view()); @@ -184,12 +158,12 @@ TEST_F(DropDuplicates, KeepFirstWithNull) {20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; cudf::table_view expected_first_unequal{{exp_col_first_unequal, exp_key_col_first_unequal}}; auto got_first_unequal = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::UNEQUAL); + unique(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::UNEQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first_unequal, got_first_unequal->view()); } -TEST_F(DropDuplicates, KeepLastWithNull) +TEST_F(Unique, KeepLastWithNull) { cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, @@ -204,7 +178,7 @@ TEST_F(DropDuplicates, KeepLastWithNull) {1, 0, 1, 1, 1}}; 
cudf::table_view expected_last_equal{{exp_col_last_equal, exp_key_col_last_equal}}; auto got_last_equal = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::EQUAL); + unique(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last_equal, got_last_equal->view()); @@ -215,12 +189,12 @@ TEST_F(DropDuplicates, KeepLastWithNull) {1, 0, 0, 1, 1, 1}}; cudf::table_view expected_last_unequal{{exp_col_last_unequal, exp_key_col_last_unequal}}; auto got_last_unequal = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::UNEQUAL); + unique(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::UNEQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last_unequal, got_last_unequal->view()); } -TEST_F(DropDuplicates, KeepNoneWithNull) +TEST_F(Unique, KeepNoneWithNull) { cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, @@ -233,7 +207,7 @@ TEST_F(DropDuplicates, KeepNoneWithNull) cudf::test::fixed_width_column_wrapper exp_key_col_unique_equal{{19, 21, 19}, {1, 1, 1}}; cudf::table_view expected_unique_equal{{exp_col_unique_equal, exp_key_col_unique_equal}}; auto got_unique_equal = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE, null_equality::EQUAL); + unique(input, keys, cudf::duplicate_keep_option::KEEP_NONE, null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique_equal, got_unique_equal->view()); @@ -244,62 +218,7 @@ TEST_F(DropDuplicates, KeepNoneWithNull) {0, 0, 1, 1, 1}}; cudf::table_view expected_unique_unequal{{exp_col_unique_unequal, exp_key_col_unique_unequal}}; auto got_unique_unequal = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE, null_equality::UNEQUAL); + unique(input, keys, cudf::duplicate_keep_option::KEEP_NONE, null_equality::UNEQUAL); 
CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique_unequal, got_unique_unequal->view()); } - -struct UnorderedDropDuplicates : public cudf::test::BaseFixture { -}; - -TEST_F(UnorderedDropDuplicates, NonNullTable) -{ - cudf::test::fixed_width_column_wrapper col1{{6, 6, 3, 5, 8, 5}}; - cudf::test::fixed_width_column_wrapper col2{{6, 6, 3, 4, 9, 4}}; - cudf::test::fixed_width_column_wrapper col1_key{{20, 20, 20, 19, 21, 9}}; - cudf::test::fixed_width_column_wrapper col2_key{{19, 19, 20, 20, 9, 21}}; - - cudf::table_view input{{col1, col2, col1_key, col2_key}}; - std::vector keys{2, 3}; - - // The expected table would be sorted in ascending order with respect to keys - cudf::test::fixed_width_column_wrapper exp_col1{{5, 5, 6, 3, 8}}; - cudf::test::fixed_width_column_wrapper exp_col2{{4, 4, 6, 3, 9}}; - cudf::test::fixed_width_column_wrapper exp_col1_key{{9, 19, 20, 20, 21}}; - cudf::test::fixed_width_column_wrapper exp_col2_key{{21, 20, 19, 20, 9}}; - cudf::table_view expected{{exp_col1, exp_col2, exp_col1_key, exp_col2_key}}; - - auto result = unordered_drop_duplicates(input, keys); - auto key_view = result->select(keys.begin(), keys.end()); - auto sorted_result = cudf::sort_by_key(result->view(), key_view); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, sorted_result->view()); -} - -TEST_F(UnorderedDropDuplicates, WithNull) -{ - cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 1, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; - cudf::table_view input{{col, key}}; - std::vector keys{1}; - - // nulls are equal - cudf::test::fixed_width_column_wrapper exp_equal_col{{4, 1, 5, 8}, {0, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_equal_key_col{{20, 19, 20, 21}, {0, 1, 1, 1}}; - cudf::table_view expected_equal{{exp_equal_col, exp_equal_key_col}}; - auto res_equal = unordered_drop_duplicates(input, keys, null_equality::EQUAL); - auto equal_keys = res_equal->select(keys.begin(), keys.end()); - 
auto sorted_equal = cudf::sort_by_key(res_equal->view(), equal_keys); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_equal, sorted_equal->view()); - - // nulls are unequal - cudf::test::fixed_width_column_wrapper exp_unequal_col{{4, 1, 4, 5, 8}, {0, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_unequal_key_col{{20, 19, 20, 20, 21}, - {0, 1, 0, 1, 1}}; - cudf::table_view expected_unequal{{exp_unequal_col, exp_unequal_key_col}}; - auto res_unequal = unordered_drop_duplicates(input, keys, null_equality::UNEQUAL); - auto sorted_unequal = cudf::sort(res_unequal->view()); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unequal, sorted_unequal->view()); -} From 053cda331554be46ce88981aeb2111b077e81d9e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 10 Mar 2022 16:31:26 -0500 Subject: [PATCH 12/17] Update JNI and Cython bindings --- java/src/main/native/src/TableJni.cpp | 17 +++++++++-------- python/cudf/cudf/_lib/cpp/stream_compaction.pxd | 4 ++-- python/cudf/cudf/_lib/stream_compaction.pyx | 15 ++++++++------- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index d426ac3ce62..72a4bb23d01 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -3044,20 +3044,21 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_dropDuplicates( auto const keys_indices = std::vector(native_keys_indices.begin(), native_keys_indices.end()); - // cudf::drop_duplicates works like std::unique thus does NOT match the behavior of - // pandas.DataFrame.drop_duplicates. Users need to stable sort the input first and then drop. + // cudf::unique keeps unique rows in each consecutive group of equivalent rows. To match the + // behavior of pandas.DataFrame.drop_duplicates, users need to stable sort the input first and + // then unique. 
std::vector order(keys_indices.size(), cudf::order::ASCENDING); std::vector null_precedence( keys_indices.size(), nulls_before ? cudf::null_order::BEFORE : cudf::null_order::AFTER); auto const sorted_input = cudf::stable_sort_by_key(*input, input->select(keys_indices), order, null_precedence); - auto result = cudf::drop_duplicates(sorted_input->view(), keys_indices, - keep_first ? cudf::duplicate_keep_option::KEEP_FIRST : - cudf::duplicate_keep_option::KEEP_LAST, - nulls_equal ? cudf::null_equality::EQUAL : - cudf::null_equality::UNEQUAL, - rmm::mr::get_current_device_resource()); + auto result = + cudf::unique(sorted_input->view(), keys_indices, + keep_first ? cudf::duplicate_keep_option::KEEP_FIRST : + cudf::duplicate_keep_option::KEEP_LAST, + nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL, + rmm::mr::get_current_device_resource()); return convert_table_for_return(env, result); } CATCH_STD(env, 0); diff --git a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd index 897b61f8001..61efd040807 100644 --- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd @@ -33,13 +33,13 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" \ column_view boolean_mask ) except + - cdef unique_ptr[table] drop_duplicates( + cdef unique_ptr[table] unique( table_view source_table, vector[size_type] keys, duplicate_keep_option keep, null_equality nulls_equal) except + - cdef size_type unordered_distinct_count( + cdef size_type distinct_count( column_view source_table, null_policy null_handling, nan_policy nan_handling) except + diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 93c15791840..e8fc584dc3b 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -12,10 +12,10 @@ from cudf._lib.cpp.column.column_view cimport column_view from 
cudf._lib.cpp.sorting cimport stable_sort_by_key as cpp_stable_sort_by_key from cudf._lib.cpp.stream_compaction cimport ( apply_boolean_mask as cpp_apply_boolean_mask, - drop_duplicates as cpp_drop_duplicates, + distinct_count as cpp_distinct_count, drop_nulls as cpp_drop_nulls, duplicate_keep_option, - unordered_distinct_count as cpp_unordered_distinct_count, + unique as cpp_unique, ) from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -167,9 +167,10 @@ def drop_duplicates(columns: list, cdef unique_ptr[table] c_result with nogil: - # cudf::drop_duplicates works like std::unique thus does NOT match - # the behavior of pandas.DataFrame.drop_duplicates. Users need to - # stable sort the input first and then drop. + # cudf::unique keeps unique rows in each consecutive group of + # equivalent rows. To match the behavior of pandas.DataFrame. + # drop_duplicates, users need to stable sort the input first + # and then unique. sorted_source_table = move( cpp_stable_sort_by_key( source_table_view, @@ -179,7 +180,7 @@ def drop_duplicates(columns: list, ) ) c_result = move( - cpp_drop_duplicates( + cpp_unique( sorted_source_table.get().view(), cpp_keys, cpp_keep_option, @@ -220,7 +221,7 @@ def distinct_count(Column source_column, ignore_nulls=True, nan_as_null=False): cdef column_view source_column_view = source_column.view() with nogil: - count = cpp_unordered_distinct_count( + count = cpp_distinct_count( source_column_view, cpp_null_handling, cpp_nan_handling From cdf35bd63c6f1b72d854c8bb54a1d5c572acb34b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 10 Mar 2022 16:56:17 -0500 Subject: [PATCH 13/17] cmake format --- cpp/benchmarks/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 8e939482e45..ac2af2c32ef 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -145,7 +145,8 @@ 
ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- ConfigureNVBench( - STREAM_COMPACTION_BENCH stream_compaction/distinct.cpp stream_compaction/unique.cpp) + STREAM_COMPACTION_BENCH stream_compaction/distinct.cpp stream_compaction/unique.cpp +) # ################################################################################################## # * join benchmark -------------------------------------------------------------------------------- From 3470dc2afd9757aa4154016d340a21956a1c7a00 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Mar 2022 14:29:27 -0500 Subject: [PATCH 14/17] Address review comments: update docs/comments + code cleanups --- cpp/benchmarks/stream_compaction/distinct.cpp | 4 ++-- cpp/benchmarks/stream_compaction/unique.cpp | 4 ++-- cpp/include/cudf/stream_compaction.hpp | 8 ++++---- cpp/src/dictionary/set_keys.cu | 8 ++++---- cpp/src/stream_compaction/distinct.cu | 10 +++++----- cpp/src/stream_compaction/unique_count.cu | 2 +- cpp/src/transform/encode.cu | 4 ++-- java/src/main/native/src/TableJni.cpp | 2 +- python/cudf/cudf/_lib/stream_compaction.pyx | 2 +- 9 files changed, 22 insertions(+), 22 deletions(-) diff --git a/cpp/benchmarks/stream_compaction/distinct.cpp b/cpp/benchmarks/stream_compaction/distinct.cpp index 0eb353381e1..3d601479068 100644 --- a/cpp/benchmarks/stream_compaction/distinct.cpp +++ b/cpp/benchmarks/stream_compaction/distinct.cpp @@ -39,8 +39,8 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list) cudf::test::UniformRandomGenerator rand_gen(0, 100); auto elements = cudf::detail::make_counting_transform_iterator( 0, [&rand_gen](auto row) { return rand_gen.generate(); }); - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 100 == 0 ? 
false : true; }); + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 100 != 0; }); cudf::test::fixed_width_column_wrapper values(elements, elements + num_rows, valids); auto input_column = cudf::column_view(values); diff --git a/cpp/benchmarks/stream_compaction/unique.cpp b/cpp/benchmarks/stream_compaction/unique.cpp index 5ac90a878e2..edc4097e55b 100644 --- a/cpp/benchmarks/stream_compaction/unique.cpp +++ b/cpp/benchmarks/stream_compaction/unique.cpp @@ -61,8 +61,8 @@ void nvbench_unique(nvbench::state& state, nvbench::type_list rand_gen(0, 100); auto elements = cudf::detail::make_counting_transform_iterator( 0, [&rand_gen](auto row) { return rand_gen.generate(); }); - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 100 == 0 ? false : true; }); + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 100 != 0; }); cudf::test::fixed_width_column_wrapper values(elements, elements + num_rows, valids); auto input_column = cudf::column_view(values); diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index a8d1bbc28f5..a726a619d86 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -278,7 +278,7 @@ std::unique_ptr
distinct( * @param[in] null_handling flag to include or ignore `null` while counting * @param[in] nan_handling flag to consider `NaN==null` or not * - * @return number of distinct consecutive groups in the column + * @return number of consecutive groups of equivalent rows in the column */ cudf::size_type unique_count(column_view const& input, null_policy null_handling, @@ -291,7 +291,7 @@ cudf::size_type unique_count(column_view const& input, * @param[in] nulls_equal flag to denote if null elements should be considered equal. * nulls are not equal if null_equality::UNEQUAL. * - * @return number of distinct consecutive groups in the table + * @return number of consecutive groups of equivalent rows in the table */ cudf::size_type unique_count(table_view const& input, null_equality nulls_equal = null_equality::EQUAL); @@ -314,7 +314,7 @@ cudf::size_type unique_count(table_view const& input, * @param[in] null_handling flag to include or ignore `null` while counting * @param[in] nan_handling flag to consider `NaN==null` or not * - * @return number of unique elements + * @return number of distinct rows in the column */ cudf::size_type distinct_count(column_view const& input, null_policy null_handling, @@ -327,7 +327,7 @@ cudf::size_type distinct_count(column_view const& input, * @param[in] nulls_equal flag to denote if null elements should be considered equal. * nulls are not equal if null_equality::UNEQUAL. 
* - * @return number of unique rows in the table + * @return number of distinct rows in the table */ cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal = null_equality::EQUAL); diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index 357f4ced00e..a3bbbc37506 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -119,11 +119,11 @@ std::unique_ptr set_keys( auto keys = dictionary_column.keys(); CUDF_EXPECTS(keys.type() == new_keys.type(), "keys types must match"); - // copy the keys -- use cudf::distinct to make sure they are unique, then - // sort the results. - auto unique_keys = cudf::detail::distinct( + // copy the keys -- use cudf::distinct to make sure there are no duplicates, + // then sort the results. + auto distinct_keys = cudf::detail::distinct( table_view{{new_keys}}, std::vector{0}, null_equality::EQUAL, stream, mr); - auto sorted_keys = cudf::detail::sort(unique_keys->view(), + auto sorted_keys = cudf::detail::sort(distinct_keys->view(), std::vector{order::ASCENDING}, std::vector{null_order::BEFORE}, stream, diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu index 1d5d41f3095..d856e63b8cb 100644 --- a/cpp/src/stream_compaction/distinct.cu +++ b/cpp/src/stream_compaction/distinct.cu @@ -70,7 +70,7 @@ std::unique_ptr
distinct(table_view const& input, auto iter = cudf::detail::make_counting_transform_iterator( 0, [] __device__(size_type i) { return cuco::make_pair(i, i); }); - // insert unique indices into the map. + // insert distinct indices into the map. key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); auto counting_iter = thrust::make_counting_iterator(0); @@ -80,10 +80,10 @@ std::unique_ptr
distinct(table_view const& input, auto const output_size{key_map.get_size()}; - // write unique indices to a numeric column - auto unique_indices = cudf::make_numeric_column( + // write distinct indices to a numeric column + auto distinct_indices = cudf::make_numeric_column( data_type{type_id::INT32}, output_size, mask_state::UNALLOCATED, stream, mr); - auto mutable_view = mutable_column_device_view::create(*unique_indices, stream); + auto mutable_view = mutable_column_device_view::create(*distinct_indices, stream); thrust::copy_if(rmm::exec_policy(stream), counting_iter, counting_iter + num_rows, @@ -93,7 +93,7 @@ std::unique_ptr
distinct(table_view const& input, // run gather operation to establish new order return detail::gather(input, - unique_indices->view(), + distinct_indices->view(), out_of_bounds_policy::DONT_CHECK, detail::negative_index_policy::NOT_ALLOWED, stream, diff --git a/cpp/src/stream_compaction/unique_count.cu b/cpp/src/stream_compaction/unique_count.cu index 1b0a83ede80..91a2537cf97 100644 --- a/cpp/src/stream_compaction/unique_count.cu +++ b/cpp/src/stream_compaction/unique_count.cu @@ -50,7 +50,7 @@ namespace { * the row `index` of `column_device_view` is `NaN`. */ struct check_nan { - // Check if it's `NaN` for floating point type columns + // Check if a value is `NaN` for floating point type columns template >* = nullptr> __device__ inline bool operator()(column_device_view const& input, size_type index) { diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index 7236fe3882c..04821b09eab 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -46,13 +46,13 @@ std::pair, std::unique_ptr> encode( std::vector drop_keys(num_cols); std::iota(drop_keys.begin(), drop_keys.end(), 0); - auto unique_keys = + auto distinct_keys = cudf::detail::distinct(input_table, drop_keys, null_equality::EQUAL, stream, mr); std::vector column_order(num_cols, order::ASCENDING); std::vector null_precedence(num_cols, null_order::AFTER); auto sorted_unique_keys = - cudf::detail::sort(unique_keys->view(), column_order, null_precedence, stream, mr); + cudf::detail::sort(distinct_keys->view(), column_order, null_precedence, stream, mr); auto indices_column = cudf::detail::lower_bound( sorted_unique_keys->view(), input_table, column_order, null_precedence, stream, mr); diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 72a4bb23d01..78ac8a18107 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -3046,7 +3046,7 @@ JNIEXPORT jlongArray JNICALL 
Java_ai_rapids_cudf_Table_dropDuplicates( // cudf::unique keeps unique rows in each consecutive group of equivalent rows. To match the // behavior of pandas.DataFrame.drop_duplicates, users need to stable sort the input first and - // then unique. + // then invoke cudf::unique. std::vector order(keys_indices.size(), cudf::order::ASCENDING); std::vector null_precedence( keys_indices.size(), nulls_before ? cudf::null_order::BEFORE : cudf::null_order::AFTER); diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index e8fc584dc3b..a627b501a2f 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -170,7 +170,7 @@ def drop_duplicates(columns: list, # cudf::unique keeps unique rows in each consecutive group of # equivalent rows. To match the behavior of pandas.DataFrame. # drop_duplicates, users need to stable sort the input first - # and then unique. + # and then invoke cudf::unique. sorted_source_table = move( cpp_stable_sort_by_key( source_table_view, From 93cef5b3a877d63ca8105cec7c519efc666a5eaf Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Mar 2022 16:12:04 -0500 Subject: [PATCH 15/17] Remove keep options in distinct benchmark --- cpp/benchmarks/stream_compaction/distinct.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/benchmarks/stream_compaction/distinct.cpp b/cpp/benchmarks/stream_compaction/distinct.cpp index 3d601479068..37d90894746 100644 --- a/cpp/benchmarks/stream_compaction/distinct.cpp +++ b/cpp/benchmarks/stream_compaction/distinct.cpp @@ -52,10 +52,7 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list) }); } -using data_type = nvbench::type_list; -using keep_option = nvbench::enum_type_list; +using data_type = nvbench::type_list; NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type)) .set_name("distinct") From 56f0a5673970379c0895c46112ad0676f2a08b19 Mon Sep 17 00:00:00 2001 From: 
Yunsong Wang Date: Fri, 11 Mar 2022 16:24:46 -0500 Subject: [PATCH 16/17] Update docs --- cpp/include/cudf/stream_compaction.hpp | 36 ++++++++++++++------------ 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index a726a619d86..914b9df2a74 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -216,6 +216,10 @@ enum class duplicate_keep_option { /** * @brief Create a new table with consecutive duplicate rows removed. * + * A row is distinct if there are no equivalent rows in the table. A row is unique if there is no + * adjacent equivalent row. That is, keeping distinct rows removes all duplicates in the + * table/column, while keeping unique rows only removes duplicates from consecutive groupings. + * * Given an `input` table_view, one specific row from a group of equivalent elements is copied to * output table depending on the value of @p keep: * - KEEP_FIRST: only the first of a sequence of duplicate rows is copied @@ -245,19 +249,19 @@ std::unique_ptr
unique( * @brief Create a new table without duplicate rows. * * Given an `input` table_view, each row is copied to output table if the corresponding - * row of `keys` columns is unique. If duplicate rows are present, it is unspecified which - * row is copied. + * row of `keys` columns is distinct (no other equivalent row exists in the table). If duplicate + * rows are present, it is unspecified which row is copied. * * The order of elements in the output table is not specified. * - * @param[in] input input table_view to copy only unique rows + * @param[in] input input table_view to copy only distinct rows * @param[in] keys vector of indices representing key columns from `input` * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not * equal if null_equality::UNEQUAL * @param[in] mr Device memory resource used to allocate the returned table's device * memory * - * @return Table with unique rows in an unspecified order. + * @return Table with distinct rows in an unspecified order. */ std::unique_ptr
distinct( table_view const& input, @@ -266,7 +270,7 @@ std::unique_ptr
distinct( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Count the number of consecutive groups of equivalent elements in a column. + * @brief Count the number of consecutive groups of equivalent rows in a column. * * If `null_handling` is null_policy::EXCLUDE and `nan_handling` is nan_policy::NAN_IS_NULL, both * `NaN` and `null` values are ignored. If `null_handling` is null_policy::EXCLUDE and @@ -274,7 +278,7 @@ std::unique_ptr
distinct( * * `null`s are handled as equal. * - * @param[in] input The column_view whose number of distinct consecutive groups will be counted + * @param[in] input The column_view whose consecutive groups of equivalent rows will be counted * @param[in] null_handling flag to include or ignore `null` while counting * @param[in] nan_handling flag to consider `NaN==null` or not * @@ -285,9 +289,9 @@ cudf::size_type unique_count(column_view const& input, nan_policy nan_handling); /** - * @brief Count the number of consecutive groups of equivalent elements in a table. + * @brief Count the number of consecutive groups of equivalent rows in a table. * - * @param[in] input Table whose number of distinct consecutive groups will be counted + * @param[in] input Table whose consecutive groups of equivalent rows will be counted * @param[in] nulls_equal flag to denote if null elements should be considered equal. * nulls are not equal if null_equality::UNEQUAL. * @@ -297,20 +301,20 @@ cudf::size_type unique_count(table_view const& input, null_equality nulls_equal = null_equality::EQUAL); /** - * @brief Count the unique elements in the column_view. + * @brief Count the distinct elements in the column_view. * - * If `nulls_equal == nulls_equal::UNEQUAL`, all `null`s are unique. + * If `nulls_equal == nulls_equal::UNEQUAL`, all `null`s are distinct. * - * Given an input column_view, number of unique elements in this column_view is returned. + * Given an input column_view, number of distinct elements in this column_view is returned. * * If `null_handling` is null_policy::EXCLUDE and `nan_handling` is nan_policy::NAN_IS_NULL, both * `NaN` and `null` values are ignored. If `null_handling` is null_policy::EXCLUDE and - * `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in unique - * count. + * `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in + * distinct count. * * `null`s are handled as equal. 
* - * @param[in] input The column_view whose unique elements will be counted + * @param[in] input The column_view whose distinct elements will be counted * @param[in] null_handling flag to include or ignore `null` while counting * @param[in] nan_handling flag to consider `NaN==null` or not * @@ -321,9 +325,9 @@ cudf::size_type distinct_count(column_view const& input, nan_policy nan_handling); /** - * @brief Count the unique rows in a table. + * @brief Count the distinct rows in a table. * - * @param[in] input Table whose unique rows will be counted + * @param[in] input Table whose distinct rows will be counted * @param[in] nulls_equal flag to denote if null elements should be considered equal. * nulls are not equal if null_equality::UNEQUAL. * From 2c3c02a3f2cfb571a5e666f03b3f2219d449ad54 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Mar 2022 19:13:43 -0500 Subject: [PATCH 17/17] Add performance hints into doc --- cpp/include/cudf/stream_compaction.hpp | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 914b9df2a74..b48795de16e 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -216,16 +216,22 @@ enum class duplicate_keep_option { /** * @brief Create a new table with consecutive duplicate rows removed. * - * A row is distinct if there are no equivalent rows in the table. A row is unique if there is no - * adjacent equivalent row. That is, keeping distinct rows removes all duplicates in the - * table/column, while keeping unique rows only removes duplicates from consecutive groupings. 
- * * Given an `input` table_view, one specific row from a group of equivalent elements is copied to * output table depending on the value of @p keep: * - KEEP_FIRST: only the first of a sequence of duplicate rows is copied * - KEEP_LAST: only the last of a sequence of duplicate rows is copied * - KEEP_NONE: no duplicate rows are copied * + * A row is distinct if there are no equivalent rows in the table. A row is unique if there is no + * adjacent equivalent row. That is, keeping distinct rows removes all duplicates in the + * table/column, while keeping unique rows only removes duplicates from consecutive groupings. + * + * Performance hints: + * - Always use `cudf::unique` instead of `cudf::distinct` if the input is pre-sorted + * - If the input is not pre-sorted and the behavior of pandas.DataFrame.drop_duplicates is desired: + * - If `keep` is not relevant, use `cudf::distinct` + * - If `keep` control is required, stable sort the input then `cudf::unique` + * * @throws cudf::logic_error if the `keys` column indices are out of bounds in the `input` table. * * @param[in] input input table_view to copy only unique rows @@ -254,6 +260,12 @@ std::unique_ptr
unique( * * The order of elements in the output table is not specified. * + * Performance hints: + * - Always use `cudf::unique` instead of `cudf::distinct` if the input is pre-sorted + * - If the input is not pre-sorted and the behavior of pandas.DataFrame.drop_duplicates is desired: + * - If `keep` is not relevant, use `cudf::distinct` + * - If `keep` control is required, stable sort the input then `cudf::unique` + * * @param[in] input input table_view to copy only distinct rows * @param[in] keys vector of indices representing key columns from `input` * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not