diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 57592de59af..13ef02efc99 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -123,7 +123,7 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- -ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates.cpp) +ConfigureNVBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates.cpp) # ################################################################################################## # * join benchmark -------------------------------------------------------------------------------- diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates.cpp b/cpp/benchmarks/stream_compaction/drop_duplicates.cpp index 8039d7d065f..317db92ae8b 100644 --- a/cpp/benchmarks/stream_compaction/drop_duplicates.cpp +++ b/cpp/benchmarks/stream_compaction/drop_duplicates.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,64 +15,102 @@ */ #include -#include +#include #include #include #include -#include -#include + +#include + +#include #include #include -class Compaction : public cudf::benchmark { -}; +// necessary for custom enum types +// see: https://github.com/NVIDIA/nvbench/blob/main/examples/enums.cu +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + // Enum type: + cudf::duplicate_keep_option, + // Callable to generate input strings: + [](cudf::duplicate_keep_option option) { + switch (option) { + case cudf::duplicate_keep_option::KEEP_FIRST: return "KEEP_FIRST"; + case cudf::duplicate_keep_option::KEEP_LAST: return "KEEP_LAST"; + case cudf::duplicate_keep_option::KEEP_NONE: return "KEEP_NONE"; + default: return "ERROR"; + } + }, + // Callable to generate descriptions: + [](auto) { return std::string{}; }) + +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); + +template +void nvbench_drop_duplicates(nvbench::state& state, + nvbench::type_list>) +{ + if constexpr (not std::is_same_v and + Keep != cudf::duplicate_keep_option::KEEP_FIRST) { + state.skip("Skip unwanted benchmarks."); + } + + cudf::rmm_pool_raii pool_raii; + + auto const num_rows = state.get_int64("NumRows"); + + cudf::test::UniformRandomGenerator rand_gen(0, 100); + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [&rand_gen](auto row) { return rand_gen.generate(); }); + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 100 == 0 ? 
false : true; }); + cudf::test::fixed_width_column_wrapper values(elements, elements + num_rows, valids); + + auto input_column = cudf::column_view(values); + auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + auto result = cudf::detail::drop_duplicates( + input_table, {0}, Keep, cudf::null_equality::EQUAL, cudf::null_order::BEFORE, stream_view); + }); +} template -void BM_compaction(benchmark::State& state, cudf::duplicate_keep_option keep) +void nvbench_unordered_drop_duplicates(nvbench::state& state, nvbench::type_list) { - auto const n_rows = static_cast(state.range(0)); + cudf::rmm_pool_raii pool_raii; + + auto const num_rows = state.get_int64("NumRows"); cudf::test::UniformRandomGenerator rand_gen(0, 100); auto elements = cudf::detail::make_counting_transform_iterator( 0, [&rand_gen](auto row) { return rand_gen.generate(); }); auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 100 == 0 ? 
false : true; }); - cudf::test::fixed_width_column_wrapper values(elements, elements + n_rows, valids); + cudf::test::fixed_width_column_wrapper values(elements, elements + num_rows, valids); auto input_column = cudf::column_view(values); auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); - for (auto _ : state) { - cuda_event_timer timer(state, true); - auto result = cudf::drop_duplicates(input_table, {0}, keep); - } + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + auto result = cudf::detail::unordered_drop_duplicates( + input_table, {0}, cudf::null_equality::EQUAL, stream_view); + }); } -#define concat(a, b, c) a##b##c -#define get_keep(op) cudf::duplicate_keep_option::KEEP_##op - -// TYPE, OP -#define RBM_BENCHMARK_DEFINE(name, type, keep) \ - BENCHMARK_DEFINE_F(Compaction, name)(::benchmark::State & state) \ - { \ - BM_compaction(state, get_keep(keep)); \ - } \ - BENCHMARK_REGISTER_F(Compaction, name) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ - -#define COMPACTION_BENCHMARK_DEFINE(type, keep) \ - RBM_BENCHMARK_DEFINE(concat(type, _, keep), type, keep) - -COMPACTION_BENCHMARK_DEFINE(bool, NONE); -COMPACTION_BENCHMARK_DEFINE(int8_t, NONE); -COMPACTION_BENCHMARK_DEFINE(int32_t, NONE); -COMPACTION_BENCHMARK_DEFINE(int32_t, FIRST); -COMPACTION_BENCHMARK_DEFINE(int32_t, LAST); -using cudf::timestamp_ms; -COMPACTION_BENCHMARK_DEFINE(timestamp_ms, NONE); -COMPACTION_BENCHMARK_DEFINE(float, NONE); +using data_type = nvbench::type_list; +using keep_option = nvbench::enum_type_list; + +NVBENCH_BENCH_TYPES(nvbench_drop_duplicates, NVBENCH_TYPE_AXES(data_type, keep_option)) + .set_name("drop_duplicates") + .set_type_axes_names({"Type", "KeepOption"}) + .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); + 
+NVBENCH_BENCH_TYPES(nvbench_unordered_drop_duplicates, NVBENCH_TYPE_AXES(data_type)) + .set_name("unordered_drop_duplicates") + .set_type_axes_names({"Type"}) + .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index c964c85156c..5a20f78b798 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -21,7 +21,7 @@ function(find_and_configure_cucollections) cuco 0.0 GLOBAL_TARGETS cuco::cuco CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections - GIT_TAG 0ca860b824f5dc22cf8a41f09912e62e11f07d82 + GIT_TAG 6ec8b6dcdeceea07ab4456d32461a05c18864411 OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF" ) diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 87823d71c6f..3d065556827 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -75,6 +75,18 @@ std::unique_ptr drop_duplicates( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::unordered_drop_duplicates + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
+ */ +std::unique_ptr
unordered_drop_duplicates( + table_view const& input, + std::vector const& keys, + null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy) * @@ -94,5 +106,24 @@ cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal = null_equality::EQUAL, rmm::cuda_stream_view stream = rmm::cuda_stream_default); +/** + * @copydoc cudf::unordered_distinct_count(column_view const&, null_policy, nan_policy) + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +cudf::size_type unordered_distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +/** + * @copydoc cudf::unordered_distinct_count(table_view const&, null_equality) + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +cudf::size_type unordered_distinct_count(table_view const& input, + null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 7551511d281..94039d81f31 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -189,7 +189,7 @@ std::unique_ptr
drop_nans( * @note if @p input.num_rows() is zero, there is no error, and an empty table * is returned. * - * @throws cudf::logic_error if The `input` size and `boolean_mask` size mismatches. + * @throws cudf::logic_error if `input.num_rows() != boolean_mask.size()`. * @throws cudf::logic_error if `boolean_mask` is not `type_id::BOOL8` type. * * @param[in] input The input table_view to filter @@ -214,7 +214,10 @@ enum class duplicate_keep_option { }; /** - * @brief Create a new table without duplicate rows + * @brief Create a new table without duplicate rows. + * + * The output table is sorted according to the lexicographic ordering of the data in the columns + * indexed by `keys`. * * Given an `input` table_view, each row is copied to output table if the corresponding * row of `keys` columns is unique, where the definition of unique depends on the value of @p keep: @@ -222,18 +225,18 @@ enum class duplicate_keep_option { * - KEEP_LAST: only the last of a sequence of duplicate rows is copied * - KEEP_NONE: no duplicate rows are copied * - * @throws cudf::logic_error if The `input` row size mismatches with `keys`. + * @throws cudf::logic_error if the `keys` column indices are out of bounds in the `input` table. * * @param[in] input input table_view to copy only unique rows * @param[in] keys vector of indices representing key columns from `input` - * @param[in] keep keep first entry, last entry, or no entries if duplicates found + * @param[in] keep keep first row, last row, or no rows of the found duplicates * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not * equal if null_equality::UNEQUAL * @param[in] null_precedence flag to denote nulls should appear before or after non-null items * @param[in] mr Device memory resource used to allocate the returned table's device - * memory + * memory * - * @return Table with unique rows as per specified `keep`. + * @return Table with sorted unique rows as specified by `keep`. 
*/ std::unique_ptr
drop_duplicates( table_view const& input, @@ -244,37 +247,95 @@ std::unique_ptr
drop_duplicates( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Count the unique elements in the column_view + * @brief Create a new table without duplicate rows with hash-based algorithms. + * + * Given an `input` table_view, each row is copied to output table if the corresponding + * row of `keys` columns is unique. If duplicate rows are present, it is unspecified which + * row is copied. * - * Given an input column_view, number of unique elements in this column_view is returned + * The order of elements in the output table is not specified. + * + * @param[in] input input table_view to copy only unique rows + * @param[in] keys vector of indices representing key columns from `input` + * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not + * equal if null_equality::UNEQUAL + * @param[in] mr Device memory resource used to allocate the returned table's device + * memory + * + * @return Table with unique rows in an unspecified order. + */ +std::unique_ptr
unordered_drop_duplicates( + table_view const& input, + std::vector const& keys, + null_equality nulls_equal = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Count the number of consecutive groups of equivalent elements in a column. * * If `null_handling` is null_policy::EXCLUDE and `nan_handling` is nan_policy::NAN_IS_NULL, both * `NaN` and `null` values are ignored. If `null_handling` is null_policy::EXCLUDE and - * `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in unique - * count. + * `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in count. + * + * `null`s are handled as equal. * - * @param[in] input The column_view whose unique elements will be counted. + * @param[in] input The column_view whose number of distinct consecutive groups will be counted * @param[in] null_handling flag to include or ignore `null` while counting - * @param[in] nan_handling flag to consider `NaN==null` or not. + * @param[in] nan_handling flag to consider `NaN==null` or not * - * @return number of unique elements + * @return number of distinct consecutive groups in the column */ cudf::size_type distinct_count(column_view const& input, null_policy null_handling, nan_policy nan_handling); /** - * @brief Count the unique rows in a table. - * + * @brief Count the number of consecutive groups of equivalent elements in a table. * - * @param[in] input Table whose unique rows will be counted. - * @param[in] nulls_equal flag to denote if null elements should be considered equal - * nulls are not equal if null_equality::UNEQUAL + * @param[in] input Table whose number of distinct consecutive groups will be counted + * @param[in] nulls_equal flag to denote if null elements should be considered equal. + * nulls are not equal if null_equality::UNEQUAL. 
* - * @return number of unique rows in the table + * @return number of distinct consecutive groups in the table */ cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal = null_equality::EQUAL); +/** + * @brief Count the unique elements in the column_view. + * + * This overload has no `nulls_equal` parameter: all `null`s are compared as equal. + * + * Given an input column_view, the number of unique elements in this column_view is returned. + * + * If `null_handling` is null_policy::EXCLUDE and `nan_handling` is nan_policy::NAN_IS_NULL, both + * `NaN` and `null` values are ignored. If `null_handling` is null_policy::EXCLUDE and + * `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in unique + * count. + * + * `null`s are handled as equal. + * + * @param[in] input The column_view whose unique elements will be counted + * @param[in] null_handling flag to include or ignore `null` while counting + * @param[in] nan_handling flag to consider `NaN==null` or not + * + * @return number of unique elements + */ +cudf::size_type unordered_distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling); + +/** + * @brief Count the unique rows in a table. + * + * @param[in] input Table whose unique rows will be counted + * @param[in] nulls_equal flag to denote if null elements should be considered equal. + * nulls are not equal if null_equality::UNEQUAL. + * + * @return number of unique rows in the table + */ +cudf::size_type unordered_distinct_count(table_view const& input, + null_equality nulls_equal = null_equality::EQUAL); + /** @} */ } // namespace cudf diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index e3d1ea88ece..96b7fd48dc9 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -57,26 +58,29 @@ std::unique_ptr add_keys( // [a,b,c,d,f] + [d,b,e] = [a,b,c,d,f,d,b,e] auto combined_keys = cudf::detail::concatenate(std::vector{old_keys, new_keys}, stream); - // sort and remove any duplicates from the combined keys - // drop_duplicates([a,b,c,d,f,d,b,e]) = [a,b,c,d,e,f] - auto table_keys = cudf::detail::drop_duplicates(table_view{{combined_keys->view()}}, - std::vector{0}, // only one key column - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::BEFORE, - stream, - mr) - ->release(); - std::unique_ptr keys_column(std::move(table_keys.front())); + + // Drop duplicates from the combined keys, then sort the result. + // sort(unordered_drop_duplicates([a,b,c,d,f,d,b,e])) = [a,b,c,d,e,f] + auto table_keys = + cudf::detail::unordered_drop_duplicates(table_view{{combined_keys->view()}}, + std::vector{0}, // only one key column + null_equality::EQUAL, + stream, + mr); + std::vector column_order{order::ASCENDING}; + std::vector null_precedence{null_order::AFTER}; // should be no nulls here + auto sorted_keys = + cudf::detail::sort(table_keys->view(), column_order, null_precedence, stream, mr)->release(); + + std::unique_ptr keys_column(std::move(sorted_keys.front())); // create a map for the indices // lower_bound([a,b,c,d,e,f],[a,b,c,d,f]) = [0,1,2,3,5] - auto map_indices = cudf::detail::lower_bound( - table_view{{keys_column->view()}}, - table_view{{old_keys}}, - std::vector{order::ASCENDING}, - std::vector{null_order::AFTER}, // should be no nulls here - stream, - mr); + auto map_indices = cudf::detail::lower_bound(table_view{{keys_column->view()}}, + table_view{{old_keys}}, + column_order, + null_precedence, + stream, + mr); // now create the indices column -- map old values to the new ones // 
gather([4,0,3,1,2,2,2,4,0],[0,1,2,3,5]) = [5,0,3,1,2,2,2,5,0] column_view indices_view(dictionary_column.indices().type(), diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index fd86d8ec7d4..301338fa1a8 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -216,15 +217,15 @@ std::unique_ptr concatenate(host_span columns, // sort keys and remove duplicates; // this becomes the keys child for the output dictionary column - auto table_keys = cudf::detail::drop_duplicates(table_view{{all_keys->view()}}, - std::vector{0}, - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::BEFORE, - stream, - mr) - ->release(); - std::unique_ptr keys_column(std::move(table_keys.front())); + auto table_keys = cudf::detail::unordered_drop_duplicates( + table_view{{all_keys->view()}}, std::vector{0}, null_equality::EQUAL, stream, mr); + auto sorted_keys = cudf::detail::sort(table_keys->view(), + std::vector{order::ASCENDING}, + std::vector{null_order::BEFORE}, + stream, + mr) + ->release(); + std::unique_ptr keys_column(std::move(sorted_keys.front())); // next, concatenate the indices std::vector indices_views(columns.size()); diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index 72f6e034479..c1fb1fa2180 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -120,16 +121,17 @@ std::unique_ptr set_keys( auto keys = dictionary_column.keys(); CUDF_EXPECTS(keys.type() == new_keys.type(), "keys types must match"); - // copy the keys -- use drop_duplicates to make sure they are sorted and unique - auto table_keys = cudf::detail::drop_duplicates(table_view{{new_keys}}, - std::vector{0}, - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::BEFORE, - stream, - mr) - ->release(); - std::unique_ptr keys_column(std::move(table_keys.front())); + // copy the keys -- use unordered_drop_duplicates to make sure they are unique, then + // sort the results. + auto unique_keys = cudf::detail::unordered_drop_duplicates( + table_view{{new_keys}}, std::vector{0}, null_equality::EQUAL, stream, mr); + auto sorted_keys = cudf::detail::sort(unique_keys->view(), + std::vector{order::ASCENDING}, + std::vector{null_order::BEFORE}, + stream, + mr) + ->release(); + std::unique_ptr keys_column(std::move(sorted_keys.front())); // compute the new nulls auto matches = cudf::detail::contains(keys, keys_column->view(), stream, mr); diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 6f9149a47e2..234eaf51f96 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -93,7 +93,7 @@ struct reduce_dispatch_functor { case aggregation::NUNIQUE: { auto nunique_agg = dynamic_cast(agg.get()); return make_fixed_width_scalar( - detail::distinct_count( + detail::unordered_distinct_count( col, nunique_agg->_null_handling, nan_policy::NAN_IS_VALID, stream), stream, mr); diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 5c695f8a16f..2c7488084b5 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,18 @@ * limitations under the License. */ +#include "stream_compaction_common.cuh" +#include "stream_compaction_common.hpp" + #include #include #include +#include +#include #include #include #include #include -#include #include #include @@ -30,39 +34,19 @@ #include #include +#include + +#include +#include +#include +#include #include namespace cudf { namespace detail { - -cudf::size_type distinct_count(table_view const& keys, - null_equality nulls_equal, - rmm::cuda_stream_view stream) -{ - // sort only indices - auto sorted_indices = sorted_order(keys, - std::vector{}, - std::vector{}, - stream, - rmm::mr::get_current_device_resource()); - - // count unique elements - auto sorted_row_index = sorted_indices->view().data(); - auto device_input_table = cudf::table_device_view::create(keys, stream); - - row_equality_comparator comp( - nullate::DYNAMIC{cudf::has_nulls(keys)}, *device_input_table, *device_input_table, nulls_equal); - return thrust::count_if( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(keys.num_rows()), - [sorted_row_index, comp] __device__(cudf::size_type i) { - return (i == 0 || not comp(sorted_row_index[i], 
sorted_row_index[i - 1])); - }); -} - +namespace { /** - * @brief Functor to check for `NAN` at an index in a `column_device_view`. + * @brief Functor to check for `NaN` at an index in a `column_device_view`. * * @tparam T The type of `column_device_view` */ @@ -76,97 +60,199 @@ struct check_for_nan { check_for_nan(cudf::column_device_view input) : _input{input} {} /** - * @brief Operator to be called to check for `NAN` at `index` in `_input` + * @brief Operator to be called to check for `NaN` at `index` in `_input` * - * @param[in] index The index at which the `NAN` needs to be checked in `input` + * @param[in] index The index at which the `NaN` needs to be checked in `input` * - * @returns bool true if value at `index` is `NAN` and not null, else false + * @returns bool true if value at `index` is `NaN` and not null, else false */ - __device__ bool operator()(size_type index) + __device__ bool operator()(size_type index) const noexcept { return std::isnan(_input.data()[index]) and _input.is_valid(index); } - protected: cudf::column_device_view _input; }; /** * @brief A structure to be used along with type_dispatcher to check if a - * `column_view` has `NAN`. + * `column_view` has `NaN`. */ struct has_nans { /** - * @brief Checks if `input` has `NAN` + * @brief Checks if `input` has `NaN` * * @note This will be applicable only for floating point type columns. * - * @param[in] input The `column_view` which will be checked for `NAN` + * @param[in] input The `column_view` which will be checked for `NaN` * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
* - * @returns bool true if `input` has `NAN` else false + * @returns bool true if `input` has `NaN` else false */ - template ::value>* = nullptr> + template >* = nullptr> bool operator()(column_view const& input, rmm::cuda_stream_view stream) { auto input_device_view = cudf::column_device_view::create(input, stream); auto device_view = *input_device_view; - auto count = thrust::count_if(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(input.size()), - check_for_nan(device_view)); - return count > 0; + return thrust::any_of(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + check_for_nan(device_view)); } /** - * @brief Checks if `input` has `NAN` + * @brief Checks if `input` has `NaN` * * @note This will be applicable only for non-floating point type columns. And - * non-floating point columns can never have `NAN`, so it will always return + * non-floating point columns can never have `NaN`, so it will always return * false * - * @param[in] input The `column_view` which will be checked for `NAN` + * @param[in] input The `column_view` which will be checked for `NaN` * @param[in] stream CUDA stream used for device memory operations and kernel launches. * - * @returns bool Always false as non-floating point columns can't have `NAN` + * @returns bool Always false as non-floating point columns can't have `NaN` */ - template ::value>* = nullptr> - bool operator()(column_view const& input, rmm::cuda_stream_view stream) + template >* = nullptr> + bool operator()(column_view const&, rmm::cuda_stream_view) + { + return false; + } +}; + +/** + * @brief A functor to be used along with device type_dispatcher to check if + * the row `index` of `column_device_view` is `NaN`. 
+ */ +struct check_nan { + // Check if it's `NaN` for floating point type columns + template >* = nullptr> + __device__ inline bool operator()(column_device_view const& input, size_type index) + { + return std::isnan(input.data()[index]); + } + // Non-floating point type columns can never have `NaN`, so it will always return false. + template >* = nullptr> + __device__ inline bool operator()(column_device_view const&, size_type) { return false; } }; +} // namespace + +cudf::size_type distinct_count(table_view const& keys, + null_equality nulls_equal, + rmm::cuda_stream_view stream) +{ + auto table_ptr = cudf::table_device_view::create(keys, stream); + row_equality_comparator comp( + nullate::DYNAMIC{cudf::has_nulls(keys)}, *table_ptr, *table_ptr, nulls_equal); + return thrust::count_if( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(keys.num_rows()), + [comp] __device__(cudf::size_type i) { return (i == 0 or not comp(i, i - 1)); }); +} + +cudf::size_type unordered_distinct_count(table_view const& keys, + null_equality nulls_equal, + rmm::cuda_stream_view stream) +{ + auto table_ptr = cudf::table_device_view::create(keys, stream); + auto const num_rows = table_ptr->num_rows(); + auto const has_null = nullate::DYNAMIC{cudf::has_nulls(keys)}; + + hash_map_type key_map{compute_hash_table_size(num_rows), + COMPACTION_EMPTY_KEY_SENTINEL, + COMPACTION_EMPTY_VALUE_SENTINEL, + detail::hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; + + compaction_hash hash_key{has_null, *table_ptr}; + row_equality_comparator row_equal(has_null, *table_ptr, *table_ptr, nulls_equal); + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(size_type i) { return cuco::make_pair(i, i); }); + + // when nulls are equal, insert non-null rows only to improve efficiency + if (nulls_equal == null_equality::EQUAL and has_null) { + thrust::counting_iterator stencil(0); + auto const [row_bitmask, null_count] = 
cudf::detail::bitmask_or(keys, stream); + row_validity pred{static_cast(row_bitmask.data())}; + + key_map.insert_if(iter, iter + num_rows, stencil, pred, hash_key, row_equal, stream.value()); + return key_map.get_size() + static_cast((null_count > 0) ? 1 : 0); + } + // otherwise, insert all + key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); + return key_map.get_size(); +} cudf::size_type distinct_count(column_view const& input, null_policy null_handling, nan_policy nan_handling, rmm::cuda_stream_view stream) { - if (0 == input.size() || input.null_count() == input.size()) { return 0; } - - cudf::size_type nrows = input.size(); - - bool has_nan = false; - // Check for Nans - // Checking for nulls in input and flag nan_handling, as the count will - // only get affected if these two conditions are true. NAN will only be - // be an extra if nan_handling was NAN_IS_NULL and input also had null, which - // will increase the count by 1. - if (input.has_nulls() and nan_handling == nan_policy::NAN_IS_NULL) { - has_nan = cudf::type_dispatcher(input.type(), has_nans{}, input, stream); - } + auto const num_rows = input.size(); - auto count = detail::distinct_count(table_view{{input}}, null_equality::EQUAL, stream); + if (num_rows == 0 or num_rows == input.null_count()) { return 0; } - // if nan is considered null and there are already null values - if (nan_handling == nan_policy::NAN_IS_NULL and has_nan and input.has_nulls()) --count; + auto const count_nulls = null_handling == null_policy::INCLUDE; + auto const nan_is_null = nan_handling == nan_policy::NAN_IS_NULL; + auto const should_check_nan = cudf::is_floating_point(input.type()); + auto input_device_view = cudf::column_device_view::create(input, stream); + auto device_view = *input_device_view; + auto input_table_view = table_view{{input}}; + auto table_ptr = cudf::table_device_view::create(input_table_view, stream); + row_equality_comparator 
comp(nullate::DYNAMIC{cudf::has_nulls(input_table_view)}, + *table_ptr, + *table_ptr, + null_equality::EQUAL); - if (null_handling == null_policy::EXCLUDE and input.has_nulls()) - return --count; - else - return count; + return thrust::count_if( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(num_rows), + [count_nulls, nan_is_null, should_check_nan, device_view, comp] __device__(cudf::size_type i) { + auto const is_null = device_view.is_null(i); + auto const is_nan = nan_is_null and should_check_nan and + cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i); + if (not count_nulls and (is_null or (nan_is_null and is_nan))) { return false; } + if (i == 0) { return true; } + if (count_nulls and nan_is_null and (is_nan or is_null)) { + auto const prev_is_nan = + should_check_nan and + cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i - 1); + return not(prev_is_nan or device_view.is_null(i - 1)); + } + return not comp(i, i - 1); + }); } +cudf::size_type unordered_distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling, + rmm::cuda_stream_view stream) +{ + if (0 == input.size() or input.null_count() == input.size()) { return 0; } + + auto count = detail::unordered_distinct_count(table_view{{input}}, null_equality::EQUAL, stream); + + // Check for nulls. If the null policy is EXCLUDE and null values were found, + // we decrement the count. + auto const has_null = input.has_nulls(); + if (null_handling == null_policy::EXCLUDE and has_null) { --count; } + + // Check for NaNs. There are two cases that can lead to decrementing the + // count. The first case is when the input has no nulls, but has NaN values + // handled as a null via NAN_IS_NULL and has a policy to EXCLUDE null values + // from the count. The second case is when the input has null values and NaN + // values handled as nulls via NAN_IS_NULL. 
Regardless of whether the null + // policy is set to EXCLUDE, we decrement the count to avoid double-counting + // null and NaN as distinct entities. + auto const has_nan_as_null = (nan_handling == nan_policy::NAN_IS_NULL) and + cudf::type_dispatcher(input.type(), has_nans{}, input, stream); + if (has_nan_as_null and (has_null or null_handling == null_policy::EXCLUDE)) { --count; } + return count; +} } // namespace detail cudf::size_type distinct_count(column_view const& input, @@ -183,4 +269,18 @@ cudf::size_type distinct_count(table_view const& input, null_equality nulls_equa return detail::distinct_count(input, nulls_equal); } +cudf::size_type unordered_distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling) +{ + CUDF_FUNC_RANGE(); + return detail::unordered_distinct_count(input, null_handling, nan_handling); +} + +cudf::size_type unordered_distinct_count(table_view const& input, null_equality nulls_equal) +{ + CUDF_FUNC_RANGE(); + return detail::unordered_distinct_count(input, nulls_equal); +} + } // namespace cudf diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index abc34663aee..2fd1f530b6d 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,13 +14,16 @@ * limitations under the License. 
*/ -#include +#include "drop_duplicates.cuh" +#include "stream_compaction_common.cuh" +#include "stream_compaction_common.hpp" #include #include #include #include #include +#include #include #include #include @@ -37,6 +40,7 @@ #include #include +#include #include namespace cudf { @@ -85,12 +89,12 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys, auto comp = row_equality_comparator( nullate::DYNAMIC{cudf::has_nulls(keys)}, *device_input_table, *device_input_table, nulls_equal); - auto result_end = unique_copy(sorted_indices->view().begin(), - sorted_indices->view().end(), - unique_indices.begin(), - comp, - keep, - stream); + auto result_end = cudf::detail::unique_copy(sorted_indices->view().begin(), + sorted_indices->view().end(), + unique_indices.begin(), + comp, + keep, + stream); return cudf::detail::slice(column_view(unique_indices), 0, @@ -106,7 +110,7 @@ std::unique_ptr
drop_duplicates(table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (0 == input.num_rows() || 0 == input.num_columns() || 0 == keys.size()) { + if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) { return empty_like(input); } @@ -130,6 +134,62 @@ std::unique_ptr
drop_duplicates(table_view const& input, mr); } +std::unique_ptr
unordered_drop_duplicates(table_view const& input, + std::vector const& keys, + null_equality nulls_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) { + return empty_like(input); + } + + auto keys_view = input.select(keys); + auto table_ptr = cudf::table_device_view::create(keys_view, stream); + auto has_null = nullate::DYNAMIC{cudf::has_nulls(keys_view)}; + auto const num_rows{table_ptr->num_rows()}; + + hash_map_type key_map{compute_hash_table_size(num_rows), + COMPACTION_EMPTY_KEY_SENTINEL, + COMPACTION_EMPTY_VALUE_SENTINEL, + detail::hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; + + compaction_hash hash_key{has_null, *table_ptr}; + row_equality_comparator row_equal(has_null, *table_ptr, *table_ptr, nulls_equal); + + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(size_type i) { return cuco::make_pair(i, i); }); + // insert unique indices into the map. + key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); + + auto counting_iter = thrust::make_counting_iterator(0); + rmm::device_uvector index_exists_in_map(num_rows, stream, mr); + // enumerate all indices to check if they are present in the map. 
+ key_map.contains(counting_iter, counting_iter + num_rows, index_exists_in_map.begin(), hash_key); + + auto const output_size{key_map.get_size()}; + + // write unique indices to a numeric column + auto unique_indices = cudf::make_numeric_column( + data_type{type_id::INT32}, output_size, mask_state::UNALLOCATED, stream, mr); + auto mutable_view = mutable_column_device_view::create(*unique_indices, stream); + thrust::copy_if(rmm::exec_policy(stream), + counting_iter, + counting_iter + num_rows, + index_exists_in_map.begin(), + mutable_view->begin(), + thrust::identity{}); + + // run gather operation to establish new order + return detail::gather(input, + unique_indices->view(), + out_of_bounds_policy::DONT_CHECK, + detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); +} + } // namespace detail std::unique_ptr
drop_duplicates(table_view const& input, @@ -144,4 +204,13 @@ std::unique_ptr
drop_duplicates(table_view const& input, input, keys, keep, nulls_equal, null_precedence, rmm::cuda_stream_default, mr); } +std::unique_ptr
unordered_drop_duplicates(table_view const& input, + std::vector const& keys, + null_equality nulls_equal, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::unordered_drop_duplicates(input, keys, nulls_equal, rmm::cuda_stream_default, mr); +} + } // namespace cudf diff --git a/cpp/src/stream_compaction/stream_compaction_common.cuh b/cpp/src/stream_compaction/stream_compaction_common.cuh new file mode 100644 index 00000000000..8ba9223a1bc --- /dev/null +++ b/cpp/src/stream_compaction/stream_compaction_common.cuh @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "stream_compaction_common.hpp" + +namespace cudf { +namespace detail { + +/** + * @brief Device callable to hash a given row. + */ +template +class compaction_hash { + public: + compaction_hash(Nullate has_nulls, table_device_view t) : _hash{has_nulls, t} {} + + __device__ inline auto operator()(size_type i) const noexcept + { + auto hash = _hash(i); + return (hash == COMPACTION_EMPTY_KEY_SENTINEL) ? (hash - 1) : hash; + } + + private: + row_hash _hash; +}; + +/** + * @brief Device functor to determine if a row is valid. 
+ */ +class row_validity { + public: + row_validity(bitmask_type const* row_bitmask) : _row_bitmask{row_bitmask} {} + + __device__ inline bool operator()(const size_type& i) const noexcept + { + return cudf::bit_is_set(_row_bitmask, i); + } + + private: + bitmask_type const* _row_bitmask; +}; + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/stream_compaction/stream_compaction_common.hpp b/cpp/src/stream_compaction/stream_compaction_common.hpp new file mode 100644 index 00000000000..1d743eccdbe --- /dev/null +++ b/cpp/src/stream_compaction/stream_compaction_common.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include + +#include +#include + +#include + +#include + +#include + +namespace cudf { +namespace detail { + +constexpr auto COMPACTION_EMPTY_KEY_SENTINEL = std::numeric_limits::max(); +constexpr auto COMPACTION_EMPTY_VALUE_SENTINEL = std::numeric_limits::min(); + +using hash_type = cuco::detail::MurmurHash3_32; + +using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; + +using hash_map_type = + cuco::static_map; + +using row_hash = cudf::row_hasher; + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index dadeaf7d1e0..405c83ab872 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,7 +30,10 @@ #include #include +#include #include +#include +#include namespace cudf { namespace detail { @@ -38,29 +41,23 @@ namespace detail { std::pair, std::unique_ptr> encode( table_view const& input_table, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - std::vector drop_keys(input_table.num_columns()); + auto const num_cols = input_table.num_columns(); + + std::vector drop_keys(num_cols); std::iota(drop_keys.begin(), drop_keys.end(), 0); - // side effects of this function we are now dependent on: - // - resulting column elements are sorted ascending - // - nulls are sorted to the beginning - auto keys_table = cudf::detail::drop_duplicates(input_table, - drop_keys, - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::AFTER, - stream, - mr); + auto unique_keys = cudf::detail::unordered_drop_duplicates( + input_table, drop_keys, null_equality::EQUAL, stream, mr); + + std::vector column_order(num_cols, order::ASCENDING); + std::vector 
null_precedence(num_cols, null_order::AFTER); + auto sorted_unique_keys = + cudf::detail::sort(unique_keys->view(), column_order, null_precedence, stream, mr); - auto indices_column = - cudf::detail::lower_bound(keys_table->view(), - input_table, - std::vector(input_table.num_columns(), order::ASCENDING), - std::vector(input_table.num_columns(), null_order::AFTER), - stream, - mr); + auto indices_column = cudf::detail::lower_bound( + sorted_unique_keys->view(), input_table, column_order, null_precedence, stream, mr); - return std::make_pair(std::move(keys_table), std::move(indices_column)); + return std::make_pair(std::move(sorted_unique_keys), std::move(indices_column)); } } // namespace detail diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e23403e68e4..6b5670630ec 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -292,6 +292,7 @@ ConfigureTest( ConfigureTest( STREAM_COMPACTION_TEST stream_compaction/apply_boolean_mask_tests.cpp + stream_compaction/distinct_count_tests.cpp stream_compaction/drop_nulls_tests.cpp stream_compaction/drop_nans_tests.cpp stream_compaction/drop_duplicates_tests.cpp diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp new file mode 100644 index 00000000000..78b52db5255 --- /dev/null +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +using cudf::nan_policy; +using cudf::null_equality; +using cudf::null_policy; + +constexpr int32_t XXX{70}; // Mark for null elements +constexpr int32_t YYY{3}; // Mark for null elements + +template +struct DistinctCountCommon : public cudf::test::BaseFixture { +}; + +TYPED_TEST_SUITE(DistinctCountCommon, cudf::test::NumericTypes); + +TYPED_TEST(DistinctCountCommon, NoNull) +{ + using T = TypeParam; + + auto const input = cudf::test::make_type_param_vector( + {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); + + // explicit instantiation to one particular type (`double`) to reduce build time + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + // explicit instantiation to one particular type (`double`) to reduce build time + std::vector input_data(input.begin(), input.end()); + auto const new_end = std::unique(input_data.begin(), input_data.end()); + auto const gold_ordered = std::distance(input_data.begin(), new_end); + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TYPED_TEST(DistinctCountCommon, TableNoNull) +{ + using T = TypeParam; + + auto const input1 = 
cudf::test::make_type_param_vector( + {1, 3, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); + auto const input2 = cudf::test::make_type_param_vector( + {3, 3, 3, 4, 31, 1, 8, 5, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4, 1}); + + std::vector> pair_input; + std::transform( + input1.begin(), input1.end(), input2.begin(), std::back_inserter(pair_input), [](T a, T b) { + return std::make_pair(a, b); + }); + + cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); + cudf::test::fixed_width_column_wrapper input_col2(input2.begin(), input2.end()); + cudf::table_view input_table({input_col1, input_col2}); + + auto const expected = static_cast( + std::set>(pair_input.begin(), pair_input.end()).size()); + EXPECT_EQ(expected, cudf::unordered_distinct_count(input_table, null_equality::EQUAL)); + + auto const new_end = std::unique(pair_input.begin(), pair_input.end()); + auto const gold_ordered = std::distance(pair_input.begin(), new_end); + EXPECT_EQ(gold_ordered, cudf::distinct_count(input_table, null_equality::EQUAL)); +} + +struct DistinctCount : public cudf::test::BaseFixture { +}; + +TEST_F(DistinctCount, WithNull) +{ + using T = int32_t; + + std::vector input = {1, 3, 3, XXX, 31, 1, 8, 2, 0, XXX, XXX, + XXX, 10, 40, 31, 42, 0, 42, 8, 5, XXX}; + std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); + + // explicit instantiation to one particular type (`double`) to reduce build time + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + auto const new_end = std::unique(input.begin(), input.end()); + auto const gold_ordered = std::distance(input.begin(), new_end) - 3; + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::EXCLUDE, 
nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, IgnoringNull) +{ + using T = int32_t; + + std::vector input = {1, YYY, YYY, XXX, 31, 1, 8, 2, 0, XXX, 1, + XXX, 10, 40, 31, 42, 0, 42, 8, 5, XXX}; + std::vector valid = {1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); + + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); + // Removing 2 from expected to remove count for `XXX` and `YYY` + EXPECT_EQ( + expected - 2, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); + + auto const new_end = std::unique(input.begin(), input.end()); + // -1 since `YYY, YYY, XXX` is in the same group of equivalent rows + auto const gold_ordered = std::distance(input.begin(), new_end) - 1; + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, WithNansAndNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, XXX, 31, 1, 8, 2, 0, XXX, 1, + XXX, 10, 40, 31, NAN, 0, NAN, 8, 5, XXX}; + std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); + EXPECT_EQ( + expected + 1, // +1 since `NAN` is not in std::set + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + auto const new_end = std::unique(input.begin(), input.end()); + auto const gold_ordered = std::distance(input.begin(), new_end); + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + input = {NAN, NAN, XXX}; + valid = {1, 1, 0}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto 
expected_all_nan = 2; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, WithNansOnly) +{ + using T = float; + + std::vector input = {1, 3, NAN, 70, 31}; + std::vector valid = {1, 1, 1, 1, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 5; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + input = {NAN, NAN, NAN}; + valid = {1, 1, 1}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 1; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, NansAsNullWithNoNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, 70, 31}; + std::vector valid = {1, 1, 1, 1, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 5; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + + input = {NAN, NAN, NAN}; + valid = {1, 1, 1}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 1; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, 
nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, NansAsNullWithNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, XXX, 31}; + std::vector valid = {1, 1, 1, 0, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 4; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + + input = {NAN, NAN, XXX}; + valid = {1, 1, 0}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_null = 1; + EXPECT_EQ( + expected_all_null, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected_all_null, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, NansAsNullWithIgnoreNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, XXX, 31}; + std::vector valid = {1, 1, 1, 0, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 3; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + + input = {NAN, NAN, NAN}; + valid = {1, 1, 1}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 0; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::EXCLUDE, 
nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, EmptyColumn) +{ + using T = float; + + cudf::test::fixed_width_column_wrapper input_col{}; + + constexpr auto expected = 0; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, StringColumnWithNull) +{ + cudf::test::strings_column_wrapper input_col{ + {"", "this", "is", "this", "This", "a", "column", "of", "the", "strings"}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1}}; + + cudf::size_type const expected = + (std::vector{"", "this", "is", "This", "a", "column", "of", "strings"}).size(); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, TableWithNull) +{ + cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, + {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; + cudf::table_view input{{col1, col2}}; + + EXPECT_EQ(8, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} + +TEST_F(DistinctCount, EmptyColumnedTable) +{ + std::vector cols{}; + + cudf::table_view input(cols); + + EXPECT_EQ(0, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(0, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} + +TEST_F(DistinctCount, TableMixedTypes) +{ + cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, + {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; + cudf::test::fixed_width_column_wrapper col3{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, + {1, 1, 1, 0, 1, 1, 1, 
1, 0, 1, 0}}; + cudf::table_view input{{col1, col2, col3}}; + + EXPECT_EQ(9, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} + +TEST_F(DistinctCount, TableWithStringColumnWithNull) +{ + cudf::test::fixed_width_column_wrapper col1{{0, 9, 8, 9, 6, 5, 4, 3, 2, 1, 0}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + cudf::test::strings_column_wrapper col2{ + {"", "this", "is", "this", "this", "a", "column", "of", "the", "strings", ""}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + + cudf::table_view input{{col1, col2}}; + EXPECT_EQ(9, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} diff --git a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp index 916d2a33b97..d49b8208094 100644 --- a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp +++ b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,254 +14,98 @@ * limitations under the License. 
*/ -#include -#include -#include #include +#include #include #include #include #include + #include #include #include #include #include +#include +#include + using cudf::nan_policy; using cudf::null_equality; using cudf::null_policy; -template -struct DistinctCountCommon : public cudf::test::BaseFixture { -}; - -TYPED_TEST_SUITE(DistinctCountCommon, cudf::test::NumericTypes); - -TYPED_TEST(DistinctCountCommon, NoNull) -{ - using T = TypeParam; - - auto const input = cudf::test::make_type_param_vector( - {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); - - cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); - - cudf::size_type expected = std::set(input.begin(), input.end()).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} - -TYPED_TEST(DistinctCountCommon, TableNoNull) -{ - using T = TypeParam; - - auto const input1 = cudf::test::make_type_param_vector( - {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); - auto const input2 = cudf::test::make_type_param_vector( - {3, 3, 4, 31, 1, 8, 5, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4, 1}); - std::vector> pair_input; - std::transform( - input1.begin(), input1.end(), input2.begin(), std::back_inserter(pair_input), [](T a, T b) { - return std::make_pair(a, b); - }); - - cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); - cudf::test::fixed_width_column_wrapper input_col2(input2.begin(), input2.end()); - - std::vector cols{input_col1, input_col2}; - cudf::table_view input_table(cols); - - cudf::size_type expected = std::set>(pair_input.begin(), pair_input.end()).size(); - EXPECT_EQ(expected, cudf::distinct_count(input_table, null_equality::EQUAL)); -} - -struct DistinctCount : public cudf::test::BaseFixture { +struct DropDuplicatesCommon : public cudf::test::BaseFixture { }; -TEST_F(DistinctCount, WithNull) -{ - using T = int32_t; - - // Considering 70 as null 
- std::vector input = {1, 3, 3, 70, 31, 1, 8, 2, 0, 70, 1, 70, 10, 40, 31, 42, 0, 42, 8, 5, 70}; - std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; - - cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); - - cudf::size_type expected = std::set(input.begin(), input.end()).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} - -TEST_F(DistinctCount, IgnoringNull) -{ - using T = int32_t; - - // Considering 70 and 3 as null - std::vector input = {1, 3, 3, 70, 31, 1, 8, 2, 0, 70, 1, 70, 10, 40, 31, 42, 0, 42, 8, 5, 70}; - std::vector valid = {1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; - - cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); - - cudf::size_type expected = std::set(input.begin(), input.end()).size(); - // Removing 2 from expected to remove count for 70 and 3 - EXPECT_EQ(expected - 2, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); -} - -TEST_F(DistinctCount, WithNansAndNull) +TEST_F(DropDuplicatesCommon, StringKeyColumn) { - using T = float; - - std::vector input = {1, 3, NAN, 70, 31, 1, 8, 2, 0, 70, 1, - 70, 10, 40, 31, NAN, 0, NAN, 8, 5, 70}; - std::vector valid = {1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; - - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; - - cudf::size_type expected = std::set(input.begin(), input.end()).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} - -TEST_F(DistinctCount, WithNansOnly) -{ - using T = float; - - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 1, 1}; - - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; - - cudf::size_type expected = 5; - EXPECT_EQ(expected, - 
cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} - -TEST_F(DistinctCount, NansAsNullWithNoNull) -{ - using T = float; - - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 1, 1}; - - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; - - cudf::size_type expected = 5; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); -} + cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 5, 5, 8, 1}, {1, 0, 0, 1, 1, 1, 1}}; + cudf::test::strings_column_wrapper key_col{{"all", "new", "new", "all", "new", "the", "strings"}, + {1, 1, 1, 1, 0, 1, 1}}; + cudf::table_view input{{col, key_col}}; + std::vector keys{1}; -TEST_F(DistinctCount, NansAsNullWithNull) -{ - using T = float; + cudf::test::fixed_width_column_wrapper exp_sort_col{{5, 5, 4, 1, 8}, {1, 1, 0, 1, 1}}; + cudf::test::strings_column_wrapper exp_sort_key_col{{"new", "all", "new", "strings", "the"}, + {0, 1, 1, 1, 1}}; + cudf::table_view expected_sort{{exp_sort_col, exp_sort_key_col}}; - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 0, 1}; + auto got_sort = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, got_sort->view()); - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + auto got_unordered = unordered_drop_duplicates(input, keys); + auto key_view = got_unordered->select(keys.begin(), keys.end()); + auto sorted_result = cudf::sort_by_key(got_unordered->view(), key_view); - cudf::size_type expected = 4; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, sorted_result->view()); } -TEST_F(DistinctCount, NansAsNullWithIgnoreNull) +TEST_F(DropDuplicatesCommon, EmptyInputTable) { - using T = float; - - std::vector input = {1, 3, NAN, 70, 
31}; - std::vector valid = {1, 1, 1, 0, 1}; + cudf::test::fixed_width_column_wrapper col(std::initializer_list{}); + cudf::table_view input{{col}}; + std::vector keys{1, 2}; - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + auto got = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); - cudf::size_type expected = 3; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_unordered->view()); } -TEST_F(DistinctCount, EmptyColumn) +TEST_F(DropDuplicatesCommon, NoColumnInputTable) { - using T = float; - - cudf::test::fixed_width_column_wrapper input_col{}; - - cudf::size_type expected = 0; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); -} + cudf::table_view input{std::vector()}; + std::vector keys{1, 2}; -TEST_F(DistinctCount, StringColumnWithNull) -{ - cudf::test::strings_column_wrapper input_col{ - {"", "this", "is", "this", "This", "a", "column", "of", "the", "strings"}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1}}; - - cudf::size_type expected = - (std::vector{"", "this", "is", "This", "a", "column", "of", "strings"}).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); -} + auto got = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); -TEST_F(DistinctCount, TableWithNull) -{ - cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, - {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; - cudf::table_view input{{col1, 
col2}}; - - EXPECT_EQ(8, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL)); + auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_unordered->view()); } -TEST_F(DistinctCount, EmptyColumnedTable) +TEST_F(DropDuplicatesCommon, EmptyKeys) { - std::vector cols{}; - - cudf::table_view input(cols); - - EXPECT_EQ(0, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(0, cudf::distinct_count(input, null_equality::UNEQUAL)); - EXPECT_EQ(0, cudf::distinct_count(cudf::table_view{}, null_equality::EQUAL)); - EXPECT_EQ(0, cudf::distinct_count(cudf::table_view{}, null_equality::UNEQUAL)); -} + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper empty_col{}; + cudf::table_view input{{col}}; + std::vector keys{}; -TEST_F(DistinctCount, TableMixedTypes) -{ - cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, - {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; - cudf::test::fixed_width_column_wrapper col3{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, - {1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0}}; - cudf::table_view input{{col1, col2, col3}}; - - EXPECT_EQ(9, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL)); -} + auto got = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); -TEST_F(DistinctCount, TableWithStringColumnWithNull) -{ - cudf::test::fixed_width_column_wrapper col1{{0, 9, 8, 9, 6, 5, 4, 3, 2, 1, 0}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - cudf::test::strings_column_wrapper col2{ - {"", "this", "is", "this", "this", "a", "column", "of", "the", 
"strings", ""}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - - cudf::table_view input{{col1, col2}}; - EXPECT_EQ(9, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL)); + auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got_unordered->view()); } -struct DropDuplicate : public cudf::test::BaseFixture { +struct DropDuplicates : public cudf::test::BaseFixture { }; -TEST_F(DropDuplicate, NonNullTable) +TEST_F(DropDuplicates, NonNullTable) { cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 5}}; cudf::test::fixed_width_column_wrapper col2{{4, 5, 3, 4, 9, 4}}; @@ -271,7 +115,7 @@ TEST_F(DropDuplicate, NonNullTable) cudf::table_view input{{col1, col2, col1_key, col2_key}}; std::vector keys{2, 3}; - // Keep first of duplicate + // Keep the first duplicate row // The expected table would be sorted in ascending order with respect to keys cudf::test::fixed_width_column_wrapper exp_col1_first{{5, 5, 5, 3, 8}}; cudf::test::fixed_width_column_wrapper exp_col2_first{{4, 4, 4, 3, 9}}; @@ -284,7 +128,7 @@ TEST_F(DropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); - // keep last of duplicate + // Keep the last duplicate row cudf::test::fixed_width_column_wrapper exp_col1_last{{5, 5, 4, 3, 8}}; cudf::test::fixed_width_column_wrapper exp_col2_last{{4, 4, 5, 3, 9}}; cudf::test::fixed_width_column_wrapper exp_col1_key_last{{9, 19, 20, 20, 21}}; @@ -296,7 +140,7 @@ TEST_F(DropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); - // Keep unique + // Keep no duplicate rows cudf::test::fixed_width_column_wrapper exp_col1_unique{{5, 5, 3, 8}}; cudf::test::fixed_width_column_wrapper exp_col2_unique{{4, 4, 3, 9}}; cudf::test::fixed_width_column_wrapper exp_col1_key_unique{{9, 19, 20, 21}}; @@ -309,88 +153,145 @@ TEST_F(DropDuplicate, NonNullTable) 
CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); } -TEST_F(DropDuplicate, WithNull) +TEST_F(DropDuplicates, KeepFirstWithNull) { - cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, + {1, 1, 0, 0, 1, 1, 1}}; cudf::table_view input{{col, key}}; std::vector keys{1}; - // Keep first of duplicate - cudf::test::fixed_width_column_wrapper exp_col_first{{4, 5, 5, 8}, {0, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_first{{20, 19, 20, 21}, {0, 1, 1, 1}}; - cudf::table_view expected_first{{exp_col_first, exp_key_col_first}}; - auto got_first = + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_col_first_equal{{3, 5, 5, 8}, {1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_first_equal{{20, 19, 20, 21}, + {0, 1, 1, 1}}; + cudf::table_view expected_first_equal{{exp_col_first_equal, exp_key_col_first_equal}}; + auto got_first_equal = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first_equal, got_first_equal->view()); - // Keep last of duplicate - cudf::test::fixed_width_column_wrapper exp_col_last{{3, 1, 5, 8}, {1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_last{{20, 19, 20, 21}, {0, 1, 1, 1}}; - cudf::table_view expected_last{{exp_col_last, exp_key_col_last}}; - auto got_last = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + // nulls are unequal + cudf::test::fixed_width_column_wrapper exp_col_first_unequal{{3, 2, 5, 5, 8}, + {1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_first_unequal{{20, 20, 19, 
20, 21}, + {0, 0, 1, 1, 1}}; + cudf::table_view expected_first_unequal{{exp_col_first_unequal, exp_key_col_first_unequal}}; + auto got_first_unequal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::UNEQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); - - // Keep unique of duplicate - cudf::test::fixed_width_column_wrapper exp_col_unique{{5, 8}, {1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_unique{{20, 21}, {1, 1}}; - cudf::table_view expected_unique{{exp_col_unique, exp_key_col_unique}}; - auto got_unique = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first_unequal, got_first_unequal->view()); } -TEST_F(DropDuplicate, StringKeyColumn) +TEST_F(DropDuplicates, KeepLastWithNull) { - cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::strings_column_wrapper key_col{{"all", "new", "all", "new", "the", "strings"}, - {1, 1, 1, 0, 1, 1}}; - cudf::table_view input{{col, key_col}}; + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, + {1, 1, 0, 0, 1, 1, 1}}; + cudf::table_view input{{col, key}}; std::vector keys{1}; - cudf::test::fixed_width_column_wrapper exp_col_last{{5, 3, 4, 1, 8}, {1, 1, 0, 1, 1}}; - cudf::test::strings_column_wrapper exp_key_col_last{{"new", "all", "new", "strings", "the"}, - {0, 1, 1, 1, 1}}; - cudf::table_view expected_last{{exp_col_last, exp_key_col_last}}; - auto got_last = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_col_last_equal{{2, 1, 4, 8}, {1, 1, 0, 1}}; + cudf::test::fixed_width_column_wrapper 
exp_key_col_last_equal{{20, 19, 20, 21}, + {0, 1, 1, 1}}; + cudf::table_view expected_last_equal{{exp_col_last_equal, exp_key_col_last_equal}}; + auto got_last_equal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::EQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last_equal, got_last_equal->view()); + + // nulls are unequal + cudf::test::fixed_width_column_wrapper exp_col_last_unequal{{3, 2, 1, 4, 8}, + {1, 1, 1, 0, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_last_unequal{{20, 20, 19, 20, 21}, + {0, 0, 1, 1, 1}}; + cudf::table_view expected_last_unequal{{exp_col_last_unequal, exp_key_col_last_unequal}}; + auto got_last_unequal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::UNEQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last_unequal, got_last_unequal->view()); } -TEST_F(DropDuplicate, EmptyInputTable) +TEST_F(DropDuplicates, KeepNoneWithNull) { - cudf::test::fixed_width_column_wrapper col(std::initializer_list{}); - cudf::table_view input{{col}}; - std::vector keys{1, 2}; + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, + {1, 1, 0, 0, 1, 1, 1}}; + cudf::table_view input{{col, key}}; + std::vector keys{1}; - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_col_unique_equal{{8}, {1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_unique_equal{{21}, {1}}; + cudf::table_view expected_unique_equal{{exp_col_unique_equal, exp_key_col_unique_equal}}; + auto got_unique_equal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE, null_equality::EQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique_equal, got_unique_equal->view()); + + // nulls are unequal + 
cudf::test::fixed_width_column_wrapper exp_col_unique_unequal{{3, 2, 8}, {1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_unique_unequal{{20, 20, 21}, + {0, 0, 1}}; + cudf::table_view expected_unique_unequal{{exp_col_unique_unequal, exp_key_col_unique_unequal}}; + auto got_unique_unequal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE, null_equality::UNEQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique_unequal, got_unique_unequal->view()); } -TEST_F(DropDuplicate, NoColumnInputTable) +struct UnorderedDropDuplicates : public cudf::test::BaseFixture { +}; + +TEST_F(UnorderedDropDuplicates, NonNullTable) { - cudf::table_view input{std::vector()}; - std::vector keys{1, 2}; + cudf::test::fixed_width_column_wrapper col1{{6, 6, 3, 5, 8, 5}}; + cudf::test::fixed_width_column_wrapper col2{{6, 6, 3, 4, 9, 4}}; + cudf::test::fixed_width_column_wrapper col1_key{{20, 20, 20, 19, 21, 9}}; + cudf::test::fixed_width_column_wrapper col2_key{{19, 19, 20, 20, 9, 21}}; - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + cudf::table_view input{{col1, col2, col1_key, col2_key}}; + std::vector keys{2, 3}; - CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); + // The expected table would be sorted in ascending order with respect to keys + cudf::test::fixed_width_column_wrapper exp_col1{{5, 5, 6, 3, 8}}; + cudf::test::fixed_width_column_wrapper exp_col2{{4, 4, 6, 3, 9}}; + cudf::test::fixed_width_column_wrapper exp_col1_key{{9, 19, 20, 20, 21}}; + cudf::test::fixed_width_column_wrapper exp_col2_key{{21, 20, 19, 20, 9}}; + cudf::table_view expected{{exp_col1, exp_col2, exp_col1_key, exp_col2_key}}; + + auto result = unordered_drop_duplicates(input, keys); + auto key_view = result->select(keys.begin(), keys.end()); + auto sorted_result = cudf::sort_by_key(result->view(), key_view); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, sorted_result->view()); } -TEST_F(DropDuplicate, EmptyKeys) 
+TEST_F(UnorderedDropDuplicates, WithNull) { - cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper empty_col{}; - cudf::table_view input{{col}}; - std::vector keys{}; - - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 1, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; + cudf::table_view input{{col, key}}; + std::vector keys{1}; - CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_equal_col{{4, 1, 5, 8}, {0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_equal_key_col{{20, 19, 20, 21}, {0, 1, 1, 1}}; + cudf::table_view expected_equal{{exp_equal_col, exp_equal_key_col}}; + auto res_equal = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + auto equal_keys = res_equal->select(keys.begin(), keys.end()); + auto sorted_equal = cudf::sort_by_key(res_equal->view(), equal_keys); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_equal, sorted_equal->view()); + + // nulls are unequal + cudf::test::fixed_width_column_wrapper exp_unequal_col{{4, 1, 4, 5, 8}, {0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_unequal_key_col{{20, 19, 20, 20, 21}, + {0, 1, 0, 1, 1}}; + cudf::table_view expected_unequal{{exp_unequal_col, exp_unequal_key_col}}; + auto res_unequal = unordered_drop_duplicates(input, keys, null_equality::UNEQUAL); + auto sorted_unequal = cudf::sort(res_unequal->view()); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unequal, sorted_unequal->view()); } diff --git a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd index 5b81d369ef5..897b61f8001 100644 --- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd 
@@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -33,11 +33,13 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" \ column_view boolean_mask ) except + - cdef unique_ptr[table] drop_duplicates(table_view source_table, - vector[size_type] keys, - duplicate_keep_option keep, - null_equality nulls_equal) except + - - cdef size_type distinct_count(column_view source_table, - null_policy null_handling, - nan_policy nan_handling) except + + cdef unique_ptr[table] drop_duplicates( + table_view source_table, + vector[size_type] keys, + duplicate_keep_option keep, + null_equality nulls_equal) except + + + cdef size_type unordered_distinct_count( + column_view source_table, + null_policy null_handling, + nan_policy nan_handling) except + diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 4330c565982..c4f885382f3 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
import pandas as pd @@ -11,10 +11,10 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.stream_compaction cimport ( apply_boolean_mask as cpp_apply_boolean_mask, - distinct_count as cpp_distinct_count, drop_duplicates as cpp_drop_duplicates, drop_nulls as cpp_drop_nulls, duplicate_keep_option, + unordered_distinct_count as cpp_unordered_distinct_count, ) from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -190,7 +190,7 @@ def distinct_count(Column source_column, ignore_nulls=True, nan_as_null=False): cdef column_view source_column_view = source_column.view() with nogil: - count = cpp_distinct_count( + count = cpp_unordered_distinct_count( source_column_view, cpp_null_handling, cpp_nan_handling