Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor stream compaction APIs #10370

Merged
merged 19 commits into from
Mar 12, 2022
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -400,10 +400,12 @@ add_library(
src/sort/stable_sort_column.cu
src/sort/stable_sort.cu
src/stream_compaction/apply_boolean_mask.cu
src/stream_compaction/distinct.cu
src/stream_compaction/distinct_count.cu
src/stream_compaction/drop_duplicates.cu
src/stream_compaction/drop_nans.cu
src/stream_compaction/drop_nulls.cu
src/stream_compaction/unique.cu
src/stream_compaction/unique_count.cu
src/strings/attributes.cu
src/strings/capitalize.cu
src/strings/case.cu
Expand Down
4 changes: 3 additions & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,9 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp

# ##################################################################################################
# * stream_compaction benchmark -------------------------------------------------------------------
ConfigureNVBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates.cpp)
ConfigureNVBench(
STREAM_COMPACTION_BENCH stream_compaction/distinct.cpp stream_compaction/unique.cpp
)

# ##################################################################################################
# * join benchmark --------------------------------------------------------------------------------
Expand Down
63 changes: 63 additions & 0 deletions cpp/benchmarks/stream_compaction/distinct.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/column/column_view.hpp>
#include <cudf/detail/stream_compaction.hpp>
#include <cudf/types.hpp>
#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <fixture/rmm_pool_raii.hpp>

#include <nvbench/nvbench.cuh>

#include <memory>
#include <random>

// Declare the strings nvbench associates with cudf::timestamp_ms, which is one of the
// element types listed in the `data_type` type axis of this benchmark.
NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms");

/**
 * @brief Benchmark `cudf::detail::distinct` over a four-column table keyed on column 0.
 *
 * The input is a single fixed-width column of `NumRows` rows whose values come from a
 * `UniformRandomGenerator<long>` constructed with bounds (0, 100); every row whose index is a
 * multiple of 100 is marked invalid. The same column view is repeated four times to form the
 * input table, and only column index 0 is passed as the key.
 *
 * @tparam Type Element type of the generated column.
 * @param state nvbench state; the "NumRows" int64 axis supplies the row count.
 */
template <typename Type>
void nvbench_distinct(nvbench::state& state, nvbench::type_list<Type>)
{
  cudf::rmm_pool_raii pool_guard;

  auto const row_count = state.get_int64("NumRows");

  // Validity iterator: false (invalid) exactly when the row index is a multiple of 100.
  auto validity_iter =
    cudf::detail::make_counting_transform_iterator(0, [](auto idx) { return (idx % 100) != 0; });

  // Value iterator: each dereference draws the next number from the generator.
  cudf::test::UniformRandomGenerator<long> generator(0, 100);
  auto value_iter = cudf::detail::make_counting_transform_iterator(
    0, [&generator](auto) { return generator.generate(); });

  cudf::test::fixed_width_column_wrapper<Type, long> wrapped(
    value_iter, value_iter + row_count, validity_iter);

  auto column = cudf::column_view(wrapped);
  auto table  = cudf::table_view({column, column, column, column});

  // Timed region: run distinct on the stream nvbench hands to this launch.
  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
    rmm::cuda_stream_view bench_stream{launch.get_stream()};
    auto output = cudf::detail::distinct(table, {0}, cudf::null_equality::EQUAL, bench_stream);
  });
}

// Element types swept by the "Type" axis of the distinct benchmark.
// No keep-option axis is declared here: cudf::detail::distinct takes no
// duplicate_keep_option argument, so such an alias would be unused in this file.
using data_type = nvbench::type_list<bool, int8_t, int32_t, int64_t, float, cudf::timestamp_ms>;
PointKernel marked this conversation as resolved.
Show resolved Hide resolved

// Register nvbench_distinct under the name "distinct": one type axis ("Type") over
// `data_type`, and an int64 axis "NumRows" with 10K, 100K, 1M and 10M rows.
NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type))
.set_name("distinct")
.set_type_axes_names({"Type"})
.add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000});
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,7 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms");

template <typename Type, cudf::duplicate_keep_option Keep>
void nvbench_drop_duplicates(nvbench::state& state,
nvbench::type_list<Type, nvbench::enum_type<Keep>>)
void nvbench_unique(nvbench::state& state, nvbench::type_list<Type, nvbench::enum_type<Keep>>)
{
if constexpr (not std::is_same_v<Type, int32_t> and
Keep != cudf::duplicate_keep_option::KEEP_FIRST) {
Expand All @@ -62,41 +61,17 @@ void nvbench_drop_duplicates(nvbench::state& state,
cudf::test::UniformRandomGenerator<long> rand_gen(0, 100);
auto elements = cudf::detail::make_counting_transform_iterator(
0, [&rand_gen](auto row) { return rand_gen.generate(); });
auto valids = cudf::detail::make_counting_transform_iterator(
0, [](auto i) { return i % 100 == 0 ? false : true; });
auto valids =
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 100 != 0; });
cudf::test::fixed_width_column_wrapper<Type, long> values(elements, elements + num_rows, valids);

auto input_column = cudf::column_view(values);
auto input_table = cudf::table_view({input_column, input_column, input_column, input_column});

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
rmm::cuda_stream_view stream_view{launch.get_stream()};
auto result = cudf::detail::drop_duplicates(
input_table, {0}, Keep, cudf::null_equality::EQUAL, cudf::null_order::BEFORE, stream_view);
});
}

template <typename Type>
void nvbench_unordered_drop_duplicates(nvbench::state& state, nvbench::type_list<Type>)
{
cudf::rmm_pool_raii pool_raii;

auto const num_rows = state.get_int64("NumRows");

cudf::test::UniformRandomGenerator<long> rand_gen(0, 100);
auto elements = cudf::detail::make_counting_transform_iterator(
0, [&rand_gen](auto row) { return rand_gen.generate(); });
auto valids = cudf::detail::make_counting_transform_iterator(
0, [](auto i) { return i % 100 == 0 ? false : true; });
cudf::test::fixed_width_column_wrapper<Type, long> values(elements, elements + num_rows, valids);

auto input_column = cudf::column_view(values);
auto input_table = cudf::table_view({input_column, input_column, input_column, input_column});

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
rmm::cuda_stream_view stream_view{launch.get_stream()};
auto result = cudf::detail::unordered_drop_duplicates(
input_table, {0}, cudf::null_equality::EQUAL, stream_view);
auto result =
cudf::detail::unique(input_table, {0}, Keep, cudf::null_equality::EQUAL, stream_view);
});
}

Expand All @@ -105,12 +80,7 @@ using keep_option = nvbench::enum_type_list<cudf::duplicate_keep_option::KEEP_FI
cudf::duplicate_keep_option::KEEP_LAST,
cudf::duplicate_keep_option::KEEP_NONE>;

NVBENCH_BENCH_TYPES(nvbench_drop_duplicates, NVBENCH_TYPE_AXES(data_type, keep_option))
.set_name("drop_duplicates")
NVBENCH_BENCH_TYPES(nvbench_unique, NVBENCH_TYPE_AXES(data_type, keep_option))
.set_name("unique")
.set_type_axes_names({"Type", "KeepOption"})
.add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000});

NVBENCH_BENCH_TYPES(nvbench_unordered_drop_duplicates, NVBENCH_TYPE_AXES(data_type))
.set_name("unordered_drop_duplicates")
.set_type_axes_names({"Type"})
.add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000});
45 changes: 22 additions & 23 deletions cpp/include/cudf/detail/stream_compaction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,68 +62,67 @@ std::unique_ptr<table> apply_boolean_mask(
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::drop_duplicates
* @copydoc cudf::unique
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<table> drop_duplicates(
std::unique_ptr<table> unique(
table_view const& input,
std::vector<size_type> const& keys,
duplicate_keep_option keep,
null_equality nulls_equal = null_equality::EQUAL,
null_order null_precedence = null_order::BEFORE,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::unordered_drop_duplicates
* @copydoc cudf::distinct
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<table> unordered_drop_duplicates(
std::unique_ptr<table> distinct(
table_view const& input,
std::vector<size_type> const& keys,
null_equality nulls_equal = null_equality::EQUAL,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy)
* @copydoc cudf::unique_count(column_view const&, null_policy, nan_policy)
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
cudf::size_type distinct_count(column_view const& input,
null_policy null_handling,
nan_policy nan_handling,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);
cudf::size_type unique_count(column_view const& input,
null_policy null_handling,
nan_policy nan_handling,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);

/**
* @copydoc cudf::distinct_count(table_view const&, null_equality)
* @copydoc cudf::unique_count(table_view const&, null_equality)
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
cudf::size_type distinct_count(table_view const& input,
null_equality nulls_equal = null_equality::EQUAL,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);
cudf::size_type unique_count(table_view const& input,
null_equality nulls_equal = null_equality::EQUAL,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);

/**
* @copydoc cudf::unordered_distinct_count(column_view const&, null_policy, nan_policy)
* @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy)
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
cudf::size_type unordered_distinct_count(column_view const& input,
null_policy null_handling,
nan_policy nan_handling,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);
cudf::size_type distinct_count(column_view const& input,
null_policy null_handling,
nan_policy nan_handling,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);

/**
* @copydoc cudf::unordered_distinct_count(table_view const&, null_equality)
* @copydoc cudf::distinct_count(table_view const&, null_equality)
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
cudf::size_type unordered_distinct_count(table_view const& input,
null_equality nulls_equal = null_equality::EQUAL,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);
cudf::size_type distinct_count(table_view const& input,
null_equality nulls_equal = null_equality::EQUAL,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);

} // namespace detail
} // namespace cudf
47 changes: 21 additions & 26 deletions cpp/include/cudf/stream_compaction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -214,13 +214,10 @@ enum class duplicate_keep_option {
};

/**
* @brief Create a new table without duplicate rows.
*
* The output table is sorted according to the lexicographic ordering of the data in the columns
* indexed by `keys`.
* @brief Create a new table with consecutive duplicate rows removed.
*
* Given an `input` table_view, each row is copied to output table if the corresponding
* row of `keys` columns is unique, where the definition of unique depends on the value of @p keep:
* Given an `input` table_view, one specific row from a group of equivalent elements is copied to
* output table depending on the value of @p keep:
* - KEEP_FIRST: only the first of a sequence of duplicate rows is copied
* - KEEP_LAST: only the last of a sequence of duplicate rows is copied
* - KEEP_NONE: no duplicate rows are copied
Expand All @@ -232,22 +229,20 @@ enum class duplicate_keep_option {
* @param[in] keep keep first row, last row, or no rows of the found duplicates
* @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not
* equal if null_equality::UNEQUAL
* @param[in] null_precedence flag to denote nulls should appear before or after non-null items
* @param[in] mr Device memory resource used to allocate the returned table's device
* memory
*
* @return Table with sorted unique rows as specified by `keep`.
* @return Table with unique rows from each sequence of equivalent rows as specified by `keep`.
*/
std::unique_ptr<table> drop_duplicates(
std::unique_ptr<table> unique(
table_view const& input,
std::vector<size_type> const& keys,
duplicate_keep_option keep,
null_equality nulls_equal = null_equality::EQUAL,
null_order null_precedence = null_order::BEFORE,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create a new table without duplicate rows with hash-based algorithms.
* @brief Create a new table without duplicate rows.
*
* Given an `input` table_view, each row is copied to output table if the corresponding
* row of `keys` columns is unique. If duplicate rows are present, it is unspecified which
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -264,7 +259,7 @@ std::unique_ptr<table> drop_duplicates(
*
* @return Table with distinct rows in an unspecified order.
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
*/
std::unique_ptr<table> unordered_drop_duplicates(
std::unique_ptr<table> distinct(
table_view const& input,
std::vector<size_type> const& keys,
null_equality nulls_equal = null_equality::EQUAL,
Expand All @@ -283,11 +278,11 @@ std::unique_ptr<table> unordered_drop_duplicates(
* @param[in] null_handling flag to include or ignore `null` while counting
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
* @param[in] nan_handling flag to consider `NaN==null` or not
*
* @return number of distinct consecutive groups in the column
* @return number of consecutive groups of equivalent rows in the column
*/
cudf::size_type distinct_count(column_view const& input,
null_policy null_handling,
nan_policy nan_handling);
cudf::size_type unique_count(column_view const& input,
null_policy null_handling,
nan_policy nan_handling);

/**
* @brief Count the number of consecutive groups of equivalent rows in a table.
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -296,10 +291,10 @@ cudf::size_type distinct_count(column_view const& input,
* @param[in] nulls_equal flag to denote if null elements should be considered equal.
* nulls are not equal if null_equality::UNEQUAL.
*
* @return number of distinct consecutive groups in the table
* @return number of consecutive groups of equivalent rows in the table
*/
cudf::size_type distinct_count(table_view const& input,
null_equality nulls_equal = null_equality::EQUAL);
cudf::size_type unique_count(table_view const& input,
null_equality nulls_equal = null_equality::EQUAL);

/**
* @brief Count the distinct elements in the column_view.
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -319,11 +314,11 @@ cudf::size_type distinct_count(table_view const& input,
* @param[in] null_handling flag to include or ignore `null` while counting
* @param[in] nan_handling flag to consider `NaN==null` or not
*
* @return number of unique elements
* @return number of distinct elements in the column
*/
cudf::size_type unordered_distinct_count(column_view const& input,
null_policy null_handling,
nan_policy nan_handling);
cudf::size_type distinct_count(column_view const& input,
null_policy null_handling,
nan_policy nan_handling);

/**
* @brief Count the distinct rows in a table.
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -332,10 +327,10 @@ cudf::size_type unordered_distinct_count(column_view const& input,
* @param[in] nulls_equal flag to denote if null elements should be considered equal.
* nulls are not equal if null_equality::UNEQUAL.
*
* @return number of unique rows in the table
* @return number of distinct rows in the table
*/
cudf::size_type unordered_distinct_count(table_view const& input,
null_equality nulls_equal = null_equality::EQUAL);
cudf::size_type distinct_count(table_view const& input,
null_equality nulls_equal = null_equality::EQUAL);

/** @} */
} // namespace cudf
13 changes: 6 additions & 7 deletions cpp/src/dictionary/add_keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,12 @@ std::unique_ptr<column> add_keys(
cudf::detail::concatenate(std::vector<column_view>{old_keys, new_keys}, stream);

// Drop duplicates from the combined keys, then sort the result.
// sort(unordered_drop_duplicates([a,b,c,d,f,d,b,e])) = [a,b,c,d,e,f]
auto table_keys =
cudf::detail::unordered_drop_duplicates(table_view{{combined_keys->view()}},
std::vector<size_type>{0}, // only one key column
null_equality::EQUAL,
stream,
mr);
// sort(distinct([a,b,c,d,f,d,b,e])) = [a,b,c,d,e,f]
auto table_keys = cudf::detail::distinct(table_view{{combined_keys->view()}},
std::vector<size_type>{0}, // only one key column
null_equality::EQUAL,
stream,
mr);
std::vector<order> column_order{order::ASCENDING};
std::vector<null_order> null_precedence{null_order::AFTER}; // should be no nulls here
auto sorted_keys =
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/dictionary/detail/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,

// remove duplicates and then sort the keys;
// this becomes the keys child for the output dictionary column
auto table_keys = cudf::detail::unordered_drop_duplicates(
auto table_keys = cudf::detail::distinct(
table_view{{all_keys->view()}}, std::vector<size_type>{0}, null_equality::EQUAL, stream, mr);
auto sorted_keys = cudf::detail::sort(table_keys->view(),
std::vector<order>{order::ASCENDING},
Expand Down
Loading