Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor stream compaction APIs #10370

Merged
merged 19 commits into from
Mar 12, 2022
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cpp/benchmarks/stream_compaction/drop_duplicates.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ void nvbench_drop_duplicates(nvbench::state& state,
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
rmm::cuda_stream_view stream_view{launch.get_stream()};
auto result = cudf::detail::drop_duplicates(
input_table, {0}, Keep, cudf::null_equality::EQUAL, cudf::null_order::BEFORE, stream_view);
input_table, {0}, Keep, cudf::null_equality::EQUAL, stream_view);
});
}

Expand Down
1 change: 0 additions & 1 deletion cpp/include/cudf/detail/stream_compaction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ std::unique_ptr<table> drop_duplicates(
std::vector<size_type> const& keys,
duplicate_keep_option keep,
null_equality nulls_equal = null_equality::EQUAL,
null_order null_precedence = null_order::BEFORE,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

Expand Down
13 changes: 4 additions & 9 deletions cpp/include/cudf/stream_compaction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -214,13 +214,10 @@ enum class duplicate_keep_option {
};

/**
* @brief Create a new table without duplicate rows.
* @brief Create a new table with consecutive duplicate rows removed.
*
* The output table is sorted according to the lexicographic ordering of the data in the columns
* indexed by `keys`.
*
* Given an `input` table_view, each row is copied to output table if the corresponding
* row of `keys` columns is unique, where the definition of unique depends on the value of @p keep:
* Given an `input` table_view, one specific row from each group of consecutive equivalent rows is
* copied to the output table depending on the value of @p keep:
* - KEEP_FIRST: only the first of a sequence of duplicate rows is copied
* - KEEP_LAST: only the last of a sequence of duplicate rows is copied
* - KEEP_NONE: no duplicate rows are copied
Expand All @@ -232,18 +229,16 @@ enum class duplicate_keep_option {
* @param[in] keep keep first row, last row, or no rows of the found duplicates
* @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not
* equal if null_equality::UNEQUAL
* @param[in] null_precedence flag to denote nulls should appear before or after non-null items
* @param[in] mr Device memory resource used to allocate the returned table's device
* memory
*
* @return Table with sorted unique rows as specified by `keep`.
* @return Table with unique rows from each sequence of equivalent rows as specified by `keep`.
*/
std::unique_ptr<table> drop_duplicates(
table_view const& input,
std::vector<size_type> const& keys,
duplicate_keep_option keep,
null_equality nulls_equal = null_equality::EQUAL,
null_order null_precedence = null_order::BEFORE,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down
106 changes: 28 additions & 78 deletions cpp/src/stream_compaction/drop_duplicates.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,89 +45,41 @@

namespace cudf {
namespace detail {
namespace {
/**
* @brief Create a column_view of index values which represent the row values
* without duplicates as per @p `keep`
*
* Given a `keys` table_view, each row index is copied to output `unique_indices`, if the
* corresponding row of `keys` table_view is unique, where the definition of unique depends on the
* value of @p keep:
* - KEEP_FIRST: only the first of a sequence of duplicate rows is copied
* - KEEP_LAST: only the last of a sequence of duplicate rows is copied
* - KEEP_NONE: only unique rows are kept
*
* @param[in] keys table_view to identify duplicate rows
* @param[out] unique_indices Column to store the index with unique rows
* @param[in] keep keep first entry, last entry, or no entries if duplicates found
* @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL,
* nulls are not equal if null_equality::UNEQUAL
* @param[in] null_precedence flag to denote nulls should appear before or after non-null items
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*
* @return column_view column_view of unique row index as per specified `keep`, this is actually
* slice of `unique_indices`.
*/
column_view get_unique_ordered_indices(cudf::table_view const& keys,
cudf::mutable_column_view& unique_indices,
duplicate_keep_option keep,
null_equality nulls_equal,
null_order null_precedence,
rmm::cuda_stream_view stream)
{
// Sort only the indices.
// Note that stable sort must be used to maintain the order of duplicate elements.
auto sorted_indices = stable_sorted_order(
keys,
std::vector<order>{},
std::vector<null_order>{static_cast<uint64_t>(keys.num_columns()), null_precedence},
stream,
rmm::mr::get_current_device_resource());

// extract unique indices
auto device_input_table = cudf::table_device_view::create(keys, stream);

auto comp = row_equality_comparator(
nullate::DYNAMIC{cudf::has_nulls(keys)}, *device_input_table, *device_input_table, nulls_equal);
auto result_end = cudf::detail::unique_copy(sorted_indices->view().begin<cudf::size_type>(),
sorted_indices->view().end<cudf::size_type>(),
unique_indices.begin<cudf::size_type>(),
comp,
keep,
stream);

return cudf::detail::slice(column_view(unique_indices),
0,
thrust::distance(unique_indices.begin<cudf::size_type>(), result_end));
}
} // namespace

std::unique_ptr<table> drop_duplicates(table_view const& input,
std::vector<size_type> const& keys,
duplicate_keep_option keep,
null_equality nulls_equal,
null_order null_precedence,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) {
return empty_like(input);
}

auto keys_view = input.select(keys);

// The values will be filled into this column
auto unique_indices = cudf::make_numeric_column(
data_type{type_id::INT32}, keys_view.num_rows(), mask_state::UNALLOCATED, stream);
auto mutable_unique_indices_view = unique_indices->mutable_view();
// This is just a slice of `unique_indices`, shortened to the number of unique row indices
// written by `get_unique_ordered_indices`
auto unique_indices_view = detail::get_unique_ordered_indices(
keys_view, mutable_unique_indices_view, keep, nulls_equal, null_precedence, stream);

// run gather operation to establish new order
auto const num_rows = input.num_rows();
if (num_rows == 0 or input.num_columns() == 0 or keys.empty()) { return empty_like(input); }

auto unique_indices =
make_numeric_column(data_type{type_id::INT32}, num_rows, mask_state::UNALLOCATED, stream, mr);
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
auto mutable_view = mutable_column_device_view::create(*unique_indices, stream);
auto keys_view = input.select(keys);
auto keys_device_view = cudf::table_device_view::create(keys_view, stream);
auto row_equal = row_equality_comparator(nullate::DYNAMIC{cudf::has_nulls(keys_view)},
*keys_device_view,
*keys_device_view,
nulls_equal);

// get indices of unique rows
auto result_end = unique_copy(thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(num_rows),
mutable_view->begin<size_type>(),
row_equal,
keep,
stream);
auto indices_view =
cudf::detail::slice(column_view(*unique_indices),
0,
thrust::distance(mutable_view->begin<size_type>(), result_end));

// gather unique rows and return
return detail::gather(input,
unique_indices_view,
indices_view,
out_of_bounds_policy::DONT_CHECK,
detail::negative_index_policy::NOT_ALLOWED,
stream,
Expand Down Expand Up @@ -196,12 +148,10 @@ std::unique_ptr<table> drop_duplicates(table_view const& input,
std::vector<size_type> const& keys,
duplicate_keep_option const keep,
null_equality nulls_equal,
null_order null_precedence,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::drop_duplicates(
input, keys, keep, nulls_equal, null_precedence, rmm::cuda_stream_default, mr);
return detail::drop_duplicates(input, keys, keep, nulls_equal, rmm::cuda_stream_default, mr);
}

std::unique_ptr<table> unordered_drop_duplicates(table_view const& input,
Expand Down
76 changes: 42 additions & 34 deletions cpp/tests/stream_compaction/drop_duplicates_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,19 @@ TEST_F(DropDuplicatesCommon, StringKeyColumn)
cudf::table_view input{{col, key_col}};
std::vector<cudf::size_type> keys{1};

cudf::test::fixed_width_column_wrapper<int32_t> exp_col{{5, 4, 5, 5, 8, 1}, {1, 0, 1, 1, 1, 1}};
cudf::test::strings_column_wrapper exp_key_col{{"all", "new", "all", "new", "the", "strings"},
{1, 1, 1, 0, 1, 1}};
cudf::table_view expected{{exp_col, exp_key_col}};

auto got = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST);
CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got->view());

cudf::test::fixed_width_column_wrapper<int32_t> exp_sort_col{{5, 5, 4, 1, 8}, {1, 1, 0, 1, 1}};
cudf::test::strings_column_wrapper exp_sort_key_col{{"new", "all", "new", "strings", "the"},
{0, 1, 1, 1, 1}};
cudf::table_view expected_sort{{exp_sort_col, exp_sort_key_col}};

auto got_sort = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST);
CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, got_sort->view());

auto got_unordered = unordered_drop_duplicates(input, keys);
auto key_view = got_unordered->select(keys.begin(), keys.end());
auto sorted_result = cudf::sort_by_key(got_unordered->view(), key_view);
Expand Down Expand Up @@ -117,10 +122,10 @@ TEST_F(DropDuplicates, NonNullTable)

// Keep the first duplicate row
// The expected table keeps the input row order; only consecutive duplicate key rows are collapsed
cudf::test::fixed_width_column_wrapper<int32_t> exp_col1_first{{5, 5, 5, 3, 8}};
cudf::test::fixed_width_column_wrapper<float> exp_col2_first{{4, 4, 4, 3, 9}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_col1_key_first{{9, 19, 20, 20, 21}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_col2_key_first{{21, 20, 19, 20, 9}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_col1_first{{5, 3, 5, 8, 5}};
cudf::test::fixed_width_column_wrapper<float> exp_col2_first{{4, 3, 4, 9, 4}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_col1_key_first{{20, 20, 19, 21, 9}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_col2_key_first{{19, 20, 20, 9, 21}};
cudf::table_view expected_first{
{exp_col1_first, exp_col2_first, exp_col1_key_first, exp_col2_key_first}};

Expand All @@ -129,10 +134,10 @@ TEST_F(DropDuplicates, NonNullTable)
CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view());

// Keep the last duplicate row
cudf::test::fixed_width_column_wrapper<int32_t> exp_col1_last{{5, 5, 4, 3, 8}};
cudf::test::fixed_width_column_wrapper<float> exp_col2_last{{4, 4, 5, 3, 9}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_col1_key_last{{9, 19, 20, 20, 21}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_col2_key_last{{21, 20, 19, 20, 9}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_col1_last{{4, 3, 5, 8, 5}};
cudf::test::fixed_width_column_wrapper<float> exp_col2_last{{5, 3, 4, 9, 4}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_col1_key_last{{20, 20, 19, 21, 9}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_col2_key_last{{19, 20, 20, 9, 21}};
cudf::table_view expected_last{
{exp_col1_last, exp_col2_last, exp_col1_key_last, exp_col2_key_last}};

Expand All @@ -141,10 +146,10 @@ TEST_F(DropDuplicates, NonNullTable)
CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view());

// Keep no duplicate rows
cudf::test::fixed_width_column_wrapper<int32_t> exp_col1_unique{{5, 5, 3, 8}};
cudf::test::fixed_width_column_wrapper<float> exp_col2_unique{{4, 4, 3, 9}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_col1_key_unique{{9, 19, 20, 21}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_col2_key_unique{{21, 20, 20, 9}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_col1_unique{{3, 5, 8, 5}};
cudf::test::fixed_width_column_wrapper<float> exp_col2_unique{{3, 4, 9, 4}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_col1_key_unique{{20, 19, 21, 9}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_col2_key_unique{{20, 20, 9, 21}};
cudf::table_view expected_unique{
{exp_col1_unique, exp_col2_unique, exp_col1_key_unique, exp_col2_key_unique}};

Expand All @@ -162,20 +167,21 @@ TEST_F(DropDuplicates, KeepFirstWithNull)
std::vector<cudf::size_type> keys{1};

// nulls are equal
cudf::test::fixed_width_column_wrapper<int32_t> exp_col_first_equal{{3, 5, 5, 8}, {1, 1, 1, 1}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_first_equal{{20, 19, 20, 21},
{0, 1, 1, 1}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_col_first_equal{{5, 3, 5, 8, 1},
{1, 1, 1, 1, 1}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_first_equal{{20, 20, 19, 21, 19},
{1, 0, 1, 1, 1}};
cudf::table_view expected_first_equal{{exp_col_first_equal, exp_key_col_first_equal}};
auto got_first_equal =
drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL);

CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first_equal, got_first_equal->view());

// nulls are unequal
cudf::test::fixed_width_column_wrapper<int32_t> exp_col_first_unequal{{3, 2, 5, 5, 8},
{1, 1, 1, 1, 1}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_first_unequal{{20, 20, 19, 20, 21},
{0, 0, 1, 1, 1}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_col_first_unequal{{5, 3, 2, 5, 8, 1},
{1, 1, 1, 1, 1, 1}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_first_unequal{
{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}};
cudf::table_view expected_first_unequal{{exp_col_first_unequal, exp_key_col_first_unequal}};
auto got_first_unequal =
drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::UNEQUAL);
Expand All @@ -192,20 +198,21 @@ TEST_F(DropDuplicates, KeepLastWithNull)
std::vector<cudf::size_type> keys{1};

// nulls are equal
cudf::test::fixed_width_column_wrapper<int32_t> exp_col_last_equal{{2, 1, 4, 8}, {1, 1, 0, 1}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_last_equal{{20, 19, 20, 21},
{0, 1, 1, 1}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_col_last_equal{{4, 2, 5, 8, 1},
{0, 1, 1, 1, 1}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_last_equal{{20, 20, 19, 21, 19},
{1, 0, 1, 1, 1}};
cudf::table_view expected_last_equal{{exp_col_last_equal, exp_key_col_last_equal}};
auto got_last_equal =
drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::EQUAL);

CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last_equal, got_last_equal->view());

// nulls are unequal
cudf::test::fixed_width_column_wrapper<int32_t> exp_col_last_unequal{{3, 2, 1, 4, 8},
{1, 1, 1, 0, 1}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_last_unequal{{20, 20, 19, 20, 21},
{0, 0, 1, 1, 1}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_col_last_unequal{{4, 3, 2, 5, 8, 1},
{0, 1, 1, 1, 1, 1}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_last_unequal{{20, 20, 20, 19, 21, 19},
{1, 0, 0, 1, 1, 1}};
cudf::table_view expected_last_unequal{{exp_col_last_unequal, exp_key_col_last_unequal}};
auto got_last_unequal =
drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::UNEQUAL);
Expand All @@ -222,18 +229,19 @@ TEST_F(DropDuplicates, KeepNoneWithNull)
std::vector<cudf::size_type> keys{1};

// nulls are equal
cudf::test::fixed_width_column_wrapper<int32_t> exp_col_unique_equal{{8}, {1}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_unique_equal{{21}, {1}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_col_unique_equal{{5, 8, 1}, {1, 1, 1}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_unique_equal{{19, 21, 19}, {1, 1, 1}};
cudf::table_view expected_unique_equal{{exp_col_unique_equal, exp_key_col_unique_equal}};
auto got_unique_equal =
drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE, null_equality::EQUAL);

CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique_equal, got_unique_equal->view());

// nulls are unequal
cudf::test::fixed_width_column_wrapper<int32_t> exp_col_unique_unequal{{3, 2, 8}, {1, 1, 1}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_unique_unequal{{20, 20, 21},
{0, 0, 1}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_col_unique_unequal{{3, 2, 5, 8, 1},
{1, 1, 1, 1, 1}};
cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_unique_unequal{{20, 20, 19, 21, 19},
{0, 0, 1, 1, 1}};
cudf::table_view expected_unique_unequal{{exp_col_unique_unequal, exp_key_col_unique_unequal}};
auto got_unique_unequal =
drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE, null_equality::UNEQUAL);
Expand Down
Loading