Skip to content

Commit

Permalink
Implement lists::distinct and cudf::detail::stable_distinct (#11149)
Browse files Browse the repository at this point in the history
This adds new APIs:
 * `lists::distinct`, a stream compaction component of `cudf::lists::` that extracts the distinct elements from each list in a lists column. The new API does a similar job to `lists::drop_list_duplicate` but can operate on arbitrary data types, whereas `lists::drop_list_duplicate` only works on basic data types and flat structs.
 * `cudf::detail::stable_distinct`, which is implemented in the main stream compaction module. This API is initially introduced as a `detail::` API only (it can be exposed publicly later if needed). It produces the same output rows as `cudf::distinct` but preserves the input row order. It is used as a building block to implement `lists::distinct`.

This PR is a prerequisite for implementing set-like operations (#11043).

Note: This new `lists::distinct` API will completely replace `lists::drop_list_duplicate` (which in turn will be deprecated). That replacement will be done in follow-up work.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Yunsong Wang (https://github.com/PointKernel)
  - Karthikeyan (https://github.com/karthikeyann)

URL: #11149
  • Loading branch information
ttnghia authored Jul 11, 2022
1 parent f7f8dbc commit 073cbd8
Show file tree
Hide file tree
Showing 12 changed files with 1,091 additions and 2 deletions.
5 changes: 4 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,6 @@ add_library(
src/join/mixed_join_size_kernel_nulls.cu
src/join/mixed_join_size_kernels_semi.cu
src/join/semi_join.cu
src/lists/apply_boolean_mask.cu
src/lists/contains.cu
src/lists/combine/concatenate_list_elements.cu
src/lists/combine/concatenate_rows.cu
Expand All @@ -387,6 +386,9 @@ add_library(
src/lists/lists_column_view.cu
src/lists/segmented_sort.cu
src/lists/sequences.cu
src/lists/stream_compaction/apply_boolean_mask.cu
src/lists/stream_compaction/distinct.cu
src/lists/utilities.cu
src/merge/merge.cu
src/partitioning/partitioning.cu
src/partitioning/round_robin.cu
Expand Down Expand Up @@ -452,6 +454,7 @@ add_library(
src/stream_compaction/distinct_reduce.cu
src/stream_compaction/drop_nans.cu
src/stream_compaction/drop_nulls.cu
src/stream_compaction/stable_distinct.cu
src/stream_compaction/unique.cu
src/stream_compaction/unique_count.cu
src/strings/attributes.cu
Expand Down
30 changes: 30 additions & 0 deletions cpp/include/cudf/detail/stream_compaction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cudf/utilities/default_stream.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

namespace cudf {
namespace detail {
Expand Down Expand Up @@ -89,6 +90,35 @@ std::unique_ptr<table> distinct(
rmm::cuda_stream_view stream = cudf::default_stream_value,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create a new table without duplicate rows, preserving the input row order.
*
* Given an `input` table_view, rows are compared using only the columns selected by `keys`. One
* row from each group of equal rows is copied to the output table, and the copied rows appear in
* the same relative order as in the input.
*
* If there are duplicate rows, which row is copied depends on the specified value of the `keep`
* parameter.
*
* This API produces exactly the same set of output rows as `cudf::distinct`.
*
* If `input` has no rows or no columns, or if `keys` is empty, an empty table with the same column
* structure as `input` is returned.
*
* @param input The input table
* @param keys Vector of indices indicating key columns in the `input` table
* @param keep Copy any, first, last, or none of the found duplicates
* @param nulls_equal Flag to specify whether null elements should be considered as equal
* @param nans_equal Flag to specify whether NaN elements should be considered as equal
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned table
* @return A table containing the resulting distinct rows
*/
std::unique_ptr<table> stable_distinct(
table_view const& input,
std::vector<size_type> const& keys,
duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY,
null_equality nulls_equal = null_equality::EQUAL,
nan_equality nans_equal = nan_equality::ALL_EQUAL,
rmm::cuda_stream_view stream = cudf::default_stream_value,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create a column of indices of all distinct rows in the input table.
*
Expand Down
12 changes: 12 additions & 0 deletions cpp/include/cudf/lists/detail/stream_compaction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,16 @@ std::unique_ptr<column> apply_boolean_mask(
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::lists::distinct
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> distinct(
lists_column_view const& input,
null_equality nulls_equal,
nan_equality nans_equal,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace cudf::lists::detail
25 changes: 25 additions & 0 deletions cpp/include/cudf/lists/stream_compaction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,29 @@ std::unique_ptr<column> apply_boolean_mask(
lists_column_view const& boolean_mask,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create a new lists column without duplicate elements in each list.
*
* Given a lists column `input`, distinct elements of each list are copied to the corresponding
* output list. The order of lists is preserved while the order of elements within each list is not
* guaranteed.
*
* Elements are only compared against other elements of the same list; equal elements that live in
* different lists are all kept. A null list in the input stays a null list in the output.
*
* Example (using the default `nulls_equal` and `nans_equal`):
* @code{.pseudo}
* input = { {0, 1, 2, 3, 2}, {3, 1, 2}, null, {4, null, null, 5} }
* result = { {0, 1, 2, 3}, {3, 1, 2}, null, {4, null, 5} }
* @endcode
*
* @param input The input lists column
* @param nulls_equal Flag to specify whether null elements should be considered as equal
* @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal
* @param mr Device memory resource used to allocate the returned object
* @return The resulting lists column containing lists without duplicates
*/
std::unique_ptr<column> distinct(
lists_column_view const& input,
null_equality nulls_equal = null_equality::EQUAL,
nan_equality nans_equal = nan_equality::ALL_EQUAL,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace cudf::lists
File renamed without changes.
84 changes: 84 additions & 0 deletions cpp/src/lists/stream_compaction/distinct.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <lists/utilities.hpp>

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/copy.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/stream_compaction.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <memory>
#include <utility>

namespace cudf::lists {
namespace detail {

/**
 * Stream-aware implementation of `cudf::lists::distinct`.
 *
 * Every child element is paired with the label of the list it belongs to, the
 * {label, element} rows are de-duplicated with `stable_distinct`, and the surviving
 * labels are turned back into list offsets.
 */
std::unique_ptr<column> distinct(lists_column_view const& input,
                                 null_equality nulls_equal,
                                 nan_equality nans_equal,
                                 rmm::cuda_stream_view stream,
                                 rmm::mr::device_memory_resource* mr)
{
  // Nothing to de-duplicate: hand back an empty column shaped like the input.
  if (input.is_empty()) { return empty_like(input.parent()); }

  // Step 1: label each (sliced) child element with its list.
  auto const elements    = input.get_sliced_child(stream);
  auto const list_labels = generate_labels(input, elements.size(), stream);

  // Step 2: de-duplicate rows of {label, element}. Both columns are keys, so equal
  // elements carrying different labels are never merged.
  auto const labeled_elements = table_view{{list_labels->view(), elements}};
  auto const key_columns      = std::vector<size_type>{0, 1};
  auto const deduplicated     = cudf::detail::stable_distinct(labeled_elements,
                                                          key_columns,
                                                          duplicate_keep_option::KEEP_ANY,
                                                          nulls_equal,
                                                          nans_equal,
                                                          stream,
                                                          mr);

  // Step 3: rebuild the lists column from the surviving rows. The offsets come from the
  // remaining labels (column 0); the new child is the remaining elements (last column).
  auto new_offsets =
    reconstruct_offsets(deduplicated->get_column(0).view(), input.size(), stream, mr);
  auto new_null_mask     = cudf::detail::copy_bitmask(input.parent(), stream, mr);
  auto surviving_columns = deduplicated->release();

  return make_lists_column(input.size(),
                           std::move(new_offsets),
                           std::move(surviving_columns.back()),
                           input.null_count(),
                           std::move(new_null_mask),
                           stream,
                           mr);
}

} // namespace detail

/**
 * Public entry point: forwards to `detail::distinct` on the default stream.
 */
std::unique_ptr<column> distinct(lists_column_view const& input,
                                 null_equality nulls_equal,
                                 nan_equality nans_equal,
                                 rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  // The public API takes no stream argument, so the library default stream is used.
  rmm::cuda_stream_view const stream = cudf::default_stream_value;
  return detail::distinct(input, nulls_equal, nans_equal, stream, mr);
}

} // namespace cudf::lists
55 changes: 55 additions & 0 deletions cpp/src/lists/utilities.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "utilities.hpp"

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/labeling/label_segments.cuh>

namespace cudf::lists::detail {

/**
 * Produce a `size_type` column of `n_elements` list labels, one per child element.
 *
 * The labels are computed by `cudf::detail::label_segments` from the offsets of `input`.
 */
std::unique_ptr<column> generate_labels(lists_column_view const& input,
                                        size_type n_elements,
                                        rmm::cuda_stream_view stream)
{
  // Labels are never null, so no null mask is allocated.
  auto const label_type = data_type(type_to_id<size_type>());
  auto result =
    make_numeric_column(label_type, n_elements, cudf::mask_state::UNALLOCATED, stream);

  auto const out_first = result->mutable_view().begin<size_type>();
  auto const out_last  = out_first + n_elements;
  cudf::detail::label_segments(
    input.offsets_begin(), input.offsets_end(), out_first, out_last, stream);

  return result;
}

/**
 * Build an offsets column with `n_lists + 1` entries from a column of list labels.
 *
 * The conversion itself is delegated to `cudf::detail::labels_to_offsets`.
 */
std::unique_ptr<column> reconstruct_offsets(column_view const& labels,
                                            size_type n_lists,
                                            rmm::cuda_stream_view stream,
                                            rmm::mr::device_memory_resource* mr)
{
  // One more offset than there are lists; offsets carry no null mask.
  auto const offsets_type = data_type{type_to_id<offset_type>()};
  auto result =
    make_numeric_column(offsets_type, n_lists + 1, mask_state::UNALLOCATED, stream, mr);

  auto const labels_first = labels.begin<size_type>();
  auto const labels_last  = labels_first + labels.size();
  auto const out_first    = result->mutable_view().begin<offset_type>();
  auto const out_last     = out_first + result->size();

  cudf::detail::labels_to_offsets(labels_first, labels_last, out_first, out_last, stream);
  return result;
}

} // namespace cudf::lists::detail
53 changes: 53 additions & 0 deletions cpp/src/lists/utilities.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/column/column_view.hpp>
#include <cudf/lists/lists_column_view.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

namespace cudf::lists::detail {

/**
* @brief Generate list labels for elements in the child column of the input lists column.
*
* The returned column has `n_elements` rows of type `size_type` and no null mask. This function
* takes no memory resource parameter; the implementation does not pass one when allocating the
* labels column.
*
* @param input The input lists column
* @param n_elements The number of elements in the child column of the input lists column
* @param stream CUDA stream used for device memory operations and kernel launches
* @return A column containing list labels corresponding to each element in the child column
*/
std::unique_ptr<column> generate_labels(lists_column_view const& input,
size_type n_elements,
rmm::cuda_stream_view stream);

/**
* @brief Reconstruct an offsets column from the input list labels column.
*
* The returned column has `n_lists + 1` rows of type `offset_type` and no null mask. The `labels`
* column is read as `size_type` values.
*
* @param labels The list labels corresponding to each list element
* @param n_lists The number of lists to build the offsets column
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned object
* @return The output offsets column
*/
std::unique_ptr<column> reconstruct_offsets(column_view const& labels,
size_type n_lists,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

} // namespace cudf::lists::detail
66 changes: 66 additions & 0 deletions cpp/src/stream_compaction/stable_distinct.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/detail/copy_if.cuh>
#include <cudf/detail/stream_compaction.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <thrust/iterator/constant_iterator.h>
#include <thrust/scatter.h>
#include <thrust/uninitialized_fill.h>

namespace cudf::detail {

/**
 * Order-preserving counterpart of `cudf::distinct`.
 *
 * The indices of the rows to keep are obtained from `get_distinct_indices` and scattered
 * into a per-row boolean marker array. `copy_if` is then driven by those markers, so the
 * selection depends only on each row's position in the input.
 */
std::unique_ptr<table> stable_distinct(table_view const& input,
                                       std::vector<size_type> const& keys,
                                       duplicate_keep_option keep,
                                       null_equality nulls_equal,
                                       nan_equality nans_equal,
                                       rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource* mr)
{
  auto const n_rows = input.num_rows();

  // Degenerate inputs: no rows, no columns, or no key columns to compare on.
  bool const nothing_to_compare = n_rows == 0 or input.num_columns() == 0 or keys.empty();
  if (nothing_to_compare) { return empty_like(input); }

  // Indices of the rows that survive de-duplication on the key columns.
  auto const kept_rows =
    get_distinct_indices(input.select(keys), keep, nulls_equal, nans_equal, stream);

  // One marker per input row: start with all `false`, then set `true` at each kept index.
  auto keep_markers = rmm::device_uvector<bool>(n_rows, stream);
  thrust::uninitialized_fill(
    rmm::exec_policy(stream), keep_markers.begin(), keep_markers.end(), false);

  auto const n_kept     = static_cast<size_type>(kept_rows.size());
  auto const true_first = thrust::constant_iterator<bool>(true, 0);
  auto const true_last  = thrust::constant_iterator<bool>(true, n_kept);
  thrust::scatter(
    rmm::exec_policy(stream), true_first, true_last, kept_rows.begin(), keep_markers.begin());

  // Row filter: a row is copied iff its marker is set.
  auto const is_kept = [markers = keep_markers.begin()] __device__(auto const row_idx) {
    return markers[row_idx];
  };
  return cudf::detail::copy_if(input, is_kept, stream, mr);
}

} // namespace cudf::detail
3 changes: 2 additions & 1 deletion cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -472,7 +472,6 @@ ConfigureTest(AST_TEST ast/transform_tests.cpp)
# * lists tests ----------------------------------------------------------------------------------
ConfigureTest(
LISTS_TEST
lists/apply_boolean_mask_test.cpp
lists/combine/concatenate_list_elements_tests.cpp
lists/combine/concatenate_rows_tests.cpp
lists/contains_tests.cpp
Expand All @@ -482,6 +481,8 @@ ConfigureTest(
lists/extract_tests.cpp
lists/sequences_tests.cpp
lists/sort_lists_tests.cpp
lists/stream_compaction/apply_boolean_mask_tests.cpp
lists/stream_compaction/distinct_tests.cpp
)

# ##################################################################################################
Expand Down
File renamed without changes.
Loading

0 comments on commit 073cbd8

Please sign in to comment.