-
Notifications
You must be signed in to change notification settings - Fork 912
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This adds new APIs: * `lists::distinct` as a stream compaction component of `cudf::lists::`, which extracts the distinct elements from each list in a lists column. The new API does a job similar to `lists::drop_list_duplicate` but can operate on arbitrary data types, while `lists::drop_list_duplicate` can only work on basic data types and flat structs. * `cudf::detail::stable_distinct`, which is implemented in the main stream compaction module. This API is introduced as just a `detail::` API first (which means we can expose it to the public later if needed), producing output equivalent to that of `cudf::distinct` but with row order preserved. It is used as a building block to implement `lists::distinct`. This PR is a prerequisite for implementing set-like operations (#11043). Note: This new `lists::distinct` API will completely replace `lists::drop_list_duplicate` (which in turn will be deprecated). That will be done in follow-up work. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Yunsong Wang (https://github.com/PointKernel) - Karthikeyan (https://github.com/karthikeyann) URL: #11149
- Loading branch information
Showing
12 changed files
with
1,091 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
/* | ||
* Copyright (c) 2022, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <lists/utilities.hpp> | ||
|
||
#include <cudf/column/column_factories.hpp> | ||
#include <cudf/detail/copy.hpp> | ||
#include <cudf/detail/null_mask.hpp> | ||
#include <cudf/detail/nvtx/ranges.hpp> | ||
#include <cudf/detail/stream_compaction.hpp> | ||
#include <cudf/lists/lists_column_view.hpp> | ||
#include <cudf/table/table.hpp> | ||
#include <cudf/table/table_view.hpp> | ||
|
||
#include <rmm/cuda_stream_view.hpp> | ||
|
||
#include <memory> | ||
#include <utility> | ||
|
||
namespace cudf::lists { | ||
namespace detail { | ||
|
||
/**
 * @brief Create a lists column in which every list keeps only its distinct elements.
 *
 * Steps:
 *  1. Attach a list label to every element of the (sliced) child column.
 *  2. Call `stable_distinct` on the two-column table {labels, child} using both columns as keys,
 *     so two elements are only treated as duplicates when they also share a label.
 *  3. Rebuild the offsets from the labels that remain and assemble the output lists column with
 *     the input's null count and a copy of the input's null mask.
 *
 * An empty input yields `empty_like(input.parent())`.
 *
 * @param input The lists column to process
 * @param nulls_equal Forwarded unchanged to `stable_distinct`
 * @param nans_equal Forwarded unchanged to `stable_distinct`
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned object
 * @return A lists column with `input.size()` rows
 */
std::unique_ptr<column> distinct(lists_column_view const& input,
                                 null_equality nulls_equal,
                                 nan_equality nans_equal,
                                 rmm::cuda_stream_view stream,
                                 rmm::mr::device_memory_resource* mr)
{
  if (input.is_empty()) { return empty_like(input.parent()); }

  auto const n_lists     = input.size();
  auto const child_view  = input.get_sliced_child(stream);
  auto const list_labels = generate_labels(input, child_view.size(), stream);

  // Column 0 holds the labels, column 1 holds the elements; both take part in the comparison.
  auto const labeled_elements = table_view{{list_labels->view(), child_view}};
  auto const key_indices      = std::vector<size_type>{0, 1};
  auto const kept_rows        = cudf::detail::stable_distinct(labeled_elements,
                                                       key_indices,
                                                       duplicate_keep_option::KEEP_ANY,
                                                       nulls_equal,
                                                       nans_equal,
                                                       stream,
                                                       mr);

  // The offsets must be derived from the surviving labels before the table gives up its columns.
  auto new_offsets = reconstruct_offsets(kept_rows->get_column(0).view(), n_lists, stream, mr);

  // The last released column is the de-duplicated child; the labels column is simply dropped.
  auto released_columns = kept_rows->release();
  auto new_child        = std::move(released_columns.back());

  return make_lists_column(n_lists,
                           std::move(new_offsets),
                           std::move(new_child),
                           input.null_count(),
                           cudf::detail::copy_bitmask(input.parent(), stream, mr),
                           stream,
                           mr);
}
|
||
} // namespace detail | ||
|
||
/**
 * @brief Public entry point: forwards to `detail::distinct` on `cudf::default_stream_value`.
 *
 * @param input The lists column to process
 * @param nulls_equal Forwarded unchanged to `detail::distinct`
 * @param nans_equal Forwarded unchanged to `detail::distinct`
 * @param mr Device memory resource used to allocate the returned object
 * @return The column produced by `detail::distinct`
 */
std::unique_ptr<column> distinct(lists_column_view const& input,
                                 null_equality nulls_equal,
                                 nan_equality nans_equal,
                                 rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  rmm::cuda_stream_view const stream = cudf::default_stream_value;
  return detail::distinct(input, nulls_equal, nans_equal, stream, mr);
}
|
||
} // namespace cudf::lists |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
/* | ||
* Copyright (c) 2022, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include "utilities.hpp" | ||
|
||
#include <cudf/column/column_factories.hpp> | ||
#include <cudf/detail/labeling/label_segments.cuh> | ||
|
||
namespace cudf::lists::detail { | ||
|
||
/**
 * @brief Produce a `size_type` column of `n_elements` labels derived from the offsets of `input`.
 *
 * The output column is created without a null mask and is filled by
 * `cudf::detail::label_segments` from the range [`input.offsets_begin()`, `input.offsets_end()`).
 *
 * @param input The input lists column whose offsets define the segments
 * @param n_elements The number of labels to generate
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @return The column of generated labels
 */
std::unique_ptr<column> generate_labels(lists_column_view const& input,
                                        size_type n_elements,
                                        rmm::cuda_stream_view stream)
{
  auto const label_type = data_type(type_to_id<size_type>());
  auto out_labels =
    make_numeric_column(label_type, n_elements, cudf::mask_state::UNALLOCATED, stream);

  auto const out_first = out_labels->mutable_view().template begin<size_type>();
  auto const out_last  = out_first + n_elements;
  cudf::detail::label_segments(
    input.offsets_begin(), input.offsets_end(), out_first, out_last, stream);

  return out_labels;
}
|
||
/**
 * @brief Build an offsets column of `n_lists + 1` entries from a column of list labels.
 *
 * The labels are read as `size_type` values and handed to `cudf::detail::labels_to_offsets`,
 * which writes into a freshly created `offset_type` column that has no null mask.
 *
 * @param labels The list labels corresponding to each list element
 * @param n_lists The number of lists the offsets column describes
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned object
 * @return The output offsets column
 */
std::unique_ptr<column> reconstruct_offsets(column_view const& labels,
                                            size_type n_lists,
                                            rmm::cuda_stream_view stream,
                                            rmm::mr::device_memory_resource* mr)
{
  auto const offsets_type = data_type{type_to_id<offset_type>()};
  auto offsets_col =
    make_numeric_column(offsets_type, n_lists + 1, mask_state::UNALLOCATED, stream, mr);

  auto const in_first  = labels.template begin<size_type>();
  auto const in_last   = in_first + labels.size();
  auto const out_first = offsets_col->mutable_view().template begin<offset_type>();
  auto const out_last  = out_first + offsets_col->size();

  cudf::detail::labels_to_offsets(in_first, in_last, out_first, out_last, stream);
  return offsets_col;
}
|
||
} // namespace cudf::lists::detail |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
/* | ||
* Copyright (c) 2022, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <cudf/column/column_view.hpp> | ||
#include <cudf/lists/lists_column_view.hpp> | ||
|
||
#include <rmm/cuda_stream_view.hpp> | ||
#include <rmm/mr/device/device_memory_resource.hpp> | ||
|
||
namespace cudf::lists::detail { | ||
|
||
/** | ||
* @brief Generate list labels for elements in the child column of the input lists column. | ||
* | ||
* The returned column has element type `size_type`, holds `n_elements` rows and is created | ||
* without a null mask. Its contents are written by `cudf::detail::label_segments` using the | ||
* offsets of `input`. This function takes no memory resource parameter. | ||
* NOTE(review): a label is presumably the index of the list that owns the element; confirm | ||
* against the documentation of `cudf::detail::label_segments`. | ||
* | ||
* @param input The input lists column | ||
* @param n_elements The number of elements in the child column of the input lists column | ||
* @param stream CUDA stream used for device memory operations and kernel launches | ||
* @return A column containing list labels corresponding to each element in the child column | ||
*/ | ||
std::unique_ptr<column> generate_labels(lists_column_view const& input, | ||
size_type n_elements, | ||
rmm::cuda_stream_view stream); | ||
|
||
/** | ||
* @brief Reconstruct an offsets column from the input list labels column. | ||
* | ||
* The labels are read as `size_type` values. The returned column has element type | ||
* `offset_type`, holds `n_lists + 1` rows and is created without a null mask; its contents | ||
* are written by `cudf::detail::labels_to_offsets`. | ||
* | ||
* @param labels The list labels corresponding to each list element | ||
* @param n_lists The number of lists to build the offsets column | ||
* @param stream CUDA stream used for device memory operations and kernel launches | ||
* @param mr Device memory resource used to allocate the returned object | ||
* @return The output offsets column | ||
*/ | ||
std::unique_ptr<column> reconstruct_offsets(column_view const& labels, | ||
size_type n_lists, | ||
rmm::cuda_stream_view stream, | ||
rmm::mr::device_memory_resource* mr); | ||
|
||
} // namespace cudf::lists::detail |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
/* | ||
* Copyright (c) 2022, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <cudf/detail/copy_if.cuh> | ||
#include <cudf/detail/stream_compaction.hpp> | ||
#include <cudf/table/table.hpp> | ||
#include <cudf/table/table_view.hpp> | ||
#include <cudf/types.hpp> | ||
|
||
#include <thrust/iterator/constant_iterator.h> | ||
#include <thrust/scatter.h> | ||
#include <thrust/uninitialized_fill.h> | ||
|
||
namespace cudf::detail { | ||
|
||
/**
 * @brief Copy the rows of `input` selected by `get_distinct_indices` into a new table.
 *
 * The indices returned by `get_distinct_indices` for the key columns are turned into one boolean
 * flag per input row, and `copy_if` then copies exactly the flagged rows.
 *
 * If `input` has no rows or no columns, or `keys` is empty, `empty_like(input)` is returned.
 *
 * @param input The table to filter
 * @param keys Indices of the columns of `input` that are compared
 * @param keep Forwarded unchanged to `get_distinct_indices`
 * @param nulls_equal Forwarded unchanged to `get_distinct_indices`
 * @param nans_equal Forwarded unchanged to `get_distinct_indices`
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned table
 * @return A table containing the selected rows of `input`
 */
std::unique_ptr<table> stable_distinct(table_view const& input,
                                       std::vector<size_type> const& keys,
                                       duplicate_keep_option keep,
                                       null_equality nulls_equal,
                                       nan_equality nans_equal,
                                       rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource* mr)
{
  bool const nothing_to_do =
    input.num_rows() == 0 or input.num_columns() == 0 or keys.empty();
  if (nothing_to_do) { return empty_like(input); }

  auto const kept_indices =
    get_distinct_indices(input.select(keys), keep, nulls_equal, nans_equal, stream);

  // One flag per input row: start with all `false`, then set `true` at every kept index.
  auto keep_flags = rmm::device_uvector<bool>(input.num_rows(), stream);
  thrust::uninitialized_fill(
    rmm::exec_policy(stream), keep_flags.begin(), keep_flags.end(), false);

  auto const n_kept     = static_cast<size_type>(kept_indices.size());
  auto const true_first = thrust::constant_iterator<bool>(true, 0);
  auto const true_last  = thrust::constant_iterator<bool>(true, n_kept);
  thrust::scatter(
    rmm::exec_policy(stream), true_first, true_last, kept_indices.begin(), keep_flags.begin());

  // The predicate only looks up the flag of the row index it is given.
  return cudf::detail::copy_if(
    input,
    [flags = keep_flags.begin()] __device__(auto const idx) { return *(flags + idx); },
    stream,
    mr);
}
|
||
} // namespace cudf::detail |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
Oops, something went wrong.