diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh new file mode 100644 index 00000000000..707a28424e5 --- /dev/null +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include +#include +#include +#include + +namespace cudf::detail { + +/** + * @brief Fill label values for segments defined by a given offsets array. + * + * Given a pair of iterators accessing to an offset array, generate label values for segments + * defined by the offset values. The output will be an array containing consecutive groups of + * identical labels, the number of elements in each group `i` is defined by + * `offsets[i+1] - offsets[i]`. + * + * The labels always start from `0` regardless of the offset values. + * In case there are empty segments, their corresponding label values will be skipped in the output. + * + * Note that the caller is responsible to make sure the output range have the correct size, which is + * the total segment sizes (i.e., `size = *(offsets_end - 1) - *offsets_begin`). Otherwise, the + * result is undefined. + * + * @code{.pseudo} + * Examples: + * + * offsets = [ 0, 4, 6, 6, 6, 10 ] + * output = [ 0, 0, 0, 0, 1, 1, 4, 4, 4, 4 ] + * + * offsets = [ 5, 10, 12 ] + * output = [ 0, 0, 0, 0, 0, 1, 1 ] + * @endcode + * + * @param offsets_begin The beginning of the offsets that define segments. + * @param offsets_end The end of the offsets that define segments. + * @param label_begin The beginning of the output label range. + * @param label_end The end of the output label range. + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +template +void label_segments(InputIterator offsets_begin, + InputIterator offsets_end, + OutputIterator label_begin, + OutputIterator label_end, + rmm::cuda_stream_view stream) +{ + // If the output array is empty, that means we have all empty segments. + // In such cases, we must terminate immediately. Otherwise, the `for_each` loop below may try to + // access memory of the output array, resulting in "illegal memory access" error. + if (thrust::distance(label_begin, label_end) == 0) { return; } + + // When the output array is not empty, always fill it with `0` value first. + using OutputType = typename thrust::iterator_value::type; + thrust::uninitialized_fill(rmm::exec_policy(stream), label_begin, label_end, OutputType{0}); + + // If the offsets array has no more than 2 offset values, there will be at max 1 segment. + // In such cases, the output will just be an array of all `0` values (which we already filled). + // We should terminate here, otherwise the `inclusive_scan` call below still do its entire + // computation. That is unnecessary and may be expensive if we have the input offsets defining a + // very large segment. + if (thrust::distance(offsets_begin, offsets_end) <= 2) { return; } + + thrust::for_each(rmm::exec_policy(stream), + offsets_begin + 1, // exclude the first offset value + offsets_end - 1, // exclude the last offset value + [offsets = offsets_begin, output = label_begin] __device__(auto const idx) { + // Zero-normalized offsets. + auto const dst_idx = idx - (*offsets); + + // Scatter value `1` to the index at (idx - offsets[0]). + // In case we have repeated offsets (i.e., we have empty segments), this + // `atomicAdd` call will make sure the label values corresponding to these + // empty segments will be skipped in the output. + atomicAdd(&output[dst_idx], OutputType{1}); + }); + thrust::inclusive_scan(rmm::exec_policy(stream), label_begin, label_end, label_begin); +} + +} // namespace cudf::detail diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 10201782854..a0abaf71160 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -32,17 +33,11 @@ #include #include -#include #include #include -#include #include -#include #include -#include -#include #include -#include #include #include @@ -223,22 +218,13 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_labels( _group_labels = std::make_unique(num_keys(stream), stream); auto& group_labels = *_group_labels; - if (num_keys(stream) == 0) return group_labels; - thrust::uninitialized_fill(rmm::exec_policy(stream), - group_labels.begin(), - group_labels.end(), - index_vector::value_type{0}); - thrust::scatter(rmm::exec_policy(stream), - thrust::make_constant_iterator(1, decltype(num_groups(stream))(1)), - thrust::make_constant_iterator(1, num_groups(stream)), - group_offsets(stream).begin() + 1, - group_labels.begin()); - - thrust::inclusive_scan( - rmm::exec_policy(stream), group_labels.begin(), group_labels.end(), group_labels.begin()); - + cudf::detail::label_segments(group_offsets(stream).begin(), + group_offsets(stream).end(), + group_labels.begin(), + group_labels.end(), + stream); return group_labels; } diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 8a4704ad13b..929af866b6d 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -35,17 +36,15 @@ #include #include -#include #include #include +#include #include #include #include -#include #include #include #include -#include #include #include @@ -152,44 +151,6 @@ struct replace_negative_nans_dispatch { } }; -/** - * @brief Populate 1-based list indices for all list entries. - * - * Given a number of total list entries in a lists column and an array containing list offsets, - * generate an array that maps each list entry to a 1-based index of the list containing - * that entry. - * - * Instead of regular 0-based indices, we need to use 1-based indices for later post-processing. - * - * @code{.pseudo} - * num_lists = 3, num_entries = 10, offsets = { 0, 4, 6, 10 } - * output = { 1, 1, 1, 1, 2, 2, 3, 3, 3, 3 } - * @endcode - * - * @param num_lists The size of the input lists column. - * @param num_entries The number of entries in the lists column. - * @param offsets_begin The pointer refers to data of list offsets. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @return An array containing 1-based list indices corresponding to each list entry. - */ -rmm::device_uvector generate_entry_list_indices(size_type num_lists, - size_type num_entries, - offset_type const* offsets_begin, - rmm::cuda_stream_view stream) -{ - auto entry_list_indices = rmm::device_uvector(num_entries, stream); - - auto const input = thrust::make_transform_iterator( - offsets_begin, [offsets_begin] __device__(auto const idx) { return idx - *offsets_begin; }); - thrust::upper_bound(rmm::exec_policy(stream), - input, - input + num_lists, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - entry_list_indices.begin()); - return entry_list_indices; -} - /** * @brief Perform an equality comparison between two entries in a lists column, specialized from * `cudf::element_equality_comparator` to take into account both parameters `nulls_equal` and @@ -533,7 +494,7 @@ std::unique_ptr generate_output_offsets(size_type num_lists, // Generate offsets from sizes. // Given the example above, we will have new_offsets = [0, 3, 3, 3, 4, 7, 7, 9] - thrust::inclusive_scan( + thrust::exclusive_scan( rmm::exec_policy(stream), new_offsets.begin(), new_offsets.end(), new_offsets.begin()); // Done. Hope that your head didn't explode after reading till this point. @@ -570,9 +531,13 @@ std::pair, std::unique_ptr> drop_list_duplicates // The child column containing list entries. auto const keys_child = keys.get_sliced_child(stream); - // Generate a mapping from list entries to their 1-based list indices for the keys column. - auto const entries_list_indices = - generate_entry_list_indices(keys.size(), keys_child.size(), keys.offsets_begin(), stream); + // Generate a mapping from list entries to their list indices for the keys column. + auto const entries_list_indices = [&] { + auto labels = rmm::device_uvector(keys_child.size(), stream); + cudf::detail::label_segments( + keys.offsets_begin(), keys.offsets_end(), labels.begin(), labels.end(), stream); + return labels; + }(); // Generate segmented sorted order for key entries. // The keys column will be sorted (gathered) using this order. diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index 945d138c789..54d7ba0a95e 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -319,6 +319,19 @@ TYPED_TEST(DropListDuplicatesTypedTest, TrivialInputTests) CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected, verbosity); } + // All input lists are empty. + { + auto const lists = ListsCol{ListsCol{}, ListsCol{}, ListsCol{}}; + auto const expected = ListsCol{ListsCol{}, ListsCol{}, ListsCol{}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + + auto const [results_keys, results_vals] = cudf::lists::drop_list_duplicates( + cudf::lists_column_view{lists}, cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected, verbosity); + } + // Trivial cases. { auto const lists = ListsCol{0, 1, 2, 3, 4, 5};