Generate group labels from offsets (#10945)

This PR adds a small (detail) API that generates group labels from a given offset array `offsets`. The output will be an array containing consecutive groups of identical labels, the number of elements in each group `i` is defined by `offsets[i+1] - offsets[i]`. Examples: ``` offsets = [ 0, 4, 6, 10 ] output = [ 0, 0, 0, 0, 1, 1, 2, 2, 2, 2 ] offsets = [ 5, 10, 12 ] output = [ 0, 0, 0, 0, 0, 1, 1 ] ``` Note that the label values always start from `0`. We can in fact add a parameter to allow specifying any starting value but we don't need it in now. Several places in cudf have been updated to adopt the new API immediately. These places have been tested extensively thus no unit tests for the new API is needed. In addition, I ran a benchmark for groupby aggregations and found no performance difference after adopting this. Closes #10905 and unblocks #10409. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - Devavret Makkar (https://github.com/devavret) URL: #10945
rapidsai · Jun 1, 2022 · 7f359e0 · 7f359e0
1 parent c76cb9e
commit 7f359e0
Show file tree

Hide file tree

Showing 4 changed files with 128 additions and 65 deletions.
diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/distance.h>
+#include <thrust/for_each.h>
+#include <thrust/scan.h>
+#include <thrust/uninitialized_fill.h>
+
+namespace cudf::detail {
+
+/**
+ * @brief Fill label values for segments defined by a given offsets array.
+ *
+ * Given a pair of iterators accessing to an offset array, generate label values for segments
+ * defined by the offset values. The output will be an array containing consecutive groups of
+ * identical labels, the number of elements in each group `i` is defined by
+ * `offsets[i+1] - offsets[i]`.
+ *
+ * The labels always start from `0` regardless of the offset values.
+ * In case there are empty segments, their corresponding label values will be skipped in the output.
+ *
+ * Note that the caller is responsible to make sure the output range have the correct size, which is
+ * the total segment sizes (i.e., `size = *(offsets_end - 1) - *offsets_begin`). Otherwise, the
+ * result is undefined.
+ *
+ * @code{.pseudo}
+ * Examples:
+ *
+ * offsets = [ 0, 4, 6, 6, 6, 10 ]
+ * output  = [ 0, 0, 0, 0, 1, 1, 4, 4, 4, 4 ]
+ *
+ * offsets = [ 5, 10, 12 ]
+ * output  = [ 0, 0, 0, 0, 0, 1, 1 ]
+ * @endcode
+ *
+ * @param offsets_begin The beginning of the offsets that define segments.
+ * @param offsets_end The end of the offsets that define segments.
+ * @param label_begin The beginning of the output label range.
+ * @param label_end The end of the output label range.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+template <typename InputIterator, typename OutputIterator>
+void label_segments(InputIterator offsets_begin,
+                    InputIterator offsets_end,
+                    OutputIterator label_begin,
+                    OutputIterator label_end,
+                    rmm::cuda_stream_view stream)
+{
+  // If the output array is empty, that means we have all empty segments.
+  // In such cases, we must terminate immediately. Otherwise, the `for_each` loop below may try to
+  // access memory of the output array, resulting in "illegal memory access" error.
+  if (thrust::distance(label_begin, label_end) == 0) { return; }
+
+  // When the output array is not empty, always fill it with `0` value first.
+  using OutputType = typename thrust::iterator_value<OutputIterator>::type;
+  thrust::uninitialized_fill(rmm::exec_policy(stream), label_begin, label_end, OutputType{0});
+
+  // If the offsets array has no more than 2 offset values, there will be at max 1 segment.
+  // In such cases, the output will just be an array of all `0` values (which we already filled).
+  // We should terminate here, otherwise the `inclusive_scan` call below still do its entire
+  // computation. That is unnecessary and may be expensive if we have the input offsets defining a
+  // very large segment.
+  if (thrust::distance(offsets_begin, offsets_end) <= 2) { return; }
+
+  thrust::for_each(rmm::exec_policy(stream),
+                   offsets_begin + 1,  // exclude the first offset value
+                   offsets_end - 1,    // exclude the last offset value
+                   [offsets = offsets_begin, output = label_begin] __device__(auto const idx) {
+                     // Zero-normalized offsets.
+                     auto const dst_idx = idx - (*offsets);
+
+                     // Scatter value `1` to the index at (idx - offsets[0]).
+                     // In case we have repeated offsets (i.e., we have empty segments), this
+                     // `atomicAdd` call will make sure the label values corresponding to these
+                     // empty segments will be skipped in the output.
+                     atomicAdd(&output[dst_idx], OutputType{1});
+                   });
+  thrust::inclusive_scan(rmm::exec_policy(stream), label_begin, label_end, label_begin);
+}
+
+}  // namespace cudf::detail
diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu
@@ -21,6 +21,7 @@
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/groupby/sort_helper.hpp>
 #include <cudf/detail/iterator.cuh>
+#include <cudf/detail/labeling/label_segments.cuh>
 #include <cudf/detail/scatter.hpp>
 #include <cudf/detail/sorting.hpp>
 #include <cudf/detail/structs/utilities.hpp>
@@ -32,17 +33,11 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
-#include <thrust/binary_search.h>
 #include <thrust/distance.h>
 #include <thrust/fill.h>
-#include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/discard_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
-#include <thrust/scan.h>
-#include <thrust/scatter.h>
 #include <thrust/sequence.h>
-#include <thrust/uninitialized_fill.h>
 #include <thrust/unique.h>
 
 #include <algorithm>
@@ -223,22 +218,13 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_labels(
   _group_labels = std::make_unique<index_vector>(num_keys(stream), stream);
 
   auto& group_labels = *_group_labels;
-
   if (num_keys(stream) == 0) return group_labels;
 
-  thrust::uninitialized_fill(rmm::exec_policy(stream),
-                             group_labels.begin(),
-                             group_labels.end(),
-                             index_vector::value_type{0});
-  thrust::scatter(rmm::exec_policy(stream),
-                  thrust::make_constant_iterator(1, decltype(num_groups(stream))(1)),
-                  thrust::make_constant_iterator(1, num_groups(stream)),
-                  group_offsets(stream).begin() + 1,
-                  group_labels.begin());
-
-  thrust::inclusive_scan(
-    rmm::exec_policy(stream), group_labels.begin(), group_labels.end(), group_labels.begin());
-
+  cudf::detail::label_segments(group_offsets(stream).begin(),
+                               group_offsets(stream).end(),
+                               group_labels.begin(),
+                               group_labels.end(),
+                               stream);
   return group_labels;
 }
 

diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu
@@ -20,6 +20,7 @@
 #include <cudf/detail/copy.hpp>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/iterator.cuh>
+#include <cudf/detail/labeling/label_segments.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/replace.hpp>
@@ -35,17 +36,15 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
-#include <thrust/binary_search.h>
 #include <thrust/count.h>
 #include <thrust/distance.h>
+#include <thrust/equal.h>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
 #include <thrust/reduce.h>
 #include <thrust/scan.h>
 #include <thrust/scatter.h>
-#include <thrust/transform.h>
 #include <thrust/uninitialized_fill.h>
 
 #include <optional>
@@ -152,44 +151,6 @@ struct replace_negative_nans_dispatch {
   }
 };
 
-/**
- * @brief Populate 1-based list indices for all list entries.
- *
- * Given a number of total list entries in a lists column and an array containing list offsets,
- * generate an array that maps each list entry to a 1-based index of the list containing
- * that entry.
- *
- * Instead of regular 0-based indices, we need to use 1-based indices for later post-processing.
- *
- * @code{.pseudo}
- * num_lists = 3, num_entries = 10, offsets = { 0, 4, 6, 10 }
- * output = { 1, 1, 1, 1, 2, 2, 3, 3, 3, 3 }
- * @endcode
- *
- * @param num_lists The size of the input lists column.
- * @param num_entries The number of entries in the lists column.
- * @param offsets_begin The pointer refers to data of list offsets.
- * @param stream CUDA stream used for device memory operations and kernel launches.
- * @return An array containing 1-based list indices corresponding to each list entry.
- */
-rmm::device_uvector<size_type> generate_entry_list_indices(size_type num_lists,
-                                                           size_type num_entries,
-                                                           offset_type const* offsets_begin,
-                                                           rmm::cuda_stream_view stream)
-{
-  auto entry_list_indices = rmm::device_uvector<size_type>(num_entries, stream);
-
-  auto const input = thrust::make_transform_iterator(
-    offsets_begin, [offsets_begin] __device__(auto const idx) { return idx - *offsets_begin; });
-  thrust::upper_bound(rmm::exec_policy(stream),
-                      input,
-                      input + num_lists,
-                      thrust::make_counting_iterator(0),
-                      thrust::make_counting_iterator(num_entries),
-                      entry_list_indices.begin());
-  return entry_list_indices;
-}
-
 /**
  * @brief Perform an equality comparison between two entries in a lists column, specialized from
  * `cudf::element_equality_comparator` to take into account both parameters `nulls_equal` and
@@ -533,7 +494,7 @@ std::unique_ptr<column> generate_output_offsets(size_type num_lists,
 
   // Generate offsets from sizes.
   // Given the example above, we will have new_offsets = [0, 3, 3, 3, 4, 7, 7, 9]
-  thrust::inclusive_scan(
+  thrust::exclusive_scan(
     rmm::exec_policy(stream), new_offsets.begin(), new_offsets.end(), new_offsets.begin());
 
   // Done. Hope that your head didn't explode after reading till this point.
@@ -570,9 +531,13 @@ std::pair<std::unique_ptr<column>, std::unique_ptr<column>> drop_list_duplicates
   // The child column containing list entries.
   auto const keys_child = keys.get_sliced_child(stream);
 
-  // Generate a mapping from list entries to their 1-based list indices for the keys column.
-  auto const entries_list_indices =
-    generate_entry_list_indices(keys.size(), keys_child.size(), keys.offsets_begin(), stream);
+  // Generate a mapping from list entries to their list indices for the keys column.
+  auto const entries_list_indices = [&] {
+    auto labels = rmm::device_uvector<size_type>(keys_child.size(), stream);
+    cudf::detail::label_segments(
+      keys.offsets_begin(), keys.offsets_end(), labels.begin(), labels.end(), stream);
+    return labels;
+  }();
 
   // Generate segmented sorted order for key entries.
   // The keys column will be sorted (gathered) using this order.

diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp
@@ -319,6 +319,19 @@ TYPED_TEST(DropListDuplicatesTypedTest, TrivialInputTests)
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected, verbosity);
   }
 
+  // All input lists are empty.
+  {
+    auto const lists    = ListsCol{ListsCol{}, ListsCol{}, ListsCol{}};
+    auto const expected = ListsCol{ListsCol{}, ListsCol{}, ListsCol{}};
+    auto const results  = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists});
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity);
+
+    auto const [results_keys, results_vals] = cudf::lists::drop_list_duplicates(
+      cudf::lists_column_view{lists}, cudf::lists_column_view{lists});
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected, verbosity);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected, verbosity);
+  }
+
   // Trivial cases.
   {
     auto const lists    = ListsCol{0, 1, 2, 3, 4, 5};