Merge branch 'branch-24.04' into json-whitespace

rapidsai · Mar 1, 2024 · d4b1b26 · d4b1b26
2 parents 98c103d + 3b228e2
commit d4b1b26
Show file tree

Hide file tree

Showing 6 changed files with 503 additions and 181 deletions.
diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -100,5 +100,15 @@ std::unique_ptr<column> row_bit_count(table_view const& t,
                                       rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource* mr);
 
+/**
+ * @copydoc cudf::segmented_row_bit_count
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<column> segmented_row_bit_count(table_view const& t,
+                                                size_type segment_length,
+                                                rmm::cuda_stream_view stream,
+                                                rmm::mr::device_memory_resource* mr);
+
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -224,5 +224,28 @@ std::unique_ptr<column> row_bit_count(
   table_view const& t,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Returns an approximate cumulative size in bits of all columns in the `table_view` for
+ * each segment of rows.
+ *
+ * This is similar to counting bit size per row for the input table in `cudf::row_bit_count`,
+ * except that row sizes are accumulated by segments.
+ *
+ * Currently, only fixed-length segments are supported. In case the input table has number of rows
+ * not divisible by `segment_length`, its last segment is considered as shorter than the others.
+ *
+ * @throw std::invalid_argument if the input `segment_length` is non-positive or larger than the
+ * number of rows in the input table.
+ *
+ * @param t The table view to perform the computation on
+ * @param segment_length The number of rows in each segment for which the total size is computed
+ * @param mr Device memory resource used to allocate the returned columns' device memory
+ * @return A 32-bit integer column containing the bit counts for each segment of rows
+ */
+std::unique_ptr<column> segmented_row_bit_count(
+  table_view const& t,
+  size_type segment_length,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu
@@ -20,6 +20,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/offsets_iterator.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/structs/structs_column_view.hpp>
@@ -31,8 +32,10 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
-#include <thrust/fill.h>
+#include <cuda/functional>
+#include <thrust/iterator/counting_iterator.h>
 #include <thrust/optional.h>
+#include <thrust/tabulate.h>
 
 namespace cudf {
 namespace detail {
@@ -398,26 +401,32 @@ __device__ size_type row_size_functor::operator()<struct_view>(column_device_vie
  * @param cols An span of column_device_views representing a column hierarchy
  * @param info An span of column_info structs corresponding the elements in `cols`
  * @param output Output span of size (# rows) where per-row bit sizes are stored
+ * @param segment_length The number of rows in each segment for which the total size is computed
  * @param max_branch_depth Maximum depth of the span stack needed per-thread
  */
-CUDF_KERNEL void compute_row_sizes(device_span<column_device_view const> cols,
-                                   device_span<column_info const> info,
-                                   device_span<size_type> output,
-                                   size_type max_branch_depth)
+CUDF_KERNEL void compute_segment_sizes(device_span<column_device_view const> cols,
+                                       device_span<column_info const> info,
+                                       device_span<size_type> output,
+                                       size_type segment_length,
+                                       size_type max_branch_depth)
 {
   extern __shared__ row_span thread_branch_stacks[];
   int const tid = threadIdx.x + blockIdx.x * blockDim.x;
 
-  auto const num_rows = output.size();
-  if (tid >= num_rows) { return; }
+  auto const num_segments = static_cast<size_type>(output.size());
+  if (tid >= num_segments) { return; }
 
   // my_branch_stack points to the last span prior to branching. a branch occurs only
   // when we are inside of a list contained within a struct column.
   row_span* my_branch_stack = thread_branch_stacks + (threadIdx.x * max_branch_depth);
   size_type branch_depth{0};
 
-  // current row span - always starts at 1 row.
-  row_span cur_span{tid, tid + 1};
+  // current row span - always starts at spanning over `segment_length` rows.
+  auto const num_rows             = cols[0].size();
+  auto const get_default_row_span = [=] {
+    return row_span{tid * segment_length, cuda::std::min((tid + 1) * segment_length, num_rows)};
+  };
+  auto cur_span = get_default_row_span();
 
   // output size
   size_type& size = output[tid];
@@ -444,7 +453,7 @@ CUDF_KERNEL void compute_row_sizes(device_span<column_device_view const> cols,
     if (info[idx].depth == 0) {
       branch_depth      = 0;
       last_branch_depth = 0;
-      cur_span          = row_span{tid, tid + 1};
+      cur_span          = get_default_row_span();
     }
 
     // add the contributing size of this row
@@ -465,17 +474,18 @@ CUDF_KERNEL void compute_row_sizes(device_span<column_device_view const> cols,
 
 }  // anonymous namespace
 
-/**
- * @copydoc cudf::detail::row_bit_count
- *
- */
-std::unique_ptr<column> row_bit_count(table_view const& t,
-                                      rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> segmented_row_bit_count(table_view const& t,
+                                                size_type segment_length,
+                                                rmm::cuda_stream_view stream,
+                                                rmm::mr::device_memory_resource* mr)
 {
-  // no rows
+  // If there is no rows, segment_length will not be checked.
   if (t.num_rows() <= 0) { return cudf::make_empty_column(type_id::INT32); }
 
+  CUDF_EXPECTS(segment_length >= 1 && segment_length <= t.num_rows(),
+               "Invalid segment length.",
+               std::invalid_argument);
+
   // flatten the hierarchy and determine some information about it.
   std::vector<cudf::column_view> cols;
   std::vector<column_info> info;
@@ -484,17 +494,28 @@ std::unique_ptr<column> row_bit_count(table_view const& t,
   CUDF_EXPECTS(info.size() == cols.size(), "Size/info mismatch");
 
   // create output buffer and view
-  auto output = cudf::make_fixed_width_column(
-    data_type{type_id::INT32}, t.num_rows(), mask_state::UNALLOCATED, stream, mr);
+  auto const num_segments = cudf::util::div_rounding_up_safe(t.num_rows(), segment_length);
+  auto output             = cudf::make_fixed_width_column(
+    data_type{type_id::INT32}, num_segments, mask_state::UNALLOCATED, stream, mr);
   mutable_column_view mcv = output->mutable_view();
 
   // simple case.  if we have no complex types (lists, strings, etc), the per-row size is already
   // trivially computed
   if (h_info.complex_type_count <= 0) {
-    thrust::fill(rmm::exec_policy(stream),
-                 mcv.begin<size_type>(),
-                 mcv.end<size_type>(),
-                 h_info.simple_per_row_size);
+    thrust::tabulate(
+      rmm::exec_policy_nosync(stream),
+      mcv.begin<size_type>(),
+      mcv.end<size_type>(),
+      cuda::proclaim_return_type<size_type>(
+        [segment_length,
+         num_rows     = t.num_rows(),
+         per_row_size = h_info.simple_per_row_size] __device__(size_type const segment_idx) {
+          // Since the number of rows may not divisible by segment_length,
+          // the last segment may be shorter than the others.
+          auto const current_length =
+            cuda::std::min(segment_length, num_rows - segment_length * segment_idx);
+          return per_row_size * current_length;
+        }));
     return output;
   }
 
@@ -523,22 +544,34 @@ std::unique_ptr<column> row_bit_count(table_view const& t,
   // should we be aborting if we reach some extremely small block size, or just if we hit 0?
   CUDF_EXPECTS(block_size > 0, "Encountered a column hierarchy too complex for row_bit_count");
 
-  cudf::detail::grid_1d grid{t.num_rows(), block_size, 1};
-  compute_row_sizes<<<grid.num_blocks, block_size, shared_mem_size, stream.value()>>>(
+  cudf::detail::grid_1d grid{num_segments, block_size, 1};
+  compute_segment_sizes<<<grid.num_blocks, block_size, shared_mem_size, stream.value()>>>(
     {std::get<1>(d_cols), cols.size()},
     {d_info.data(), info.size()},
-    {mcv.data<size_type>(), static_cast<std::size_t>(t.num_rows())},
+    {mcv.data<size_type>(), static_cast<std::size_t>(mcv.size())},
+    segment_length,
     h_info.max_branch_depth);
 
   return output;
 }
 
+std::unique_ptr<column> row_bit_count(table_view const& t,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::mr::device_memory_resource* mr)
+{
+  return segmented_row_bit_count(t, 1, stream, mr);
+}
+
 }  // namespace detail
 
-/**
- * @copydoc cudf::row_bit_count
- *
- */
+std::unique_ptr<column> segmented_row_bit_count(table_view const& t,
+                                                size_type segment_length,
+                                                rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::segmented_row_bit_count(t, segment_length, cudf::get_default_stream(), mr);
+}
+
 std::unique_ptr<column> row_bit_count(table_view const& t, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();

diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
@@ -259,6 +259,7 @@ ConfigureTest(
   transform/mask_to_bools_test.cpp
   transform/bools_to_mask_test.cpp
   transform/row_bit_count_test.cu
+  transform/segmented_row_bit_count_test.cu
   transform/one_hot_encode_tests.cpp
 )