Use the offsetalator in cudf::concatenate for strings (#14611)

Updates the `cudf::concatenate` logic specialization for strings to use the offsetalator to manage the input and output offsets. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: #14611
rapidsai · Jan 8, 2024 · c0aa8bb · c0aa8bb
1 parent fc142eb
commit c0aa8bb
Show file tree

Hide file tree

Showing 3 changed files with 59 additions and 32 deletions.
diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -53,6 +53,20 @@ rmm::device_uvector<string_view> create_string_vector_from_column(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr);
 
+/**
+ * @brief Return a normalized offset value from a strings offsets column
+ *
+ * @throw std::invalid_argument if `offsets` is neither INT32 nor INT64
+ *
+ * @param offsets Input column of type INT32 or INT64
+ * @param index Row value to retrieve
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return Value at `offsets[index]`
+ */
+int64_t get_offset_value(cudf::column_view const& offsets,
+                         size_type index,
+                         rmm::cuda_stream_view stream);
+
 }  // namespace detail
 }  // namespace strings
 }  // namespace cudf
diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,8 +16,8 @@
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/get_value.cuh>
 #include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/offsets_iterator_factory.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/detail/concatenate.hpp>
@@ -60,8 +60,8 @@ struct chars_size_transform {
   __device__ size_t operator()(column_device_view const& col) const
   {
     if (col.size() > 0) {
-      constexpr auto offsets_index = strings_column_view::offsets_column_index;
-      auto d_offsets               = col.child(offsets_index).data<int32_t>();
+      auto const offsets   = col.child(strings_column_view::offsets_column_index);
+      auto const d_offsets = cudf::detail::input_offsetalator(offsets.head(), offsets.type());
       return d_offsets[col.size() + col.offset()] - d_offsets[col.offset()];
     } else {
       return 0;
@@ -112,14 +112,15 @@ auto create_strings_device_views(host_span<column_view const> views, rmm::cuda_s
 }
 
 template <size_type block_size, bool Nullable>
-__global__ void fused_concatenate_string_offset_kernel(column_device_view const* input_views,
-                                                       size_t const* input_offsets,
-                                                       size_t const* partition_offsets,
-                                                       size_type const num_input_views,
-                                                       size_type const output_size,
-                                                       int32_t* output_data,
-                                                       bitmask_type* output_mask,
-                                                       size_type* out_valid_count)
+__global__ void fused_concatenate_string_offset_kernel(
+  column_device_view const* input_views,
+  size_t const* input_offsets,
+  size_t const* partition_offsets,
+  size_type const num_input_views,
+  size_type const output_size,
+  cudf::detail::output_offsetalator output_data,
+  bitmask_type* output_mask,
+  size_type* out_valid_count)
 {
   cudf::thread_index_type output_index = threadIdx.x + blockIdx.x * blockDim.x;
   size_type warp_valid_count           = 0;
@@ -132,10 +133,11 @@ __global__ void fused_concatenate_string_offset_kernel(column_device_view const*
       thrust::seq, input_offsets, input_offsets + num_input_views, output_index));
     size_type const partition_index = offset_it - input_offsets;
 
-    auto const offset_index      = output_index - *offset_it;
-    auto const& input_view       = input_views[partition_index];
-    constexpr auto offsets_child = strings_column_view::offsets_column_index;
-    auto const* input_data       = input_view.child(offsets_child).data<int32_t>();
+    auto const offset_index  = output_index - *offset_it;
+    auto const& input_view   = input_views[partition_index];
+    auto const offsets_child = input_view.child(strings_column_view::offsets_column_index);
+    auto const input_data =
+      cudf::detail::input_offsetalator(offsets_child.head(), offsets_child.type());
     output_data[output_index] =
       input_data[offset_index + input_view.offset()]  // handle parent offset
       - input_data[input_view.offset()]               // subtract first offset if non-zero
@@ -186,8 +188,9 @@ __global__ void fused_concatenate_string_chars_kernel(column_device_view const*
     auto const offset_index = output_index - *offset_it;
     auto const& input_view  = input_views[partition_index];
 
-    constexpr auto offsets_child   = strings_column_view::offsets_column_index;
-    auto const* input_offsets_data = input_view.child(offsets_child).data<int32_t>();
+    auto const offsets_child = input_view.child(strings_column_view::offsets_column_index);
+    auto const input_offsets_data =
+      cudf::detail::input_offsetalator(offsets_child.head(), offsets_child.type());
 
     constexpr auto chars_child   = strings_column_view::chars_column_index;
     auto const* input_chars_data = input_view.child(chars_child).data<char>();
@@ -225,16 +228,16 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
   bool const has_nulls =
     std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); });
 
-  // create chars column
+  // create output chars column
   auto chars_column = create_chars_child_column(total_bytes, stream, mr);
   auto d_new_chars  = chars_column->mutable_view().data<char>();
   chars_column->set_null_count(0);
 
-  // create offsets column
+  // create output offsets column
   auto offsets_column = make_numeric_column(
     data_type{type_id::INT32}, offsets_count, mask_state::UNALLOCATED, stream, mr);
-  auto d_new_offsets = offsets_column->mutable_view().data<int32_t>();
-  offsets_column->set_null_count(0);
+  auto itr_new_offsets =
+    cudf::detail::offsetalator_factory::make_output_iterator(offsets_column->mutable_view());
 
   rmm::device_buffer null_mask{0, stream, mr};
   size_type null_count{};
@@ -256,7 +259,7 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
       d_partition_offsets.data(),
       static_cast<size_type>(columns.size()),
       strings_count,
-      d_new_offsets,
+      itr_new_offsets,
       reinterpret_cast<bitmask_type*>(null_mask.data()),
       d_valid_count.data());
 
@@ -286,14 +289,11 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
         column_view offsets_child = column->child(strings_column_view::offsets_column_index);
         column_view chars_child   = column->child(strings_column_view::chars_column_index);
 
-        auto bytes_offset =
-          cudf::detail::get_value<size_type>(offsets_child, column_offset, stream);
-
+        auto const bytes_offset = get_offset_value(offsets_child, column_offset, stream);
+        auto const bytes_end = get_offset_value(offsets_child, column_size + column_offset, stream);
         // copy the chars column data
-        auto d_chars = chars_child.data<char>() + bytes_offset;
-        auto const bytes =
-          cudf::detail::get_value<size_type>(offsets_child, column_size + column_offset, stream) -
-          bytes_offset;
+        auto d_chars     = chars_child.data<char>() + bytes_offset;
+        auto const bytes = bytes_end - bytes_offset;
 
         CUDF_CUDA_TRY(
           cudaMemcpyAsync(d_new_chars, d_chars, bytes, cudaMemcpyDefault, stream.value()));

diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/get_value.cuh>
 #include <cudf/strings/detail/char_tables.hpp>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/utilities/error.hpp>
@@ -128,6 +129,18 @@ special_case_mapping const* get_special_case_mapping_table()
   });
 }
 
+int64_t get_offset_value(cudf::column_view const& offsets,
+                         size_type index,
+                         rmm::cuda_stream_view stream)
+{
+  auto const otid = offsets.type().id();
+  CUDF_EXPECTS(otid == type_id::INT64 || otid == type_id::INT32,
+               "Offsets must be of type INT32 or INT64",
+               std::invalid_argument);
+  return otid == type_id::INT64 ? cudf::detail::get_value<int64_t>(offsets, index, stream)
+                                : cudf::detail::get_value<int32_t>(offsets, index, stream);
+}
+
 }  // namespace detail
 }  // namespace strings
 }  // namespace cudf