Optimize string gather performance for large strings (#7980)

This PR intends to improve the string gather performance for large strings. There are two kernels implemented - String-parallel kernel assigns strings to warps and each warp collectively copies the characters with large data type. This kernel is best suited for large strings. - Char-parallel kernel assigns characters to threads. This is similar to the existing implementation, except this PR uses shared memory and assigns a fixed number of strings per threadblock to improve binary search performance. This kernel is best suited for small strings. This PR uses one of the two kernels depending on the average string size. The following benchmark results are collected on V100 through `./gbenchmarks/STRINGS_BENCH --benchmark_filter=StringCopy/gather` Before this PR at `8a504d19c725e0ff01e28f36e5f1daf02fbf86c4`: ``` ---------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... ---------------------------------------------------------------------------------------------------- StringCopy/gather/4096/32/manual_time 0.127 ms 0.151 ms 5269 bytes_per_second=545.365M/s StringCopy/gather/4096/128/manual_time 0.130 ms 0.154 ms 5135 bytes_per_second=2.0329G/s StringCopy/gather/4096/512/manual_time 0.156 ms 0.179 ms 4331 bytes_per_second=6.85358G/s StringCopy/gather/4096/2048/manual_time 0.255 ms 0.277 ms 2731 bytes_per_second=16.711G/s StringCopy/gather/4096/8192/manual_time 0.650 ms 0.673 ms 1076 bytes_per_second=26.0888G/s StringCopy/gather/32768/32/manual_time 0.148 ms 0.171 ms 4602 bytes_per_second=3.64833G/s StringCopy/gather/32768/128/manual_time 0.206 ms 0.228 ms 3345 bytes_per_second=10.3745G/s StringCopy/gather/32768/512/manual_time 0.438 ms 0.462 ms 1599 bytes_per_second=19.421G/s StringCopy/gather/32768/2048/manual_time 1.38 ms 1.40 ms 506 bytes_per_second=24.7168G/s StringCopy/gather/32768/8192/manual_time 5.14 ms 5.16 ms 136 bytes_per_second=26.5093G/s StringCopy/gather/262144/32/manual_time 0.336 ms 0.358 ms 2082 bytes_per_second=12.8318G/s StringCopy/gather/262144/128/manual_time 0.878 ms 0.901 ms 795 bytes_per_second=19.4286G/s StringCopy/gather/262144/512/manual_time 3.05 ms 3.07 ms 229 bytes_per_second=22.3358G/s StringCopy/gather/262144/2048/manual_time 11.8 ms 11.8 ms 59 bytes_per_second=23.2139G/s StringCopy/gather/2097152/32/manual_time 2.05 ms 2.07 ms 341 bytes_per_second=16.8261G/s StringCopy/gather/2097152/128/manual_time 6.96 ms 6.99 ms 100 bytes_per_second=19.6048G/s StringCopy/gather/2097152/512/manual_time 26.7 ms 26.7 ms 26 bytes_per_second=20.434G/s StringCopy/gather/16777216/32/manual_time 19.0 ms 19.0 ms 37 bytes_per_second=14.5447G/s StringCopy/gather/67108864/2/manual_time 34.1 ms 34.2 ms 20 bytes_per_second=2.01153G/s ``` This PR: ``` ---------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... ---------------------------------------------------------------------------------------------------- StringCopy/gather/4096/32/manual_time 0.105 ms 0.127 ms 6430 bytes_per_second=660.581M/s StringCopy/gather/4096/128/manual_time 0.103 ms 0.125 ms 6383 bytes_per_second=2.57612G/s StringCopy/gather/4096/512/manual_time 0.105 ms 0.126 ms 6249 bytes_per_second=10.2033G/s StringCopy/gather/4096/2048/manual_time 0.114 ms 0.134 ms 6114 bytes_per_second=37.5549G/s StringCopy/gather/4096/8192/manual_time 0.155 ms 0.178 ms 4547 bytes_per_second=109.744G/s StringCopy/gather/32768/32/manual_time 0.109 ms 0.130 ms 6210 bytes_per_second=4.9546G/s StringCopy/gather/32768/128/manual_time 0.124 ms 0.145 ms 5441 bytes_per_second=17.1911G/s StringCopy/gather/32768/512/manual_time 0.137 ms 0.159 ms 5057 bytes_per_second=62.082G/s StringCopy/gather/32768/2048/manual_time 0.209 ms 0.232 ms 3362 bytes_per_second=163.045G/s StringCopy/gather/32768/8192/manual_time 0.526 ms 0.549 ms 1332 bytes_per_second=259.064G/s StringCopy/gather/262144/32/manual_time 0.184 ms 0.205 ms 3777 bytes_per_second=23.4435G/s StringCopy/gather/262144/128/manual_time 0.328 ms 0.349 ms 2132 bytes_per_second=51.986G/s StringCopy/gather/262144/512/manual_time 0.400 ms 0.421 ms 1751 bytes_per_second=170.506G/s StringCopy/gather/262144/2048/manual_time 0.965 ms 0.987 ms 725 bytes_per_second=282.969G/s StringCopy/gather/2097152/32/manual_time 1.10 ms 1.12 ms 637 bytes_per_second=31.35G/s StringCopy/gather/2097152/128/manual_time 1.92 ms 1.94 ms 364 bytes_per_second=71.1531G/s StringCopy/gather/2097152/512/manual_time 2.48 ms 2.50 ms 282 bytes_per_second=220.297G/s StringCopy/gather/16777216/32/manual_time 11.0 ms 11.0 ms 64 bytes_per_second=25.0771G/s StringCopy/gather/67108864/2/manual_time 33.7 ms 33.7 ms 21 bytes_per_second=2.03768G/s ``` When there are enough strings and string sizes are large (e.g. ` StringCopy/gather/262144/2048`), this PR improves throughput from 23.21 GB/s to 282.97 GB/s, which is a 12x improvement. For large strings, the ncu profile on 524288 strings, with average string size of 2048, shows the kernel takes 3.48ms, so achieved throughput is 308.55GB/s (which is 68.7% of DRAM SOL on V100). Authors: - https://github.com/gaohao95 Approvers: - David Wendt (https://github.com/davidwendt) - Nikolay Sakharnykh (https://github.com/nsakharnykh) - Jake Hemstad (https://github.com/jrhemstad) - Nghia Truong (https://github.com/ttnghia) URL: #7980
rapidsai · Jun 2, 2021 · 53b0170 · 53b0170
1 parent c179b7d
commit 53b0170
Show file tree

Hide file tree

Showing 2 changed files with 190 additions and 14 deletions.
diff --git a/cpp/benchmarks/string/copy_benchmark.cpp b/cpp/benchmarks/string/copy_benchmark.cpp
@@ -74,6 +74,9 @@ static void generate_bench_args(benchmark::internal::Benchmark* b)
   int const max_rowlen = 1 << 13;
   int const len_mult   = 4;
   generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
+
+  // Benchmark for very small strings
+  b->Args({67108864, 2});
 }
 
 #define COPY_BENCHMARK_DEFINE(name)                           \

diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh
@@ -18,6 +18,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/span.hpp>
@@ -34,6 +35,169 @@ namespace cudf {
 namespace strings {
 namespace detail {
 
+// Helper function for loading 16B from a potentially unaligned memory location to registers.
+__forceinline__ __device__ uint4 load_uint4(const char* ptr)
+{
+  auto const offset       = reinterpret_cast<std::uintptr_t>(ptr) % 4;
+  auto const* aligned_ptr = reinterpret_cast<unsigned int const*>(ptr - offset);
+  auto const shift        = offset * 8;
+
+  uint4 regs = {aligned_ptr[0], aligned_ptr[1], aligned_ptr[2], aligned_ptr[3]};
+  uint tail  = 0;
+  if (shift) tail = aligned_ptr[4];
+
+  regs.x = __funnelshift_r(regs.x, regs.y, shift);
+  regs.y = __funnelshift_r(regs.y, regs.z, shift);
+  regs.z = __funnelshift_r(regs.z, regs.w, shift);
+  regs.w = __funnelshift_r(regs.w, tail, shift);
+
+  return regs;
+}
+
+/**
+ * @brief Gather characters from the input iterator, with string parallel strategy.
+ *
+ * This strategy assigns strings to warps so that each warp can cooperatively copy from the input
+ * location of the string to the corresponding output location. Large datatype (uint4) is used for
+ * stores. This strategy is best suited for large strings.
+ *
+ * @tparam StringIterator Iterator should produce `string_view` objects.
+ * @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`.
+ *
+ * @param strings_begin Start of the iterator to retrieve `string_view` instances.
+ * @param out_chars Output buffer for gathered characters.
+ * @param out_offsets The offset values associated with the output buffer.
+ * @param string_indices Start of index iterator.
+ * @param total_out_strings Number of output strings to be gathered.
+ */
+template <typename StringIterator, typename MapIterator>
+__global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
+                                                char* out_chars,
+                                                cudf::device_span<int32_t const> const out_offsets,
+                                                MapIterator string_indices,
+                                                size_type total_out_strings)
+{
+  constexpr size_t out_datatype_size = sizeof(uint4);
+  constexpr size_t in_datatype_size  = sizeof(uint);
+
+  int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  int global_warp_id   = global_thread_id / cudf::detail::warp_size;
+  int warp_lane        = global_thread_id % cudf::detail::warp_size;
+  int nwarps           = gridDim.x * blockDim.x / cudf::detail::warp_size;
+
+  auto const alignment_offset = reinterpret_cast<std::uintptr_t>(out_chars) % out_datatype_size;
+  uint4* out_chars_aligned    = reinterpret_cast<uint4*>(out_chars - alignment_offset);
+
+  for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) {
+    auto const out_start = out_offsets[istring];
+    auto const out_end   = out_offsets[istring + 1];
+
+    // This check is necessary because string_indices[istring] may be out of bound.
+    if (out_start == out_end) continue;
+
+    const char* in_start = strings_begin[string_indices[istring]].data();
+
+    // Both `out_start_aligned` and `out_end_aligned` are indices into `out_chars`.
+    // `out_start_aligned` is the first 16B aligned memory location after `out_start + 4`.
+    // `out_end_aligned` is the last 16B aligned memory location before `out_end - 4`. Characters
+    // between `[out_start_aligned, out_end_aligned)` will be copied using uint4.
+    // `out_start + 4` and `out_end - 4` are used instead of `out_start` and `out_end` to avoid
+    // `load_uint4` reading beyond string boundaries.
+    int32_t out_start_aligned =
+      (out_start + in_datatype_size + alignment_offset + out_datatype_size - 1) /
+        out_datatype_size * out_datatype_size -
+      alignment_offset;
+    int32_t out_end_aligned =
+      (out_end - in_datatype_size + alignment_offset) / out_datatype_size * out_datatype_size -
+      alignment_offset;
+
+    for (size_type ichar = out_start_aligned + warp_lane * out_datatype_size;
+         ichar < out_end_aligned;
+         ichar += cudf::detail::warp_size * out_datatype_size) {
+      *(out_chars_aligned + (ichar + alignment_offset) / out_datatype_size) =
+        load_uint4(in_start + ichar - out_start);
+    }
+
+    // Tail logic: copy characters of the current string outside `[out_start_aligned,
+    // out_end_aligned)`.
+    if (out_end_aligned <= out_start_aligned) {
+      // In this case, `[out_start_aligned, out_end_aligned)` is an empty set, and we copy the
+      // entire string.
+      for (int32_t ichar = out_start + warp_lane; ichar < out_end;
+           ichar += cudf::detail::warp_size) {
+        out_chars[ichar] = in_start[ichar - out_start];
+      }
+    } else {
+      // Copy characters in range `[out_start, out_start_aligned)`.
+      if (out_start + warp_lane < out_start_aligned) {
+        out_chars[out_start + warp_lane] = in_start[warp_lane];
+      }
+      // Copy characters in range `[out_end_aligned, out_end)`.
+      int32_t ichar = out_end_aligned + warp_lane;
+      if (ichar < out_end) { out_chars[ichar] = in_start[ichar - out_start]; }
+    }
+  }
+}
+
+/**
+ * @brief Gather characters from the input iterator, with char parallel strategy.
+ *
+ * This strategy assigns characters to threads, and uses binary search for getting the string
+ * index. To improve the binary search performance, fixed number of strings per threadblock is
+ * used. This strategy is best suited for small strings.
+ *
+ * @tparam StringIterator Iterator should produce `string_view` objects.
+ * @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`.
+ *
+ * @param strings_begin Start of the iterator to retrieve `string_view` instances.
+ * @param out_chars Output buffer for gathered characters.
+ * @param out_offsets The offset values associated with the output buffer.
+ * @param string_indices Start of index iterator.
+ * @param total_out_strings Number of output strings to be gathered.
+ */
+template <int strings_per_threadblock, typename StringIterator, typename MapIterator>
+__global__ void gather_chars_fn_char_parallel(StringIterator strings_begin,
+                                              char* out_chars,
+                                              cudf::device_span<int32_t const> const out_offsets,
+                                              MapIterator string_indices,
+                                              size_type total_out_strings)
+{
+  __shared__ int32_t out_offsets_threadblock[strings_per_threadblock + 1];
+
+  // Current thread block will process output strings starting at `begin_out_string_idx`.
+  size_type begin_out_string_idx = blockIdx.x * strings_per_threadblock;
+
+  // Number of strings to be processed by the current threadblock.
+  size_type strings_current_threadblock =
+    min(strings_per_threadblock, total_out_strings - begin_out_string_idx);
+
+  if (strings_current_threadblock <= 0) return;
+
+  // Collectively load offsets of strings processed by the current thread block.
+  for (size_type idx = threadIdx.x; idx <= strings_current_threadblock; idx += blockDim.x) {
+    out_offsets_threadblock[idx] = out_offsets[idx + begin_out_string_idx];
+  }
+  __syncthreads();
+
+  for (int32_t out_ibyte = threadIdx.x + out_offsets_threadblock[0];
+       out_ibyte < out_offsets_threadblock[strings_current_threadblock];
+       out_ibyte += blockDim.x) {
+    // binary search for the string index corresponding to out_ibyte
+    auto const string_idx_iter =
+      thrust::prev(thrust::upper_bound(thrust::seq,
+                                       out_offsets_threadblock,
+                                       out_offsets_threadblock + strings_current_threadblock,
+                                       out_ibyte));
+    size_type string_idx = thrust::distance(out_offsets_threadblock, string_idx_iter);
+
+    // calculate which character to load within the string
+    int32_t icharacter = out_ibyte - out_offsets_threadblock[string_idx];
+
+    size_type in_string_idx = string_indices[begin_out_string_idx + string_idx];
+    out_chars[out_ibyte]    = strings_begin[in_string_idx].data()[icharacter];
+  }
+}
+
 /**
  * @brief Returns a new chars column using the specified indices to select
  * strings from the input iterator.
@@ -44,7 +208,7 @@ namespace detail {
  * @tparam StringIterator Iterator should produce `string_view` objects.
  * @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`.
  *
- * @param strings_begin Start of the iterator to retrieve `string_view` instances
+ * @param strings_begin Start of the iterator to retrieve `string_view` instances.
  * @param map_begin Start of index iterator.
  * @param map_end End of index iterator.
  * @param offsets The offset values to be associated with the output chars column.
@@ -68,20 +232,29 @@ std::unique_ptr<cudf::column> gather_chars(StringIterator strings_begin,
   auto chars_column  = create_chars_child_column(output_count, chars_bytes, stream, mr);
   auto const d_chars = chars_column->mutable_view().template data<char>();
 
-  auto gather_chars_fn = [strings_begin, map_begin, offsets] __device__(size_type out_idx) -> char {
-    auto const out_row =
-      thrust::prev(thrust::upper_bound(thrust::seq, offsets.begin(), offsets.end(), out_idx));
-    auto const row_idx = map_begin[thrust::distance(offsets.begin(), out_row)];  // get row index
-    auto const d_str   = strings_begin[row_idx];                                 // get row's string
-    auto const offset  = out_idx - *out_row;  // get string's char
-    return d_str.data()[offset];
-  };
+  constexpr int warps_per_threadblock = 4;
+  // String parallel strategy will be used if average string length is above this threshold.
+  // Otherwise, char parallel strategy will be used.
+  constexpr size_type string_parallel_threshold = 32;
 
-  thrust::transform(rmm::exec_policy(stream),
-                    thrust::make_counting_iterator<size_type>(0),
-                    thrust::make_counting_iterator<size_type>(chars_bytes),
-                    d_chars,
-                    gather_chars_fn);
+  size_type average_string_length = chars_bytes / output_count;
+
+  if (average_string_length > string_parallel_threshold) {
+    constexpr int max_threadblocks = 65536;
+    gather_chars_fn_string_parallel<<<
+      min((static_cast<int>(output_count) + warps_per_threadblock - 1) / warps_per_threadblock,
+          max_threadblocks),
+      warps_per_threadblock * cudf::detail::warp_size,
+      0,
+      stream.value()>>>(strings_begin, d_chars, offsets, map_begin, output_count);
+  } else {
+    constexpr int strings_per_threadblock = 32;
+    gather_chars_fn_char_parallel<strings_per_threadblock>
+      <<<(output_count + strings_per_threadblock - 1) / strings_per_threadblock,
+         warps_per_threadblock * cudf::detail::warp_size,
+         0,
+         stream.value()>>>(strings_begin, d_chars, offsets, map_begin, output_count);
+  }
 
   return chars_column;
 }