From da04725729402f0f5b475adf863d32fef6fb898a Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 28 Sep 2022 14:13:21 -0400
Subject: [PATCH] Fix regex out-of-bounds write in strided rows logic (#11797)

Fixes an out-of-bounds write error when a large number of strings requires a strided loop to meet an internal memory maximum. For row sizes that do not require strided loops, the row index never exceeds the size of the column preventing any out-of-bounds access. For large row counts, the CUDA `thread index` may be larger than the minimal count used for building the working-memory buffer. Since the kernel is launched with a thread-count with a specific block size, extra threads past the end of the minimal count are necessary to fill out the last block. These threads never contribute to the overall result but will attempt to access past the end of the working memory. Writing to this memory may corrupt memory for another kernel launched in parallel from another CPU thread. This change adds logic to prevent the extra threads from doing any work.

Fixes #11768

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - MithunR (https://github.com/mythrocks)
  - Nghia Truong (https://github.com/ttnghia)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/11797
---
 cpp/src/strings/regex/utilities.cuh | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/cpp/src/strings/regex/utilities.cuh b/cpp/src/strings/regex/utilities.cuh
index 9a80be25b3b..3ae42368411 100644
--- a/cpp/src/strings/regex/utilities.cuh
+++ b/cpp/src/strings/regex/utilities.cuh
@@ -44,8 +44,10 @@ __global__ void for_each_kernel(ForEachFunction fn, reprog_device const d_prog,
 
   auto const thread_idx = threadIdx.x + blockIdx.x * blockDim.x;
   auto const stride     = s_prog.thread_count();
-  for (auto idx = thread_idx; idx < size; idx += stride) {
-    fn(idx, s_prog, thread_idx);
+  if (thread_idx < stride) {
+    for (auto idx = thread_idx; idx < size; idx += stride) {
+      fn(idx, s_prog, thread_idx);
+    }
   }
 }
 
@@ -79,8 +81,10 @@ __global__ void transform_kernel(TransformFunction fn,
 
   auto const thread_idx = threadIdx.x + blockIdx.x * blockDim.x;
   auto const stride     = s_prog.thread_count();
-  for (auto idx = thread_idx; idx < size; idx += stride) {
-    d_output[idx] = fn(idx, s_prog, thread_idx);
+  if (thread_idx < stride) {
+    for (auto idx = thread_idx; idx < size; idx += stride) {
+      d_output[idx] = fn(idx, s_prog, thread_idx);
+    }
   }
 }