Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Address potential race conditions in Parquet reader #14602

Merged
merged 20 commits into from
Dec 15, 2023
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cpp/src/io/parquet/delta_binary.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,8 @@ struct delta_binary_decoder {

// need to account for the first value from header on first pass
if (current_value_idx == 0) {
// make sure all threads access current_value_idx above before incrementing
__syncwarp();
nvdbaranec marked this conversation as resolved.
Show resolved Hide resolved
if (lane_id == 0) { current_value_idx++; }
__syncwarp();
if (current_value_idx >= value_count) { return; }
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/page_data.cu
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,7 @@ __global__ void __launch_bounds__(decode_block_size)
target_pos = min(s->nz_count, src_pos + decode_block_size - out_thread0);
if (out_thread0 > 32) { target_pos = min(target_pos, s->dict_pos); }
}
// TODO(ets): see if this sync can be removed
// this needs to be here to prevent warp 3 modifying src_pos before all threads have read it
__syncthreads();
if (t < 32) {
// decode repetition and definition levels.
Expand Down
20 changes: 20 additions & 0 deletions cpp/src/io/parquet/page_decode.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,9 @@ __device__ cuda::std::pair<int, int> gpuDecodeDictionaryIndices(page_state_s* s,
int pos = s->dict_pos;
int str_len = 0;

// ensure all threads read s->dict_pos before returning
__syncwarp();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure about this one. The return value from this function is explicitly stated to only be valid on thread 0. Looking at all the call sites, it's always thread 0 that actually does any work with the value.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, this is kind of like the one in gpuUpdateValidityOffsetsAndRowIndices, except here the assignment back to s->dict_pos is done after this call returns. If the loop is entered, then all threads will hit the syncwarp there. It's only an issue if pos >= target_pos. Given this has worked without problems for quite some time, I can get rid of this and the one in gpuDecodeRleBooleans.


while (pos < target_pos) {
int is_literal, batch_len;
if (!t) {
Expand Down Expand Up @@ -357,6 +360,11 @@ inline __device__ int gpuDecodeRleBooleans(page_state_s* s, state_buf* sb, int t
uint8_t const* end = s->data_end;
int64_t pos = s->dict_pos;

// ensure all threads read s->dict_pos before returning
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment as the one in gpuDecodeDictionaryIndices

// NOTE: Removing this does not trigger any race warnings, but is similar to access pattern
// in gpuDecodeDictionaryIndices which does.
__syncwarp();

while (pos < target_pos) {
int is_literal, batch_len;
if (!t) {
Expand Down Expand Up @@ -426,6 +434,11 @@ gpuInitStringDescriptors(page_state_s* s, [[maybe_unused]] state_buf* sb, int ta
int pos = s->dict_pos;
int total_len = 0;

// ensure all threads read s->dict_pos before returning
// NOTE: Removing this does not trigger any race warnings, but is similar to access pattern
// in gpuDecodeDictionaryIndices which does.
__syncwarp();

// This step is purely serial
if (!t) {
uint8_t const* cur = s->data_start;
Expand Down Expand Up @@ -549,6 +562,9 @@ __device__ void gpuDecodeStream(
batch_coded_count += batch_len;
value_count += batch_len;
}
// issue #14597
// racecheck reported race between reads at the start of this function and the writes below
__syncwarp();

// update the stream info
if (!t) {
Expand Down Expand Up @@ -691,6 +707,10 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value

PageNestingDecodeInfo* nesting_info_base = s->nesting_info;

// need this to ensure input_value_count is read by all threads before s->input_value_count
// is modified below (just in case input_value_count >= target_input_value_count).
__syncwarp();
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one I'm not so sure about needing. In the worst case, thread 0 sets the local var, skips the loop (and the syncwarp within it) and then overwrites the shared value before other threads read it. But in that case it will just overwrite with the same value.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we actually need to update s->nz_count, s->input_value_count and s->input_row_count if we never enter the loop?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm thinking no...they shouldn't have changed if the loop wasn't entered. But I'll admit this is one of the parts of the parquet code that I understand the least.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If that's the case, we should be able to return early if initially input_value_count >= target_input_value_count, right?
That would simplify the logic and prevent the tool from reporting the race condition.
CC @nvdbaranec

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made the change and verified that racecheck is happy


// process until we've reached the target
while (input_value_count < target_input_value_count) {
// determine the nesting bounds for this thread (the range of nesting depths we
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/io/parquet/page_delta_decode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ __global__ void __launch_bounds__(96)
} else { // warp2
target_pos = min(s->nz_count, src_pos + batch_size);
}
// TODO(ets): see if this sync can be removed
// this needs to be here to prevent warp 2 modifying src_pos before all threads have read it
__syncthreads();

// warp0 will decode the rep/def levels, warp1 will unpack a mini-batch of deltas.
Expand Down Expand Up @@ -507,7 +507,7 @@ __global__ void __launch_bounds__(decode_block_size)
} else { // warp 3
target_pos = min(s->nz_count, src_pos + batch_size);
}
// TODO(ets): see if this sync can be removed
// this needs to be here to prevent warp 3 modifying src_pos before all threads have read it
__syncthreads();

// warp0 will decode the rep/def levels, warp1 will unpack a mini-batch of prefixes, warp 2 will
Expand Down
11 changes: 8 additions & 3 deletions cpp/src/io/parquet/page_string_decode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,11 @@ __device__ thrust::pair<int, int> page_bounds(page_state_s* const s,

// can skip all this if we know there are no nulls
if (max_def == 0 && !is_bounds_pg) {
s->page.num_valids = s->num_input_values;
s->page.num_nulls = 0;
if (t == 0) {
vuule marked this conversation as resolved.
Show resolved Hide resolved
s->page.num_valids = s->num_input_values;
s->page.num_nulls = 0;
}
__syncthreads();
return {0, s->num_input_values};
}

Expand Down Expand Up @@ -134,6 +137,7 @@ __device__ thrust::pair<int, int> page_bounds(page_state_s* const s,
pp->num_nulls = 0;
pp->num_valids = end_row - begin_row;
}
__syncthreads();
return {begin_row, end_row};
}

Expand Down Expand Up @@ -268,6 +272,7 @@ __device__ thrust::pair<int, int> page_bounds(page_state_s* const s,
pp->num_nulls = num_nulls;
pp->num_valids = total_leaf_values;
}
__syncthreads();
}
// already filtered out unwanted pages, so need to count all non-null values in this page
else {
Expand Down Expand Up @@ -848,7 +853,7 @@ __global__ void __launch_bounds__(decode_block_size)
target_pos = min(s->nz_count, src_pos + decode_block_size - out_thread0);
if (out_thread0 > 32) { target_pos = min(target_pos, s->dict_pos); }
}
// TODO(ets): see if this sync can be removed
// this needs to be here to prevent warp 1/2 modifying src_pos before all threads have read it
vuule marked this conversation as resolved.
Show resolved Hide resolved
__syncthreads();
if (t < 32) {
// decode repetition and definition levels.
Expand Down