From 5831beb80dab9cc23668b5a701d9a92a4797fe70 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Mon, 20 Nov 2023 15:39:35 -0800
Subject: [PATCH] Remove the use of `volatile` in Parquet (#14448)

`volatile` should no be required in our code, unless there are compiler or synchronization issues.
This PR removes the use in Parquet reader and writer.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/14448
---
 cpp/src/io/parquet/decode_preprocess.cu  |  2 +-
 cpp/src/io/parquet/page_data.cu          | 31 +++++++++---------------
 cpp/src/io/parquet/page_decode.cuh       | 29 ++++++++++------------
 cpp/src/io/parquet/page_enc.cu           |  2 +-
 cpp/src/io/parquet/page_string_decode.cu |  4 +--
 5 files changed, 28 insertions(+), 40 deletions(-)
diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu
index 544c93ee616..d9f91ed564c 100644
--- a/cpp/src/io/parquet/decode_preprocess.cu
+++ b/cpp/src/io/parquet/decode_preprocess.cu
@@ -61,7 +61,7 @@ __device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t)
   } else if ((s->col.data_type & 7) == BYTE_ARRAY) {
     str_len = gpuInitStringDescriptors<true, unused_state_buf>(s, nullptr, target_pos, t);
   }
-  if (!t) { *(int32_t volatile*)&s->dict_pos = target_pos; }
+  if (!t) { s->dict_pos = target_pos; }
   return str_len;
 }
 
diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu
index 0c53877f7c7..1a94f05498e 100644
--- a/cpp/src/io/parquet/page_data.cu
+++ b/cpp/src/io/parquet/page_data.cu
@@ -39,10 +39,7 @@ constexpr int rolling_buf_size  = decode_block_size * 2;
  * @param[in] dstv Pointer to row output data (string descriptor or 32-bit hash)
  */
 template <typename state_buf>
-inline __device__ void gpuOutputString(volatile page_state_s* s,
-                                       volatile state_buf* sb,
-                                       int src_pos,
-                                       void* dstv)
+inline __device__ void gpuOutputString(page_state_s* s, state_buf* sb, int src_pos, void* dstv)
 {
   auto [ptr, len] = gpuGetStringData(s, sb, src_pos);
   // make sure to only hash `BYTE_ARRAY` when specified with the output type size
@@ -69,7 +66,7 @@ inline __device__ void gpuOutputString(volatile page_state_s* s,
  * @param[in] dst Pointer to row output data
  */
 template <typename state_buf>
-inline __device__ void gpuOutputBoolean(volatile state_buf* sb, int src_pos, uint8_t* dst)
+inline __device__ void gpuOutputBoolean(state_buf* sb, int src_pos, uint8_t* dst)
 {
   *dst = sb->dict_idx[rolling_index<state_buf::dict_buf_size>(src_pos)];
 }
@@ -143,8 +140,8 @@ inline __device__ void gpuStoreOutput(uint2* dst,
  * @param[out] dst Pointer to row output data
  */
 template <typename state_buf>
-inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s,
-                                               volatile state_buf* sb,
+inline __device__ void gpuOutputInt96Timestamp(page_state_s* s,
+                                               state_buf* sb,
                                                int src_pos,
                                                int64_t* dst)
 {
@@ -218,8 +215,8 @@ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s,
  * @param[in] dst Pointer to row output data
  */
 template <typename state_buf>
-inline __device__ void gpuOutputInt64Timestamp(volatile page_state_s* s,
-                                               volatile state_buf* sb,
+inline __device__ void gpuOutputInt64Timestamp(page_state_s* s,
+                                               state_buf* sb,
                                                int src_pos,
                                                int64_t* dst)
 {
@@ -301,10 +298,7 @@ __device__ void gpuOutputByteArrayAsInt(char const* ptr, int32_t len, T* dst)
  * @param[in] dst Pointer to row output data
  */
 template <typename T, typename state_buf>
-__device__ void gpuOutputFixedLenByteArrayAsInt(volatile page_state_s* s,
-                                                volatile state_buf* sb,
-                                                int src_pos,
-                                                T* dst)
+__device__ void gpuOutputFixedLenByteArrayAsInt(page_state_s* s, state_buf* sb, int src_pos, T* dst)
 {
   uint32_t const dtype_len_in = s->dtype_len_in;
   uint8_t const* data         = s->dict_base ? s->dict_base : s->data_start;
@@ -338,10 +332,7 @@ __device__ void gpuOutputFixedLenByteArrayAsInt(volatile page_state_s* s,
  * @param[in] dst Pointer to row output data
  */
 template <typename T, typename state_buf>
-inline __device__ void gpuOutputFast(volatile page_state_s* s,
-                                     volatile state_buf* sb,
-                                     int src_pos,
-                                     T* dst)
+inline __device__ void gpuOutputFast(page_state_s* s, state_buf* sb, int src_pos, T* dst)
 {
   uint8_t const* dict;
   uint32_t dict_pos, dict_size = s->dict_size;
@@ -371,7 +362,7 @@ inline __device__ void gpuOutputFast(volatile page_state_s* s,
  */
 template <typename state_buf>
 static __device__ void gpuOutputGeneric(
-  volatile page_state_s* s, volatile state_buf* sb, int src_pos, uint8_t* dst8, int len)
+  page_state_s* s, state_buf* sb, int src_pos, uint8_t* dst8, int len)
 {
   uint8_t const* dict;
   uint32_t dict_pos, dict_size = s->dict_size;
@@ -512,7 +503,7 @@ __global__ void __launch_bounds__(decode_block_size)
                  (s->col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) {
         gpuInitStringDescriptors<false>(s, sb, src_target_pos, t & 0x1f);
       }
-      if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; }
+      if (t == 32) { s->dict_pos = src_target_pos; }
     } else {
       // WARP1..WARP3: Decode values
       int const dtype = s->col.data_type & 7;
@@ -601,7 +592,7 @@ __global__ void __launch_bounds__(decode_block_size)
         }
       }
 
-      if (t == out_thread0) { *(volatile int32_t*)&s->src_pos = target_pos; }
+      if (t == out_thread0) { s->src_pos = target_pos; }
     }
     __syncthreads();
   }
diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh
index 4db9bd3904b..a521f4af039 100644
--- a/cpp/src/io/parquet/page_decode.cuh
+++ b/cpp/src/io/parquet/page_decode.cuh
@@ -71,15 +71,15 @@ struct page_state_s {
   // points to either nesting_decode_cache above when possible, or to the global source otherwise
   PageNestingDecodeInfo* nesting_info{};
 
-  inline __device__ void set_error_code(decode_error err) volatile
+  inline __device__ void set_error_code(decode_error err)
   {
-    cuda::atomic_ref<int32_t, cuda::thread_scope_block> ref{const_cast<int&>(error)};
+    cuda::atomic_ref<int32_t, cuda::thread_scope_block> ref{error};
     ref.fetch_or(static_cast<int32_t>(err), cuda::std::memory_order_relaxed);
   }
 
-  inline __device__ void reset_error_code() volatile
+  inline __device__ void reset_error_code()
   {
-    cuda::atomic_ref<int32_t, cuda::thread_scope_block> ref{const_cast<int&>(error)};
+    cuda::atomic_ref<int32_t, cuda::thread_scope_block> ref{error};
     ref.store(0, cuda::std::memory_order_release);
   }
 };
@@ -185,8 +185,8 @@ inline __device__ bool is_page_contained(page_state_s* const s, size_t start_row
  * @return A pair containing a pointer to the string and its length
  */
 template <typename state_buf>
-inline __device__ cuda::std::pair<char const*, size_t> gpuGetStringData(page_state_s volatile* s,
-                                                                        state_buf volatile* sb,
+inline __device__ cuda::std::pair<char const*, size_t> gpuGetStringData(page_state_s* s,
+                                                                        state_buf* sb,
                                                                         int src_pos)
 {
   char const* ptr = nullptr;
@@ -232,8 +232,10 @@ inline __device__ cuda::std::pair<char const*, size_t> gpuGetStringData(page_sta
  * additional values.
  */
 template <bool sizes_only, typename state_buf>
-__device__ cuda::std::pair<int, int> gpuDecodeDictionaryIndices(
-  page_state_s volatile* s, [[maybe_unused]] state_buf volatile* sb, int target_pos, int t)
+__device__ cuda::std::pair<int, int> gpuDecodeDictionaryIndices(page_state_s* s,
+                                                                [[maybe_unused]] state_buf* sb,
+                                                                int target_pos,
+                                                                int t)
 {
   uint8_t const* end = s->data_end;
   int dict_bits      = s->dict_bits;
@@ -349,10 +351,7 @@ __device__ cuda::std::pair<int, int> gpuDecodeDictionaryIndices(
  * @return The new output position
  */
 template <typename state_buf>
-inline __device__ int gpuDecodeRleBooleans(page_state_s volatile* s,
-                                           state_buf volatile* sb,
-                                           int target_pos,
-                                           int t)
+inline __device__ int gpuDecodeRleBooleans(page_state_s* s, state_buf* sb, int target_pos, int t)
 {
   uint8_t const* end = s->data_end;
   int64_t pos        = s->dict_pos;
@@ -420,10 +419,8 @@ inline __device__ int gpuDecodeRleBooleans(page_state_s volatile* s,
  * @return Total length of strings processed
  */
 template <bool sizes_only, typename state_buf>
-__device__ size_type gpuInitStringDescriptors(page_state_s volatile* s,
-                                              [[maybe_unused]] state_buf volatile* sb,
-                                              int target_pos,
-                                              int t)
+__device__ size_type
+gpuInitStringDescriptors(page_state_s* s, [[maybe_unused]] state_buf* sb, int target_pos, int t)
 {
   int pos       = s->dict_pos;
   int total_len = 0;
diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index 2b7980c93e9..d75608930d5 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -90,7 +90,7 @@ struct page_enc_state_s {
   uint32_t rle_rpt_count;
   uint32_t page_start_val;
   uint32_t chunk_start_val;
-  volatile uint32_t rpt_map[num_encode_warps];
+  uint32_t rpt_map[num_encode_warps];
   EncPage page;
   EncColumnChunk ck;
   parquet_column_device_view col;
diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
index e29db042401..916eaa3d681 100644
--- a/cpp/src/io/parquet/page_string_decode.cu
+++ b/cpp/src/io/parquet/page_string_decode.cu
@@ -851,7 +851,7 @@ __global__ void __launch_bounds__(decode_block_size)
       } else {
         gpuInitStringDescriptors<false>(s, sb, src_target_pos, lane_id);
       }
-      if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; }
+      if (t == 32) { s->dict_pos = src_target_pos; }
     } else {
       int const me = t - out_thread0;
 
@@ -934,7 +934,7 @@ __global__ void __launch_bounds__(decode_block_size)
         }
       }
 
-      if (t == out_thread0) { *(volatile int32_t*)&s->src_pos = target_pos; }
+      if (t == out_thread0) { s->src_pos = target_pos; }
     }
     __syncthreads();
   }