From 5831beb80dab9cc23668b5a701d9a92a4797fe70 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 20 Nov 2023 15:39:35 -0800 Subject: [PATCH] Remove the use of `volatile` in Parquet (#14448) `volatile` should no be required in our code, unless there are compiler or synchronization issues. This PR removes the use in Parquet reader and writer. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - David Wendt (https://github.com/davidwendt) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14448 --- cpp/src/io/parquet/decode_preprocess.cu | 2 +- cpp/src/io/parquet/page_data.cu | 31 +++++++++--------------- cpp/src/io/parquet/page_decode.cuh | 29 ++++++++++------------ cpp/src/io/parquet/page_enc.cu | 2 +- cpp/src/io/parquet/page_string_decode.cu | 4 +-- 5 files changed, 28 insertions(+), 40 deletions(-) diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 544c93ee616..d9f91ed564c 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -61,7 +61,7 @@ __device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t) } else if ((s->col.data_type & 7) == BYTE_ARRAY) { str_len = gpuInitStringDescriptors(s, nullptr, target_pos, t); } - if (!t) { *(int32_t volatile*)&s->dict_pos = target_pos; } + if (!t) { s->dict_pos = target_pos; } return str_len; } diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 0c53877f7c7..1a94f05498e 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -39,10 +39,7 @@ constexpr int rolling_buf_size = decode_block_size * 2; * @param[in] dstv Pointer to row output data (string descriptor or 32-bit hash) */ template -inline __device__ void gpuOutputString(volatile page_state_s* s, - volatile state_buf* sb, - int src_pos, - void* dstv) +inline __device__ void gpuOutputString(page_state_s* s, state_buf* sb, int src_pos, void* dstv) { auto [ptr, len] = gpuGetStringData(s, sb, src_pos); // make sure to only hash `BYTE_ARRAY` when specified with the output type size @@ -69,7 +66,7 @@ inline __device__ void gpuOutputString(volatile page_state_s* s, * @param[in] dst Pointer to row output data */ template -inline __device__ void gpuOutputBoolean(volatile state_buf* sb, int src_pos, uint8_t* dst) +inline __device__ void gpuOutputBoolean(state_buf* sb, int src_pos, uint8_t* dst) { *dst = sb->dict_idx[rolling_index(src_pos)]; } @@ -143,8 +140,8 @@ inline __device__ void gpuStoreOutput(uint2* dst, * @param[out] dst Pointer to row output data */ template -inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, - volatile state_buf* sb, +inline __device__ void gpuOutputInt96Timestamp(page_state_s* s, + state_buf* sb, int src_pos, int64_t* dst) { @@ -218,8 +215,8 @@ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, * @param[in] dst Pointer to row output data */ template -inline __device__ void gpuOutputInt64Timestamp(volatile page_state_s* s, - volatile state_buf* sb, +inline __device__ void gpuOutputInt64Timestamp(page_state_s* s, + state_buf* sb, int src_pos, int64_t* dst) { @@ -301,10 +298,7 @@ __device__ void gpuOutputByteArrayAsInt(char const* ptr, int32_t len, T* dst) * @param[in] dst Pointer to row output data */ template -__device__ void gpuOutputFixedLenByteArrayAsInt(volatile page_state_s* s, - volatile state_buf* sb, - int src_pos, - T* dst) +__device__ void gpuOutputFixedLenByteArrayAsInt(page_state_s* s, state_buf* sb, int src_pos, T* dst) { uint32_t const dtype_len_in = s->dtype_len_in; uint8_t const* data = s->dict_base ? s->dict_base : s->data_start; @@ -338,10 +332,7 @@ __device__ void gpuOutputFixedLenByteArrayAsInt(volatile page_state_s* s, * @param[in] dst Pointer to row output data */ template -inline __device__ void gpuOutputFast(volatile page_state_s* s, - volatile state_buf* sb, - int src_pos, - T* dst) +inline __device__ void gpuOutputFast(page_state_s* s, state_buf* sb, int src_pos, T* dst) { uint8_t const* dict; uint32_t dict_pos, dict_size = s->dict_size; @@ -371,7 +362,7 @@ inline __device__ void gpuOutputFast(volatile page_state_s* s, */ template static __device__ void gpuOutputGeneric( - volatile page_state_s* s, volatile state_buf* sb, int src_pos, uint8_t* dst8, int len) + page_state_s* s, state_buf* sb, int src_pos, uint8_t* dst8, int len) { uint8_t const* dict; uint32_t dict_pos, dict_size = s->dict_size; @@ -512,7 +503,7 @@ __global__ void __launch_bounds__(decode_block_size) (s->col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) { gpuInitStringDescriptors(s, sb, src_target_pos, t & 0x1f); } - if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } + if (t == 32) { s->dict_pos = src_target_pos; } } else { // WARP1..WARP3: Decode values int const dtype = s->col.data_type & 7; @@ -601,7 +592,7 @@ __global__ void __launch_bounds__(decode_block_size) } } - if (t == out_thread0) { *(volatile int32_t*)&s->src_pos = target_pos; } + if (t == out_thread0) { s->src_pos = target_pos; } } __syncthreads(); } diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 4db9bd3904b..a521f4af039 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -71,15 +71,15 @@ struct page_state_s { // points to either nesting_decode_cache above when possible, or to the global source otherwise PageNestingDecodeInfo* nesting_info{}; - inline __device__ void set_error_code(decode_error err) volatile + inline __device__ void set_error_code(decode_error err) { - cuda::atomic_ref ref{const_cast(error)}; + cuda::atomic_ref ref{error}; ref.fetch_or(static_cast(err), cuda::std::memory_order_relaxed); } - inline __device__ void reset_error_code() volatile + inline __device__ void reset_error_code() { - cuda::atomic_ref ref{const_cast(error)}; + cuda::atomic_ref ref{error}; ref.store(0, cuda::std::memory_order_release); } }; @@ -185,8 +185,8 @@ inline __device__ bool is_page_contained(page_state_s* const s, size_t start_row * @return A pair containing a pointer to the string and its length */ template -inline __device__ cuda::std::pair gpuGetStringData(page_state_s volatile* s, - state_buf volatile* sb, +inline __device__ cuda::std::pair gpuGetStringData(page_state_s* s, + state_buf* sb, int src_pos) { char const* ptr = nullptr; @@ -232,8 +232,10 @@ inline __device__ cuda::std::pair gpuGetStringData(page_sta * additional values. */ template -__device__ cuda::std::pair gpuDecodeDictionaryIndices( - page_state_s volatile* s, [[maybe_unused]] state_buf volatile* sb, int target_pos, int t) +__device__ cuda::std::pair gpuDecodeDictionaryIndices(page_state_s* s, + [[maybe_unused]] state_buf* sb, + int target_pos, + int t) { uint8_t const* end = s->data_end; int dict_bits = s->dict_bits; @@ -349,10 +351,7 @@ __device__ cuda::std::pair gpuDecodeDictionaryIndices( * @return The new output position */ template -inline __device__ int gpuDecodeRleBooleans(page_state_s volatile* s, - state_buf volatile* sb, - int target_pos, - int t) +inline __device__ int gpuDecodeRleBooleans(page_state_s* s, state_buf* sb, int target_pos, int t) { uint8_t const* end = s->data_end; int64_t pos = s->dict_pos; @@ -420,10 +419,8 @@ inline __device__ int gpuDecodeRleBooleans(page_state_s volatile* s, * @return Total length of strings processed */ template -__device__ size_type gpuInitStringDescriptors(page_state_s volatile* s, - [[maybe_unused]] state_buf volatile* sb, - int target_pos, - int t) +__device__ size_type +gpuInitStringDescriptors(page_state_s* s, [[maybe_unused]] state_buf* sb, int target_pos, int t) { int pos = s->dict_pos; int total_len = 0; diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 2b7980c93e9..d75608930d5 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -90,7 +90,7 @@ struct page_enc_state_s { uint32_t rle_rpt_count; uint32_t page_start_val; uint32_t chunk_start_val; - volatile uint32_t rpt_map[num_encode_warps]; + uint32_t rpt_map[num_encode_warps]; EncPage page; EncColumnChunk ck; parquet_column_device_view col; diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index e29db042401..916eaa3d681 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -851,7 +851,7 @@ __global__ void __launch_bounds__(decode_block_size) } else { gpuInitStringDescriptors(s, sb, src_target_pos, lane_id); } - if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } + if (t == 32) { s->dict_pos = src_target_pos; } } else { int const me = t - out_thread0; @@ -934,7 +934,7 @@ __global__ void __launch_bounds__(decode_block_size) } } - if (t == out_thread0) { *(volatile int32_t*)&s->src_pos = target_pos; } + if (t == out_thread0) { s->src_pos = target_pos; } } __syncthreads(); }