Skip to content

Commit

Permalink
Remove the use of volatile in Parquet (#14448)
Browse files Browse the repository at this point in the history
`volatile` should no be required in our code, unless there are compiler or synchronization issues.
This PR removes the use in Parquet reader and writer.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Nghia Truong (https://github.com/ttnghia)

URL: #14448
  • Loading branch information
vuule authored Nov 20, 2023
1 parent c8603c2 commit 5831beb
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 40 deletions.
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/decode_preprocess.cu
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ __device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t)
} else if ((s->col.data_type & 7) == BYTE_ARRAY) {
str_len = gpuInitStringDescriptors<true, unused_state_buf>(s, nullptr, target_pos, t);
}
if (!t) { *(int32_t volatile*)&s->dict_pos = target_pos; }
if (!t) { s->dict_pos = target_pos; }
return str_len;
}

Expand Down
31 changes: 11 additions & 20 deletions cpp/src/io/parquet/page_data.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,7 @@ constexpr int rolling_buf_size = decode_block_size * 2;
* @param[in] dstv Pointer to row output data (string descriptor or 32-bit hash)
*/
template <typename state_buf>
inline __device__ void gpuOutputString(volatile page_state_s* s,
volatile state_buf* sb,
int src_pos,
void* dstv)
inline __device__ void gpuOutputString(page_state_s* s, state_buf* sb, int src_pos, void* dstv)
{
auto [ptr, len] = gpuGetStringData(s, sb, src_pos);
// make sure to only hash `BYTE_ARRAY` when specified with the output type size
Expand All @@ -69,7 +66,7 @@ inline __device__ void gpuOutputString(volatile page_state_s* s,
* @param[in] dst Pointer to row output data
*/
template <typename state_buf>
inline __device__ void gpuOutputBoolean(volatile state_buf* sb, int src_pos, uint8_t* dst)
inline __device__ void gpuOutputBoolean(state_buf* sb, int src_pos, uint8_t* dst)
{
*dst = sb->dict_idx[rolling_index<state_buf::dict_buf_size>(src_pos)];
}
Expand Down Expand Up @@ -143,8 +140,8 @@ inline __device__ void gpuStoreOutput(uint2* dst,
* @param[out] dst Pointer to row output data
*/
template <typename state_buf>
inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s,
volatile state_buf* sb,
inline __device__ void gpuOutputInt96Timestamp(page_state_s* s,
state_buf* sb,
int src_pos,
int64_t* dst)
{
Expand Down Expand Up @@ -218,8 +215,8 @@ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s,
* @param[in] dst Pointer to row output data
*/
template <typename state_buf>
inline __device__ void gpuOutputInt64Timestamp(volatile page_state_s* s,
volatile state_buf* sb,
inline __device__ void gpuOutputInt64Timestamp(page_state_s* s,
state_buf* sb,
int src_pos,
int64_t* dst)
{
Expand Down Expand Up @@ -301,10 +298,7 @@ __device__ void gpuOutputByteArrayAsInt(char const* ptr, int32_t len, T* dst)
* @param[in] dst Pointer to row output data
*/
template <typename T, typename state_buf>
__device__ void gpuOutputFixedLenByteArrayAsInt(volatile page_state_s* s,
volatile state_buf* sb,
int src_pos,
T* dst)
__device__ void gpuOutputFixedLenByteArrayAsInt(page_state_s* s, state_buf* sb, int src_pos, T* dst)
{
uint32_t const dtype_len_in = s->dtype_len_in;
uint8_t const* data = s->dict_base ? s->dict_base : s->data_start;
Expand Down Expand Up @@ -338,10 +332,7 @@ __device__ void gpuOutputFixedLenByteArrayAsInt(volatile page_state_s* s,
* @param[in] dst Pointer to row output data
*/
template <typename T, typename state_buf>
inline __device__ void gpuOutputFast(volatile page_state_s* s,
volatile state_buf* sb,
int src_pos,
T* dst)
inline __device__ void gpuOutputFast(page_state_s* s, state_buf* sb, int src_pos, T* dst)
{
uint8_t const* dict;
uint32_t dict_pos, dict_size = s->dict_size;
Expand Down Expand Up @@ -371,7 +362,7 @@ inline __device__ void gpuOutputFast(volatile page_state_s* s,
*/
template <typename state_buf>
static __device__ void gpuOutputGeneric(
volatile page_state_s* s, volatile state_buf* sb, int src_pos, uint8_t* dst8, int len)
page_state_s* s, state_buf* sb, int src_pos, uint8_t* dst8, int len)
{
uint8_t const* dict;
uint32_t dict_pos, dict_size = s->dict_size;
Expand Down Expand Up @@ -512,7 +503,7 @@ __global__ void __launch_bounds__(decode_block_size)
(s->col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) {
gpuInitStringDescriptors<false>(s, sb, src_target_pos, t & 0x1f);
}
if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; }
if (t == 32) { s->dict_pos = src_target_pos; }
} else {
// WARP1..WARP3: Decode values
int const dtype = s->col.data_type & 7;
Expand Down Expand Up @@ -601,7 +592,7 @@ __global__ void __launch_bounds__(decode_block_size)
}
}

if (t == out_thread0) { *(volatile int32_t*)&s->src_pos = target_pos; }
if (t == out_thread0) { s->src_pos = target_pos; }
}
__syncthreads();
}
Expand Down
29 changes: 13 additions & 16 deletions cpp/src/io/parquet/page_decode.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -71,15 +71,15 @@ struct page_state_s {
// points to either nesting_decode_cache above when possible, or to the global source otherwise
PageNestingDecodeInfo* nesting_info{};

inline __device__ void set_error_code(decode_error err) volatile
inline __device__ void set_error_code(decode_error err)
{
cuda::atomic_ref<int32_t, cuda::thread_scope_block> ref{const_cast<int&>(error)};
cuda::atomic_ref<int32_t, cuda::thread_scope_block> ref{error};
ref.fetch_or(static_cast<int32_t>(err), cuda::std::memory_order_relaxed);
}

inline __device__ void reset_error_code() volatile
inline __device__ void reset_error_code()
{
cuda::atomic_ref<int32_t, cuda::thread_scope_block> ref{const_cast<int&>(error)};
cuda::atomic_ref<int32_t, cuda::thread_scope_block> ref{error};
ref.store(0, cuda::std::memory_order_release);
}
};
Expand Down Expand Up @@ -185,8 +185,8 @@ inline __device__ bool is_page_contained(page_state_s* const s, size_t start_row
* @return A pair containing a pointer to the string and its length
*/
template <typename state_buf>
inline __device__ cuda::std::pair<char const*, size_t> gpuGetStringData(page_state_s volatile* s,
state_buf volatile* sb,
inline __device__ cuda::std::pair<char const*, size_t> gpuGetStringData(page_state_s* s,
state_buf* sb,
int src_pos)
{
char const* ptr = nullptr;
Expand Down Expand Up @@ -232,8 +232,10 @@ inline __device__ cuda::std::pair<char const*, size_t> gpuGetStringData(page_sta
* additional values.
*/
template <bool sizes_only, typename state_buf>
__device__ cuda::std::pair<int, int> gpuDecodeDictionaryIndices(
page_state_s volatile* s, [[maybe_unused]] state_buf volatile* sb, int target_pos, int t)
__device__ cuda::std::pair<int, int> gpuDecodeDictionaryIndices(page_state_s* s,
[[maybe_unused]] state_buf* sb,
int target_pos,
int t)
{
uint8_t const* end = s->data_end;
int dict_bits = s->dict_bits;
Expand Down Expand Up @@ -349,10 +351,7 @@ __device__ cuda::std::pair<int, int> gpuDecodeDictionaryIndices(
* @return The new output position
*/
template <typename state_buf>
inline __device__ int gpuDecodeRleBooleans(page_state_s volatile* s,
state_buf volatile* sb,
int target_pos,
int t)
inline __device__ int gpuDecodeRleBooleans(page_state_s* s, state_buf* sb, int target_pos, int t)
{
uint8_t const* end = s->data_end;
int64_t pos = s->dict_pos;
Expand Down Expand Up @@ -420,10 +419,8 @@ inline __device__ int gpuDecodeRleBooleans(page_state_s volatile* s,
* @return Total length of strings processed
*/
template <bool sizes_only, typename state_buf>
__device__ size_type gpuInitStringDescriptors(page_state_s volatile* s,
[[maybe_unused]] state_buf volatile* sb,
int target_pos,
int t)
__device__ size_type
gpuInitStringDescriptors(page_state_s* s, [[maybe_unused]] state_buf* sb, int target_pos, int t)
{
int pos = s->dict_pos;
int total_len = 0;
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/page_enc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ struct page_enc_state_s {
uint32_t rle_rpt_count;
uint32_t page_start_val;
uint32_t chunk_start_val;
volatile uint32_t rpt_map[num_encode_warps];
uint32_t rpt_map[num_encode_warps];
EncPage page;
EncColumnChunk ck;
parquet_column_device_view col;
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/io/parquet/page_string_decode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -851,7 +851,7 @@ __global__ void __launch_bounds__(decode_block_size)
} else {
gpuInitStringDescriptors<false>(s, sb, src_target_pos, lane_id);
}
if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; }
if (t == 32) { s->dict_pos = src_target_pos; }
} else {
int const me = t - out_thread0;

Expand Down Expand Up @@ -934,7 +934,7 @@ __global__ void __launch_bounds__(decode_block_size)
}
}

if (t == out_thread0) { *(volatile int32_t*)&s->src_pos = target_pos; }
if (t == out_thread0) { s->src_pos = target_pos; }
}
__syncthreads();
}
Expand Down

0 comments on commit 5831beb

Please sign in to comment.