From de8c0b8ee90629d1880953413de4b47907627958 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 6 Jan 2022 20:07:23 -0800 Subject: [PATCH] Resolve racecheck errors in ORC kernels (#9916) Running ORC Python tests with `compute-sanitizer --tool racecheck` results in a number of errors/warnings. This PR resolves the errors originating in ORC kernels. Remaining errors come from `gpu_inflate`. Adds a few missing block/warp syncs and minor clean up in the affected code. Causes ~4~2% slowdown on average in ORC reader benchmarks. Not negligible, will double check whether the changes are required, or just resolving false positives in `racecheck`. Ran the benchmarks many more times, and the average time difference is smaller than variations between runs. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Elias Stehle (https://github.com/elstehle) - Devavret Makkar (https://github.com/devavret) URL: https://github.com/rapidsai/cudf/pull/9916 --- cpp/src/io/comp/gpuinflate.cu | 17 +++++++---------- cpp/src/io/orc/stripe_data.cu | 35 ++++++++++++++++++----------------- cpp/src/io/orc/stripe_enc.cu | 7 ++----- 3 files changed, 27 insertions(+), 32 deletions(-) diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index 338af72e4c9..dab8ce1afa5 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -780,22 +780,19 @@ __device__ void process_symbols(inflate_state_s* s, int t) do { volatile uint32_t* b = &s->x.u.symqueue[batch * batch_size]; - int batch_len, pos; - int32_t symt; - uint32_t lit_mask; - + int batch_len = 0; if (t == 0) { while ((batch_len = s->x.batch_len[batch]) == 0) {} - } else { - batch_len = 0; } batch_len = shuffle(batch_len); if (batch_len < 0) { break; } - symt = (t < batch_len) ? b[t] : 256; - lit_mask = ballot(symt >= 256); - pos = min((__ffs(lit_mask) - 1) & 0xff, 32); + auto const symt = (t < batch_len) ? b[t] : 256; + auto const lit_mask = ballot(symt >= 256); + auto pos = min((__ffs(lit_mask) - 1) & 0xff, 32); + if (t == 0) { s->x.batch_len[batch] = 0; } + if (t < pos && out + t < outend) { out[t] = symt; } out += pos; batch_len -= pos; @@ -825,7 +822,7 @@ __device__ void process_symbols(inflate_state_s* s, int t) } } batch = (batch + 1) & (batch_count - 1); - } while (1); + } while (true); if (t == 0) { s->out = out; } } diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 8f8bb87d9e4..05bc25597c2 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -409,7 +409,7 @@ inline __device__ int decode_base128_varint(volatile orc_bytestream_s* bs, int p if (b > 0x7f) { b = bytestream_readbyte(bs, pos++); v = (v & 0x0fffffff) | (b << 28); - if (sizeof(T) > 4) { + if constexpr (sizeof(T) > 4) { uint32_t lo = v; uint64_t hi; v = b >> 4; @@ -650,13 +650,11 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, int t, bool has_buffered_values = false) { - uint32_t numvals, numruns; - int r, tr; - if (t == 0) { uint32_t maxpos = min(bs->len, bs->pos + (bytestream_buffer_size - 8u)); uint32_t lastpos = bs->pos; - numvals = numruns = 0; + auto numvals = 0; + auto numruns = 0; // Find the length and start location of each run while (numvals < maxvals) { uint32_t pos = lastpos; @@ -713,9 +711,9 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, } __syncthreads(); // Process the runs, 1 warp per run - numruns = rle->num_runs; - r = t >> 5; - tr = t & 0x1f; + auto const numruns = rle->num_runs; + auto const r = t >> 5; + auto const tr = t & 0x1f; for (uint32_t run = r; run < numruns; run += num_warps) { uint32_t base, pos, w, n; int mode; @@ -731,7 +729,7 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, w = 8 + (byte0 & 0x38); // 8 to 64 bits n = 3 + (byte0 & 7); // 3 to 10 values bytestream_readbe(bs, pos * 8, w, baseval); - if (sizeof(T) <= 4) { + if constexpr (sizeof(T) <= 4) { rle->baseval.u32[r] = baseval; } else { rle->baseval.u64[r] = baseval; @@ -746,7 +744,7 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, uint32_t byte3 = bytestream_readbyte(bs, pos++); uint32_t bw = 1 + (byte2 >> 5); // base value width, 1 to 8 bytes uint32_t pw = kRLEv2_W[byte2 & 0x1f]; // patch width, 1 to 64 bits - if (sizeof(T) <= 4) { + if constexpr (sizeof(T) <= 4) { uint32_t baseval, mask; bytestream_readbe(bs, pos * 8, bw * 8, baseval); mask = (1 << (bw * 8 - 1)) - 1; @@ -766,7 +764,7 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, int64_t delta; // Delta pos = decode_varint(bs, pos, baseval); - if (sizeof(T) <= 4) { + if constexpr (sizeof(T) <= 4) { rle->baseval.u32[r] = baseval; } else { rle->baseval.u64[r] = baseval; @@ -782,8 +780,9 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, pos = shuffle(pos); n = shuffle(n); w = shuffle(w); + __syncwarp(); // Not required, included to fix the racecheck warning for (uint32_t i = tr; i < n; i += 32) { - if (sizeof(T) <= 4) { + if constexpr (sizeof(T) <= 4) { if (mode == 0) { vals[base + i] = rle->baseval.u32[r]; } else if (mode == 1) { @@ -860,7 +859,7 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, if (j & i) vals[base + j] += vals[base + ((j & ~i) | (i - 1))]; } } - if (sizeof(T) <= 4) + if constexpr (sizeof(T) <= 4) baseval = rle->baseval.u32[r]; else baseval = rle->baseval.u64[r]; @@ -868,6 +867,7 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, vals[base + j] += baseval; } } + __syncwarp(); } __syncthreads(); return rle->num_vals; @@ -1679,11 +1679,12 @@ __global__ void __launch_bounds__(block_size) } } } - if (t == 0 && numvals + vals_skipped > 0 && numvals < s->top.data.max_vals) { - if (s->chunk.type_kind == TIMESTAMP) { - s->top.data.buffered_count = s->top.data.max_vals - numvals; + if (t == 0 && numvals + vals_skipped > 0) { + auto const max_vals = s->top.data.max_vals; + if (max_vals > numvals) { + if (s->chunk.type_kind == TIMESTAMP) { s->top.data.buffered_count = max_vals - numvals; } + s->top.data.max_vals = numvals; } - s->top.data.max_vals = numvals; } __syncthreads(); // Use the valid bits to compute non-null row positions until we get a full batch of values to diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 829e4877c44..660ec025d00 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -414,7 +414,7 @@ static __device__ uint32_t IntegerRLE( uint32_t mode1_w, mode2_w; typename std::make_unsigned::type vrange_mode1, vrange_mode2; block_vmin = static_cast(vmin); - if (sizeof(T) > 4) { + if constexpr (sizeof(T) > 4) { vrange_mode1 = (is_signed) ? max(zigzag(vmin), zigzag(vmax)) : vmax; vrange_mode2 = vmax - vmin; mode1_w = 8 - min(CountLeadingBytes64(vrange_mode1), 7); @@ -705,10 +705,7 @@ static __device__ void encode_null_mask(orcenc_state_s* s, } // reset shared state - if (t == 0) { - s->nnz = 0; - s->numvals = 0; - } + if (t == 0) { s->nnz = 0; } } /**