Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Resolve racecheck errors in ORC kernels #9916

Merged
merged 8 commits into from
Jan 7, 2022
17 changes: 7 additions & 10 deletions cpp/src/io/comp/gpuinflate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -780,22 +780,19 @@ __device__ void process_symbols(inflate_state_s* s, int t)

do {
volatile uint32_t* b = &s->x.u.symqueue[batch * batch_size];
int batch_len, pos;
int32_t symt;
uint32_t lit_mask;

int batch_len = 0;
if (t == 0) {
while ((batch_len = s->x.batch_len[batch]) == 0) {}
} else {
batch_len = 0;
}
batch_len = shuffle(batch_len);
if (batch_len < 0) { break; }

symt = (t < batch_len) ? b[t] : 256;
lit_mask = ballot(symt >= 256);
pos = min((__ffs(lit_mask) - 1) & 0xff, 32);
auto const symt = (t < batch_len) ? b[t] : 256;
auto const lit_mask = ballot(symt >= 256);
auto pos = min((__ffs(lit_mask) - 1) & 0xff, 32);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't spot the fix in this file. Is this code cleanup only?


if (t == 0) { s->x.batch_len[batch] = 0; }

if (t < pos && out + t < outend) { out[t] = symt; }
out += pos;
batch_len -= pos;
Expand Down Expand Up @@ -825,7 +822,7 @@ __device__ void process_symbols(inflate_state_s* s, int t)
}
}
batch = (batch + 1) & (batch_count - 1);
} while (1);
} while (true);

if (t == 0) { s->out = out; }
}
Expand Down
29 changes: 15 additions & 14 deletions cpp/src/io/orc/stripe_data.cu
Original file line number Diff line number Diff line change
Expand Up @@ -650,13 +650,11 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs,
int t,
bool has_buffered_values = false)
{
uint32_t numvals, numruns;
int r, tr;

if (t == 0) {
uint32_t maxpos = min(bs->len, bs->pos + (bytestream_buffer_size - 8u));
uint32_t lastpos = bs->pos;
numvals = numruns = 0;
auto numvals = 0;
auto numruns = 0;
// Find the length and start location of each run
while (numvals < maxvals) {
uint32_t pos = lastpos;
Expand Down Expand Up @@ -713,9 +711,9 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs,
}
__syncthreads();
// Process the runs, 1 warp per run
numruns = rle->num_runs;
r = t >> 5;
tr = t & 0x1f;
auto const numruns = rle->num_runs;
auto const r = t >> 5;
auto const tr = t & 0x1f;
for (uint32_t run = r; run < numruns; run += num_warps) {
uint32_t base, pos, w, n;
int mode;
Expand All @@ -731,7 +729,7 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs,
w = 8 + (byte0 & 0x38); // 8 to 64 bits
n = 3 + (byte0 & 7); // 3 to 10 values
bytestream_readbe(bs, pos * 8, w, baseval);
if (sizeof(T) <= 4) {
if constexpr (sizeof(T) <= 4) {
rle->baseval.u32[r] = baseval;
} else {
rle->baseval.u64[r] = baseval;
Expand Down Expand Up @@ -766,7 +764,7 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs,
int64_t delta;
// Delta
pos = decode_varint(bs, pos, baseval);
if (sizeof(T) <= 4) {
if constexpr (sizeof(T) <= 4) {
rle->baseval.u32[r] = baseval;
} else {
rle->baseval.u64[r] = baseval;
Expand All @@ -782,6 +780,7 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs,
pos = shuffle(pos);
n = shuffle(n);
w = shuffle(w);
__syncwarp();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure this one is needed here, as our shuffle is an alias for __shfl_sync, which, to my understanding, would converge threads participating in the shuffle (in our case: there is no mask, so all threads participate).
If, despite, __syncwarp should be required, we should leave a note that clarifies why we need __syncwarp here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll add a comment. Really want to go towards error-free memcheck/racecheck reports.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one resolves the following racecheck warnings, presumably because the tool does not recognize shuffle_sync as a sync point.

Warning: Race reported between Write access at 0x19520 in /cudf/cpp/src/io/orc/stripe_data.cu:735:unsigned int Integer_RLEv2
and Read access at 0x19be0 in /cudf/cpp/src/io/orc/stripe_data.cu:807:unsigned int Integer_RLEv2[488 hazards]

Warning: Race reported between Write access at 0x19050 in /cudf/cpp/src/io/orc/stripe_data.cu:773:unsigned int Integer_RLEv2
and Read access at 0x196d0 in /cudf/cpp/src/io/orc/stripe_data.cu:816:unsigned int Integer_RLEv2 [16 hazards]

for (uint32_t i = tr; i < n; i += 32) {
if (sizeof(T) <= 4) {
if (mode == 0) {
Expand Down Expand Up @@ -860,14 +859,15 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs,
if (j & i) vals[base + j] += vals[base + ((j & ~i) | (i - 1))];
}
}
if (sizeof(T) <= 4)
if constexpr (sizeof(T) <= 4)
baseval = rle->baseval.u32[r];
else
baseval = rle->baseval.u64[r];
for (uint32_t j = tr; j < n; j += 32) {
vals[base + j] += baseval;
}
}
__syncwarp();
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one fixes the following warning:

Warning: Race reported between Write access at 0x19520 in /cudf/cpp/src/io/orc/stripe_data.cu:735:unsigned int Integer_RLEv2
and Read access at 0x1a4e0 in /cudf/cpp/src/io/orc/stripe_data.cu:865:unsigned int Integer_RLEv2 [8 hazards]

}
__syncthreads();
return rle->num_vals;
Expand Down Expand Up @@ -1679,11 +1679,12 @@ __global__ void __launch_bounds__(block_size)
}
}
}
if (t == 0 && numvals + vals_skipped > 0 && numvals < s->top.data.max_vals) {
if (s->chunk.type_kind == TIMESTAMP) {
s->top.data.buffered_count = s->top.data.max_vals - numvals;
if (t == 0 && numvals + vals_skipped > 0) {
auto const max_vals = s->top.data.max_vals;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Workaround for a presumable false positive:

Warning: Race reported between Write access at 0x19520 in /cudf/cpp/src/io/orc/stripe_data.cu:735:unsigned int Integer_RLEv2
and Read access at 0x1a4e0 in /cudf/cpp/src/io/orc/stripe_data.cu:865:unsigned int Integer_RLEv2 [8 hazards]

if (max_vals > numvals) {
if (s->chunk.type_kind == TIMESTAMP) { s->top.data.buffered_count = max_vals - numvals; }
s->top.data.max_vals = numvals;
vuule marked this conversation as resolved.
Show resolved Hide resolved
}
s->top.data.max_vals = numvals;
}
__syncthreads();
// Use the valid bits to compute non-null row positions until we get a full batch of values to
Expand Down
5 changes: 1 addition & 4 deletions cpp/src/io/orc/stripe_enc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -705,10 +705,7 @@ static __device__ void encode_null_mask(orcenc_state_s* s,
}

// reset shared state
if (t == 0) {
s->nnz = 0;
s->numvals = 0;
}
if (t == 0) { s->nnz = 0; }
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixes the error:

Error: Race reported between Read access at 0x2ce0 in /cudf/cpp/src/io/orc/stripe_enc.cu:629: encode_null_mask
and Write access at 0x2d30 in /cudf/cpp/src/io/orc/stripe_enc.cu:709:encode_null_mask [8 hazards]

Resetting numvals can be skipped because it is guaranteed to be zero after the loop above.

}

/**
Expand Down