From 03207a4ad9e4ce819fef981fce8c7ada4f7a24fd Mon Sep 17 00:00:00 2001
From: Olivier Lapicque
Date: Thu, 31 Oct 2019 13:24:20 -0700
Subject: [PATCH 1/2] Fix incorrect ByteRLE encoding of literal_run=128

---
 cpp/src/io/orc/stripe_enc.cu | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu
index b1ab59d6822..72f38b74152 100644
--- a/cpp/src/io/orc/stripe_enc.cu
+++ b/cpp/src/io/orc/stripe_enc.cu
@@ -228,7 +228,7 @@ static __device__ uint32_t ByteRLE(orcenc_state_s *s, const uint8_t *inbuf, uint
             if (t < literal_run)
             {
                 uint32_t run_id = t >> 7;
-                uint32_t run = (run_id == num_runs - 1) ? literal_run & 0x7f : 0x80;
+                uint32_t run = min(literal_run - run_id * 128, 128);
                 if (!(t & 0x7f))
                     dst[run_id + t] = 0x100 - run;
                 dst[run_id + t + 1] = (cid == CI_PRESENT) ? __brev(v0) >> 24 : v0;
@@ -254,11 +254,10 @@ static __device__ uint32_t ByteRLE(orcenc_state_s *s, const uint8_t *inbuf, uint
                     inpos += 130;
                     repeat_run -= 130;
                 }
-                if (!flush)
+                if (!flush && repeat_run == numvals)
                 {
                     // Wait for more data in case we can continue the run later
-                    if (repeat_run == numvals && !flush)
-                        break;
+                    break;
                 }
                 if (repeat_run >= 3)
                 {

From 803cd575dadbc8cb08e2dad9ac80ad10719182b3 Mon Sep 17 00:00:00 2001
From: Olivier Lapicque
Date: Thu, 31 Oct 2019 13:28:58 -0700
Subject: [PATCH 2/2] Update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 851872ef1e5..5295304f8fe 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -94,6 +94,7 @@
 - PR #3236 Fix Numba 0.46+/CuPy 6.3 interface compatibility
 - PR #3256 Fix orc writer crash with multiple string columns
 - PR #3211 Fix breaking change caused by rapidsai/rmm#167
+- PR #3267 ORC writer: fix incorrect ByteRLE encoding of long literal runs
 
 
 # cuDF 0.10.0 (16 Oct 2019)