diff --git a/CHANGELOG.md b/CHANGELOG.md index baf3f09fa84..44c77947b19 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -113,6 +113,7 @@ - PR #5459 Fix str.translate to convert table characters to UTF-8 - PR #5480 Fix merge sort docs - PR #5465 Fix benchmark out of memory errors due to multiple initialization +- PR #5473 Fix RLEv2 patched base in ORC reader - PR #5472 Fix str concat issue with indexed series - PR #5478 Fix `loc` and `iloc` doc - PR #5484 Ensure flat index after groupby if nlevels == 1 diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 117b3623a7a..2617d9df035 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -642,6 +642,20 @@ static const __device__ __constant__ uint8_t kRLEv2_W[32] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 28, 30, 32, 40, 48, 56, 64}; +/** + * @brief Maps the RLEv2 patch size (pw + pgw) to number of bits + * + * Patch size (in bits) is only allowed to be from the below set. If `pw + pgw == 34` then the size + * of the patch in the file is the smallest size in the set that can fit 34 bits i.e. + * `ClosestFixedBitsMap[34] == 40` + * + * @see https://github.com/apache/orc/commit/9faf7f5147a7bc69 + */ +static const __device__ __constant__ uint8_t ClosestFixedBitsMap[65] = { + 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 26, 26, 28, 28, 30, 30, 32, 32, 40, 40, 40, 40, 40, 40, 40, 40, 48, 48, 48, + 48, 48, 48, 48, 48, 56, 56, 56, 56, 56, 56, 56, 56, 64, 64, 64, 64, 64, 64, 64, 64}; + /** * @brief ORC Integer RLEv2 decoding * @@ -684,15 +698,16 @@ static __device__ uint32_t Integer_RLEv2( l = (l * n + 7) >> 3; } else if (mode == 2) { // 10wwwwwn.nnnnnnnn.xxxxxxxx.yyyyyyyy: patched base encoding - uint32_t byte2 = bytestream_readbyte(bs, pos++); - uint32_t byte3 = bytestream_readbyte(bs, pos++); - uint32_t bw = 1 + (byte2 >> 5); // base value width, 1 to 8 bytes - uint32_t pw = kRLEv2_W[byte2 & 0x1f]; // patch width, 1 to 64 bits - uint32_t pgw = 1 + (byte3 >> 5); // patch gap width, 1 to 8 bits - uint32_t pll = byte3 & 0x1f; // patch list length - l = (l * n + 7) >> 3; + uint32_t byte2 = bytestream_readbyte(bs, pos++); + uint32_t byte3 = bytestream_readbyte(bs, pos++); + uint32_t bw = 1 + (byte2 >> 5); // base value width, 1 to 8 bytes + uint32_t pw = kRLEv2_W[byte2 & 0x1f]; // patch width, 1 to 64 bits + uint32_t pgw = 1 + (byte3 >> 5); // patch gap width, 1 to 8 bits + uint32_t pgw_pw_len = ClosestFixedBitsMap[min(pw + pgw, 64u)]; // ceiled patch width + uint32_t pll = byte3 & 0x1f; // patch list length + l = (l * n + 7) >> 3; l += bw; - l += (pll * (pgw + pw) + 7) >> 3; + l += (pll * (pgw_pw_len) + 7) >> 3; } else { // 11wwwwwn.nnnnnnnn..: delta encoding uint32_t deltapos = varint_length(bs, pos); @@ -812,7 +827,7 @@ static __device__ uint32_t Integer_RLEv2( bytestream_readbe(bs, pos * 8 + i * w, w, v); vals[base + i] = v; } else if (mode == 2) { - uint32_t ofs = bytestream_readbits64(bs, pos * 8 + i * w, w); + uint64_t ofs = bytestream_readbits64(bs, pos * 8 + i * w, w); vals[base + i] = rle->baseval.u64[r] + ofs; } else { int64_t delta = rle->delta[r], ofs; @@ -834,10 +849,11 @@ static __device__ uint32_t Integer_RLEv2( uint32_t pgw = 1 + ((pw_byte3 >> 5) & 7); // patch gap width, 1 to 8 bits uint32_t pll = pw_byte3 & 0x1f; // patch list length if (pll != 0) { + uint32_t pgw_pw_len = ClosestFixedBitsMap[min(pw + pgw, 64u)]; uint64_t patch_pos64 = - (tr < pll) - ? bytestream_readbits64(bs, pos * 8 + ((n * w + 7) & ~7) + tr * (pgw + pw), pgw + pw) - : 0; + (tr < pll) ? bytestream_readbits64( + bs, pos * 8 + ((n * w + 7) & ~7) + tr * (pgw_pw_len), pgw_pw_len) + : 0; uint32_t patch_pos; T patch = 1; patch <<= pw; diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.RLEv2.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.RLEv2.orc new file mode 100644 index 00000000000..26535e09549 Binary files /dev/null and b/python/cudf/cudf/tests/data/orc/TestOrcFile.RLEv2.orc differ diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index de52c72f0fe..2883fa91d7d 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -67,6 +67,7 @@ def _make_path_or_buf(src): "double1", ], ), + ("TestOrcFile.RLEv2.orc", ["x", "y"]), ("TestOrcFile.testSnappy.orc", None), ("TestOrcFile.demo-12-zlib.orc", ["_col2", "_col3", "_col4", "_col5"]), ],