Skip to content

Commit

Permalink
Merge pull request #5473 from devavret/fix-orc-reader-rle
Browse files Browse the repository at this point in the history
[WIP] Fix orc reader RLEv2 reader
  • Loading branch information
devavret authored Jun 22, 2020
2 parents b22e99b + 9fc167c commit 55f213b
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 12 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@
- PR #5459 Fix str.translate to convert table characters to UTF-8
- PR #5480 Fix merge sort docs
- PR #5465 Fix benchmark out of memory errors due to multiple initialization
- PR #5473 Fix RLEv2 patched base in ORC reader
- PR #5472 Fix str concat issue with indexed series
- PR #5478 Fix `loc` and `iloc` doc
- PR #5484 Ensure flat index after groupby if nlevels == 1
Expand Down
40 changes: 28 additions & 12 deletions cpp/src/io/orc/stripe_data.cu
Original file line number Diff line number Diff line change
Expand Up @@ -642,6 +642,20 @@ static const __device__ __constant__ uint8_t kRLEv2_W[32] = {
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 26, 28, 30, 32, 40, 48, 56, 64};

/**
 * @brief Maps the RLEv2 patch size (pw + pgw) to the number of bits actually stored per patch
 *
 * The table is indexed by the requested bit width (the sum of the patch width `pw` and the patch
 * gap width `pgw`) and yields the width used for each patch-list entry in the file.
 *
 * Patch size (in bits) is only allowed to be one of 1..24, 26, 28, 30, 32, 40, 48, 56 or 64.
 * Any other requested width is rounded up to the smallest allowed width that can hold it, e.g.
 * if `pw + pgw == 34` the patch occupies 40 bits: `ClosestFixedBitsMap[34] == 40`.
 * Index 0 maps to 1 bit.
 *
 * The table has 65 entries (valid indices 0..64), so the index must be clamped to 64 before the
 * lookup.
 *
 * @see https://github.com/apache/orc/commit/9faf7f5147a7bc69
 */
static const __device__ __constant__ uint8_t ClosestFixedBitsMap[65] = {
// indices 0..21
1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
// indices 22..43
22, 23, 24, 26, 26, 28, 28, 30, 30, 32, 32, 40, 40, 40, 40, 40, 40, 40, 40, 48, 48, 48,
// indices 44..64
48, 48, 48, 48, 48, 56, 56, 56, 56, 56, 56, 56, 56, 64, 64, 64, 64, 64, 64, 64, 64};

/**
* @brief ORC Integer RLEv2 decoding
*
Expand Down Expand Up @@ -684,15 +698,16 @@ static __device__ uint32_t Integer_RLEv2(
l = (l * n + 7) >> 3;
} else if (mode == 2) {
// 10wwwwwn.nnnnnnnn.xxxxxxxx.yyyyyyyy: patched base encoding
uint32_t byte2 = bytestream_readbyte(bs, pos++);
uint32_t byte3 = bytestream_readbyte(bs, pos++);
uint32_t bw = 1 + (byte2 >> 5); // base value width, 1 to 8 bytes
uint32_t pw = kRLEv2_W[byte2 & 0x1f]; // patch width, 1 to 64 bits
uint32_t pgw = 1 + (byte3 >> 5); // patch gap width, 1 to 8 bits
uint32_t pll = byte3 & 0x1f; // patch list length
l = (l * n + 7) >> 3;
uint32_t byte2 = bytestream_readbyte(bs, pos++);
uint32_t byte3 = bytestream_readbyte(bs, pos++);
uint32_t bw = 1 + (byte2 >> 5); // base value width, 1 to 8 bytes
uint32_t pw = kRLEv2_W[byte2 & 0x1f]; // patch width, 1 to 64 bits
uint32_t pgw = 1 + (byte3 >> 5); // patch gap width, 1 to 8 bits
uint32_t pgw_pw_len = ClosestFixedBitsMap[min(pw + pgw, 64u)]; // ceiled patch width
uint32_t pll = byte3 & 0x1f; // patch list length
l = (l * n + 7) >> 3;
l += bw;
l += (pll * (pgw + pw) + 7) >> 3;
l += (pll * (pgw_pw_len) + 7) >> 3;
} else {
// 11wwwwwn.nnnnnnnn.<base>.<delta>: delta encoding
uint32_t deltapos = varint_length<T>(bs, pos);
Expand Down Expand Up @@ -812,7 +827,7 @@ static __device__ uint32_t Integer_RLEv2(
bytestream_readbe(bs, pos * 8 + i * w, w, v);
vals[base + i] = v;
} else if (mode == 2) {
uint32_t ofs = bytestream_readbits64(bs, pos * 8 + i * w, w);
uint64_t ofs = bytestream_readbits64(bs, pos * 8 + i * w, w);
vals[base + i] = rle->baseval.u64[r] + ofs;
} else {
int64_t delta = rle->delta[r], ofs;
Expand All @@ -834,10 +849,11 @@ static __device__ uint32_t Integer_RLEv2(
uint32_t pgw = 1 + ((pw_byte3 >> 5) & 7); // patch gap width, 1 to 8 bits
uint32_t pll = pw_byte3 & 0x1f; // patch list length
if (pll != 0) {
uint32_t pgw_pw_len = ClosestFixedBitsMap[min(pw + pgw, 64u)];
uint64_t patch_pos64 =
(tr < pll)
? bytestream_readbits64(bs, pos * 8 + ((n * w + 7) & ~7) + tr * (pgw + pw), pgw + pw)
: 0;
(tr < pll) ? bytestream_readbits64(
bs, pos * 8 + ((n * w + 7) & ~7) + tr * (pgw_pw_len), pgw_pw_len)
: 0;
uint32_t patch_pos;
T patch = 1;
patch <<= pw;
Expand Down
Binary file not shown.
1 change: 1 addition & 0 deletions python/cudf/cudf/tests/test_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def _make_path_or_buf(src):
"double1",
],
),
("TestOrcFile.RLEv2.orc", ["x", "y"]),
("TestOrcFile.testSnappy.orc", None),
("TestOrcFile.demo-12-zlib.orc", ["_col2", "_col3", "_col4", "_col5"]),
],
Expand Down

0 comments on commit 55f213b

Please sign in to comment.