Fix offset of the string dictionary length stream (#8515)
Fixes #8514

The string dictionary length stream is RLE encoded, and `rle_data_size` and `non_rle_data_size` take this into account. However, when computing chunk stream offsets, these streams were treated as non-RLE and `non_rle_data_size` was not added. This caused a discrepancy between the non-RLE stream sizes and the available space, leading to overlap between chunk streams.

Applied `non_rle_data_size` to the affected offsets to correct the discrepancy, and added a test that uses decimal columns to grow the non-RLE encoded data enough to trigger the overlap.
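For context, the encoded-data buffer stores all non-RLE stream data first and all RLE stream data after it, so an RLE stream's absolute position is `non_rle_data_size` plus its offset within the RLE region. The sketch below is a minimal standalone model of that layout, with made-up names (`StreamInfo`, `StreamOffsets`); it is not the cuDF implementation:

```cpp
// Minimal standalone model of the two-region layout described above; this is
// not the cuDF implementation, and StreamInfo/StreamOffsets are illustrative.
#include <cstddef>
#include <cstdio>
#include <vector>

struct StreamInfo {
  std::size_t length;  // encoded bytes per row group
  bool is_rle;         // RLE streams are stored after all non-RLE streams
};

struct StreamOffsets {
  std::vector<std::size_t> offsets;   // offset within the stream's region
  std::size_t non_rle_data_size = 0;  // total size of the non-RLE region
  std::size_t rle_data_size     = 0;  // total size of the RLE region
};

StreamOffsets compute_offsets(std::vector<StreamInfo> const& streams,
                              std::size_t num_rowgroups)
{
  StreamOffsets out;
  out.offsets.reserve(streams.size());
  for (auto const& s : streams) {
    if (s.is_rle) {
      // RLE stream offsets are relative to the start of the RLE region,
      // i.e. relative to non_rle_data_size, not to the start of the buffer.
      out.offsets.push_back(out.rle_data_size);
      out.rle_data_size += (s.length * num_rowgroups + 7) & ~std::size_t{7};
    } else {
      out.offsets.push_back(out.non_rle_data_size);
      out.non_rle_data_size += s.length * num_rowgroups;
    }
  }
  return out;
}

int main()
{
  // One large non-RLE stream (e.g. decimal DATA) and one small RLE stream
  // (e.g. a string dictionary length stream).
  std::vector<StreamInfo> const streams = {{4096, false}, {32, true}};
  auto const off = compute_offsets(streams, 100);

  std::printf("non-RLE region: %zu bytes, RLE region: %zu bytes\n",
              off.non_rle_data_size, off.rle_data_size);
  // The RLE stream's absolute position is non_rle_data_size + off.offsets[1];
  // using off.offsets[1] alone would place it inside the non-RLE region.
  std::printf("RLE stream starts at byte %zu\n",
              off.non_rle_data_size + off.offsets[1]);
  return 0;
}
```

In this model, using an RLE stream's offset without the `non_rle_data_size` base is exactly the failure mode described above: the stream lands inside the non-RLE region and overlaps other chunk streams.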

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)
  - Charles Blackmon-Luca (https://github.com/charlesbluca)
  - David Wendt (https://github.com/davidwendt)

URL: #8515
vuule authored Jun 16, 2021
1 parent 91c727f commit 15b18f4
Showing 2 changed files with 23 additions and 1 deletion.
9 changes: 8 additions & 1 deletion cpp/src/io/orc/writer_impl.cu
@@ -565,6 +565,8 @@ orc_streams::orc_stream_offsets orc_streams::compute_offsets(
// Everything else uses RLE
return true;
}();
// non-RLE and RLE streams are separated in the buffer that stores encoded data
// The computed offsets do not take the streams of the other type into account
if (is_rle_data) {
strm_offsets[i] = rle_data_size;
rle_data_size += (stream.length * num_rowgroups + 7) & ~7;
@@ -681,6 +683,10 @@ encoded_data writer::impl::encode_columns(const table_device_view &view,
: (((stripe_dict->num_strings + 0x1ff) >> 9) * (512 * 4 + 2));
if (stripe.id == 0) {
strm.data_ptrs[strm_type] = encoded_data.data() + stream_offsets.offsets[strm_id];
// Dictionary lengths are encoded as RLE, which are all stored after non-RLE data:
// include non-RLE data size in the offset only in that case
if (strm_type == gpu::CI_DATA2 && ck.encoding_kind == DICTIONARY_V2)
strm.data_ptrs[strm_type] += stream_offsets.non_rle_data_size;
} else {
auto const &strm_up = col_streams[stripe_dict[-dict_stride].start_chunk];
strm.data_ptrs[strm_type] =
@@ -710,7 +716,8 @@ encoded_data writer::impl::encode_columns(const table_device_view &view,
: (col_streams[rg_idx - 1].data_ptrs[strm_type] +
col_streams[rg_idx - 1].lengths[strm_type]);
} else {
strm.lengths[strm_type] = streams[strm_id].length;
strm.lengths[strm_type] = streams[strm_id].length;
// RLE encoded streams are stored after all non-RLE streams
strm.data_ptrs[strm_type] = encoded_data.data() + stream_offsets.non_rle_data_size +
stream_offsets.offsets[strm_id] +
streams[strm_id].length * rg_idx;
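The two hunks above apply that layout when assigning per-chunk data pointers: non-RLE streams are addressed from the start of the encoded-data buffer, while RLE streams, including the DICTIONARY_V2 length stream fixed here, must also skip the non-RLE region. A rough model of that pointer math, again with illustrative names rather than the actual `encode_columns` logic:

```cpp
// Rough model of the pointer math in the hunks above; not the actual
// encode_columns code. All names are illustrative.
#include <cstddef>
#include <cstdio>

// Start (as a byte index into the encoded-data buffer) of a stream's data for
// row group rg_idx, given the layout sketched earlier: all non-RLE stream
// data first, all RLE stream data after non_rle_data_size.
std::size_t stream_start(std::size_t stream_offset,      // offset within its region
                         std::size_t non_rle_data_size,  // size of the non-RLE region
                         std::size_t stream_length,      // bytes per row group
                         bool is_rle,
                         std::size_t rg_idx)
{
  // RLE streams -- e.g. a DICTIONARY_V2 length stream -- must skip the whole
  // non-RLE region; omitting this base is the bug the commit fixes for the
  // first stripe's dictionary length stream.
  std::size_t const region_base = is_rle ? non_rle_data_size : 0;
  return region_base + stream_offset + stream_length * rg_idx;
}

int main()
{
  std::size_t const non_rle_data_size = 409600;
  // Without region_base, this RLE length stream would start at byte 96 and
  // overlap the non-RLE data occupying the first 409600 bytes of the buffer.
  std::printf("RLE length stream, row group 3: byte %zu\n",
              stream_start(0, non_rle_data_size, 32, true, 3));
  return 0;
}
```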
15 changes: 15 additions & 0 deletions python/cudf/cudf/tests/test_orc.py
@@ -10,6 +10,7 @@
import pyarrow.orc
import pyorc
import pytest
import decimal

import cudf
from cudf.io.orc import ORCWriter
@@ -780,3 +781,17 @@ def test_orc_writer_decimal(tmpdir, scale):

got = pd.read_orc(fname)
assert_eq(expected.to_pandas()["dec_val"], got["dec_val"])


def test_orc_string_stream_offset_issue():
size = 30000
vals = {
str(x): [decimal.Decimal(1)] * size if x != 0 else ["XYZ"] * size
for x in range(0, 5)
}
df = cudf.DataFrame(vals)

buffer = BytesIO()
df.to_orc(buffer)

assert_eq(df, cudf.read_orc(buffer))
