From b9a116544d569cea49547f206bce820cee6cca8e Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 17 Jun 2021 06:20:45 -0700 Subject: [PATCH] Fix offset of the string dictionary length stream (#8538) Fixes #8514 String dictionary length is RLE encoded and rle_data_size and non_rle_data_size take this into account. However, When computing chunk stream offsets, these streams were treated as non-RLE and non_rle_data_size was not added. This caused discrepancy between non-RLE stream sizes and available space, leading to overlap between chunk streams. Applied the non_rle_data_size to the offset to correct the discrepancy and added a test that uses decimal columns to increase the size of non-RLE encoded data and enable the overflow. Author: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Nghia Truong (https://github.com/ttnghia) - Conor Hoekstra (https://github.com/codereport) - David Wendt (https://github.com/davidwendt) - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/8538 --- cpp/src/io/orc/writer_impl.cu | 9 ++++++++- python/cudf/cudf/tests/test_orc.py | 15 +++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 2aa1e2d866a..4a2330d479b 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -565,6 +565,8 @@ orc_streams::orc_stream_offsets orc_streams::compute_offsets( // Everything else uses RLE return true; }(); + // non-RLE and RLE streams are separated in the buffer that stores encoded data + // The computed offsets do not take the streams of the other type into account if (is_rle_data) { strm_offsets[i] = rle_data_size; rle_data_size += (stream.length * num_rowgroups + 7) & ~7; @@ -681,6 +683,10 @@ encoded_data writer::impl::encode_columns(const table_device_view &view, : (((stripe_dict->num_strings + 0x1ff) >> 9) * (512 * 4 + 2)); if (stripe.id == 0) { strm.data_ptrs[strm_type] = encoded_data.data() + stream_offsets.offsets[strm_id]; + // Dictionary lengths are encoded as RLE, which are all stored after non-RLE data: + // include non-RLE data size in the offset only in that case + if (strm_type == gpu::CI_DATA2 && ck.encoding_kind == DICTIONARY_V2) + strm.data_ptrs[strm_type] += stream_offsets.non_rle_data_size; } else { auto const &strm_up = col_streams[stripe_dict[-dict_stride].start_chunk]; strm.data_ptrs[strm_type] = @@ -710,7 +716,8 @@ encoded_data writer::impl::encode_columns(const table_device_view &view, : (col_streams[rg_idx - 1].data_ptrs[strm_type] + col_streams[rg_idx - 1].lengths[strm_type]); } else { - strm.lengths[strm_type] = streams[strm_id].length; + strm.lengths[strm_type] = streams[strm_id].length; + // RLE encoded streams are stored after all non-RLE streams strm.data_ptrs[strm_type] = encoded_data.data() + stream_offsets.non_rle_data_size + stream_offsets.offsets[strm_id] + streams[strm_id].length * rg_idx; diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 71666846b96..c3a33f75ee3 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -10,6 +10,7 @@ import pyarrow.orc import pyorc import pytest +import decimal import cudf from cudf.io.orc import ORCWriter @@ -780,3 +781,17 @@ def test_orc_writer_decimal(tmpdir, scale): got = pd.read_orc(fname) assert_eq(expected.to_pandas()["dec_val"], got["dec_val"]) + + +def test_orc_string_stream_offset_issue(): + size = 30000 + vals = { + str(x): [decimal.Decimal(1)] * size if x != 0 else ["XYZ"] * size + for x in range(0, 5) + } + df = cudf.DataFrame(vals) + + buffer = BytesIO() + df.to_orc(buffer) + + assert_eq(df, cudf.read_orc(buffer))