Fix offset of the string dictionary length stream (#8515)
Fixes #8514

The string dictionary length stream is RLE encoded, and `rle_data_size` and `non_rle_data_size` take this into account. However, when computing chunk stream offsets, these streams were treated as non-RLE and `non_rle_data_size` was not added. This caused a discrepancy between the non-RLE stream sizes and the available space, leading to overlap between chunk streams.

Applied `non_rle_data_size` to the affected offsets to correct the discrepancy, and added a test that uses decimal columns to grow the non-RLE encoded data enough to trigger the overlap.
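For context, the encoded-data buffer stores all non-RLE stream data first and all RLE stream data after it, so an RLE stream's absolute position is `non_rle_data_size` plus its offset within the RLE region. The sketch below is a minimal standalone model of that layout, with made-up names (`StreamInfo`, `StreamOffsets`); it is not the cuDF implementation:

```cpp
// Minimal standalone model of the two-region layout described above; this is
// not the cuDF implementation, and StreamInfo/StreamOffsets are illustrative.
#include <cstddef>
#include <cstdio>
#include <vector>

struct StreamInfo {
  std::size_t length;  // encoded bytes per row group
  bool is_rle;         // RLE streams are stored after all non-RLE streams
};

struct StreamOffsets {
  std::vector<std::size_t> offsets;   // offset within the stream's region
  std::size_t non_rle_data_size = 0;  // total size of the non-RLE region
  std::size_t rle_data_size     = 0;  // total size of the RLE region
};

StreamOffsets compute_offsets(std::vector<StreamInfo> const& streams,
                              std::size_t num_rowgroups)
{
  StreamOffsets out;
  out.offsets.reserve(streams.size());
  for (auto const& s : streams) {
    if (s.is_rle) {
      // RLE stream offsets are relative to the start of the RLE region,
      // i.e. relative to non_rle_data_size, not to the start of the buffer.
      out.offsets.push_back(out.rle_data_size);
      out.rle_data_size += (s.length * num_rowgroups + 7) & ~std::size_t{7};
    } else {
      out.offsets.push_back(out.non_rle_data_size);
      out.non_rle_data_size += s.length * num_rowgroups;
    }
  }
  return out;
}

int main()
{
  // One large non-RLE stream (e.g. decimal DATA) and one small RLE stream
  // (e.g. a string dictionary length stream).
  std::vector<StreamInfo> const streams = {{4096, false}, {32, true}};
  auto const off = compute_offsets(streams, 100);

  std::printf("non-RLE region: %zu bytes, RLE region: %zu bytes\n",
              off.non_rle_data_size, off.rle_data_size);
  // The RLE stream's absolute position is non_rle_data_size + off.offsets[1];
  // using off.offsets[1] alone would place it inside the non-RLE region.
  std::printf("RLE stream starts at byte %zu\n",
              off.non_rle_data_size + off.offsets[1]);
  return 0;
}
```

In this model, using an RLE stream's offset without the `non_rle_data_size` base is exactly the failure mode described above: the stream lands inside the non-RLE region and overlaps other chunk streams.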

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)
  - Charles Blackmon-Luca (https://github.com/charlesbluca)
  - David Wendt (https://github.com/davidwendt)

URL: #8515
vuule authored Jun 16, 2021
1 parent 91c727f commit 15b18f4
Showing 2 changed files with 23 additions and 1 deletion.
9 changes: 8 additions & 1 deletion cpp/src/io/orc/writer_impl.cu
@@ -565,6 +565,8 @@ orc_streams::orc_stream_offsets orc_streams::compute_offsets(
// Everything else uses RLE
return true;
}();
// non-RLE and RLE streams are separated in the buffer that stores encoded data
// The computed offsets do not take the streams of the other type into account
if (is_rle_data) {
strm_offsets[i] = rle_data_size;
rle_data_size += (stream.length * num_rowgroups + 7) & ~7;
@@ -681,6 +683,10 @@ encoded_data writer::impl::encode_columns(const table_device_view &view,
: (((stripe_dict->num_strings + 0x1ff) >> 9) * (512 * 4 + 2));
if (stripe.id == 0) {
strm.data_ptrs[strm_type] = encoded_data.data() + stream_offsets.offsets[strm_id];
// Dictionary lengths are encoded as RLE, which are all stored after non-RLE data:
// include non-RLE data size in the offset only in that case
if (strm_type == gpu::CI_DATA2 && ck.encoding_kind == DICTIONARY_V2)
strm.data_ptrs[strm_type] += stream_offsets.non_rle_data_size;
} else {
auto const &strm_up = col_streams[stripe_dict[-dict_stride].start_chunk];
strm.data_ptrs[strm_type] =
@@ -710,7 +716,8 @@ encoded_data writer::impl::encode_columns(const table_device_view &view,
: (col_streams[rg_idx - 1].data_ptrs[strm_type] +
col_streams[rg_idx - 1].lengths[strm_type]);
} else {
strm.lengths[strm_type] = streams[strm_id].length;
strm.lengths[strm_type] = streams[strm_id].length;
// RLE encoded streams are stored after all non-RLE streams
strm.data_ptrs[strm_type] = encoded_data.data() + stream_offsets.non_rle_data_size +
stream_offsets.offsets[strm_id] +
streams[strm_id].length * rg_idx;
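The two hunks above apply that layout when assigning per-chunk data pointers: non-RLE streams are addressed from the start of the encoded-data buffer, while RLE streams, including the DICTIONARY_V2 length stream fixed here, must also skip the non-RLE region. A rough model of that pointer math, again with illustrative names rather than the actual `encode_columns` logic:

```cpp
// Rough model of the pointer math in the hunks above; not the actual
// encode_columns code. All names are illustrative.
#include <cstddef>
#include <cstdio>

// Start (as a byte index into the encoded-data buffer) of a stream's data for
// row group rg_idx, given the layout sketched earlier: all non-RLE stream
// data first, all RLE stream data after non_rle_data_size.
std::size_t stream_start(std::size_t stream_offset,      // offset within its region
                         std::size_t non_rle_data_size,  // size of the non-RLE region
                         std::size_t stream_length,      // bytes per row group
                         bool is_rle,
                         std::size_t rg_idx)
{
  // RLE streams -- e.g. a DICTIONARY_V2 length stream -- must skip the whole
  // non-RLE region; omitting this base is the bug the commit fixes for the
  // first stripe's dictionary length stream.
  std::size_t const region_base = is_rle ? non_rle_data_size : 0;
  return region_base + stream_offset + stream_length * rg_idx;
}

int main()
{
  std::size_t const non_rle_data_size = 409600;
  // Without region_base, this RLE length stream would start at byte 96 and
  // overlap the non-RLE data occupying the first 409600 bytes of the buffer.
  std::printf("RLE length stream, row group 3: byte %zu\n",
              stream_start(0, non_rle_data_size, 32, true, 3));
  return 0;
}
```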
15 changes: 15 additions & 0 deletions python/cudf/cudf/tests/test_orc.py
@@ -10,6 +10,7 @@
import pyarrow.orc
import pyorc
import pytest
import decimal

import cudf
from cudf.io.orc import ORCWriter
@@ -780,3 +781,17 @@ def test_orc_writer_decimal(tmpdir, scale):

got = pd.read_orc(fname)
assert_eq(expected.to_pandas()["dec_val"], got["dec_val"])


def test_orc_string_stream_offset_issue():
size = 30000
vals = {
str(x): [decimal.Decimal(1)] * size if x != 0 else ["XYZ"] * size
for x in range(0, 5)
}
df = cudf.DataFrame(vals)

buffer = BytesIO()
df.to_orc(buffer)

assert_eq(df, cudf.read_orc(buffer))
