From b9a116544d569cea49547f206bce820cee6cca8e Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Thu, 17 Jun 2021 06:20:45 -0700
Subject: [PATCH] Fix offset of the string dictionary length stream (#8538)

Fixes #8514

String dictionary length is RLE encoded and rle_data_size and non_rle_data_size take this into account. However, When computing chunk stream offsets, these streams were treated as non-RLE and non_rle_data_size was not added. This caused discrepancy between non-RLE stream sizes and available space, leading to overlap between chunk streams.

Applied the non_rle_data_size to the offset to correct the discrepancy and added a test that uses decimal columns to increase the size of non-RLE encoded data and enable the overflow.

Author:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Conor Hoekstra (https://github.com/codereport)
  - David Wendt (https://github.com/davidwendt)
  - https://github.com/brandon-b-miller

URL: https://github.com/rapidsai/cudf/pull/8538
---
 cpp/src/io/orc/writer_impl.cu      |  9 ++++++++-
 python/cudf/cudf/tests/test_orc.py | 15 +++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 2aa1e2d866a..4a2330d479b 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -565,6 +565,8 @@ orc_streams::orc_stream_offsets orc_streams::compute_offsets(
       // Everything else uses RLE
       return true;
     }();
+    // non-RLE and RLE streams are separated in the buffer that stores encoded data
+    // The computed offsets do not take the streams of the other type into account
     if (is_rle_data) {
       strm_offsets[i] = rle_data_size;
       rle_data_size += (stream.length * num_rowgroups + 7) & ~7;
@@ -681,6 +683,10 @@ encoded_data writer::impl::encode_columns(const table_device_view &view,
                     : (((stripe_dict->num_strings + 0x1ff) >> 9) * (512 * 4 + 2));
                 if (stripe.id == 0) {
                   strm.data_ptrs[strm_type] = encoded_data.data() + stream_offsets.offsets[strm_id];
+                  // Dictionary lengths are encoded as RLE, which are all stored after non-RLE data:
+                  // include non-RLE data size in the offset only in that case
+                  if (strm_type == gpu::CI_DATA2 && ck.encoding_kind == DICTIONARY_V2)
+                    strm.data_ptrs[strm_type] += stream_offsets.non_rle_data_size;
                 } else {
                   auto const &strm_up = col_streams[stripe_dict[-dict_stride].start_chunk];
                   strm.data_ptrs[strm_type] =
@@ -710,7 +716,8 @@ encoded_data writer::impl::encode_columns(const table_device_view &view,
                                             : (col_streams[rg_idx - 1].data_ptrs[strm_type] +
                                                col_streams[rg_idx - 1].lengths[strm_type]);
             } else {
-              strm.lengths[strm_type]   = streams[strm_id].length;
+              strm.lengths[strm_type] = streams[strm_id].length;
+              // RLE encoded streams are stored after all non-RLE streams
               strm.data_ptrs[strm_type] = encoded_data.data() + stream_offsets.non_rle_data_size +
                                           stream_offsets.offsets[strm_id] +
                                           streams[strm_id].length * rg_idx;
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 71666846b96..c3a33f75ee3 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -10,6 +10,7 @@
 import pyarrow.orc
 import pyorc
 import pytest
+import decimal
 
 import cudf
 from cudf.io.orc import ORCWriter
@@ -780,3 +781,17 @@ def test_orc_writer_decimal(tmpdir, scale):
 
     got = pd.read_orc(fname)
     assert_eq(expected.to_pandas()["dec_val"], got["dec_val"])
+
+
+def test_orc_string_stream_offset_issue():
+    size = 30000
+    vals = {
+        str(x): [decimal.Decimal(1)] * size if x != 0 else ["XYZ"] * size
+        for x in range(0, 5)
+    }
+    df = cudf.DataFrame(vals)
+
+    buffer = BytesIO()
+    df.to_orc(buffer)
+
+    assert_eq(df, cudf.read_orc(buffer))