From 8a40902962af4981fc893d03d78e8a5ef0c023fc Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Mon, 12 Dec 2022 13:28:27 -0800
Subject: [PATCH] Fail loudly to avoid data corruption with unsupported input
 in `read_orc` (#12325)

Issue https://github.com/rapidsai/cudf/issues/11890

Motivating issue: The ORC reader reads nulls in row groups after the first one when reading a string column encoded with Pandas, with direct encoding. The root cause is that cuDF reads offsets from the row group index as larger then the stream sizes.

This PR does not fix the issue, but ensures that the reader fails loudly when the row group index offsets are read as too large to be correct. This should prevent data corruption until the fix is implemented.

This PR also sets up a mechanism to report decode errors from unsupported data.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/12325
---
 cpp/src/io/orc/orc_gpu.hpp         |  2 ++
 cpp/src/io/orc/reader_impl.cu      |  8 +++++++-
 cpp/src/io/orc/stripe_data.cu      | 12 ++++++++++--
 python/cudf/cudf/tests/test_orc.py | 13 +++++++++++++
 4 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp
index 1e4e36ee91c..1c5c9b2746c 100644
--- a/cpp/src/io/orc/orc_gpu.hpp
+++ b/cpp/src/io/orc/orc_gpu.hpp
@@ -287,6 +287,7 @@ void DecodeNullsAndStringDictionaries(ColumnDesc* chunks,
  * @param[in] num_rowgroups Number of row groups in row index data
  * @param[in] rowidx_stride Row index stride
  * @param[in] level Current nesting level being processed
+ * @param[out] error_count Number of errors during decode
  * @param[in] stream CUDA stream used for device memory operations and kernel launches
  */
 void DecodeOrcColumnData(ColumnDesc* chunks,
@@ -299,6 +300,7 @@ void DecodeOrcColumnData(ColumnDesc* chunks,
                          uint32_t num_rowgroups,
                          uint32_t rowidx_stride,
                          size_t level,
+                         size_type* error_count,
                          rmm::cuda_stream_view stream);
 
 /**
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index fd995dbf84e..761158c9310 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -36,6 +36,7 @@
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <rmm/device_scalar.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
@@ -641,6 +642,7 @@ void reader::impl::decode_stream_data(cudf::detail::hostdevice_2dvector<gpu::Col
     update_null_mask(chunks, out_buffers, stream, _mr);
   }
 
+  rmm::device_scalar<size_type> error_count(0, stream);
   // Update the null map for child columns
   gpu::DecodeOrcColumnData(chunks.base_device_ptr(),
                            global_dict.data(),
@@ -652,8 +654,12 @@ void reader::impl::decode_stream_data(cudf::detail::hostdevice_2dvector<gpu::Col
                            row_groups.size().first,
                            row_index_stride,
                            level,
+                           error_count.data(),
                            stream);
-  chunks.device_to_host(stream, true);
+  chunks.device_to_host(stream);
+  // `value` synchronizes
+  auto const num_errors = error_count.value(stream);
+  CUDF_EXPECTS(num_errors == 0, "ORC data decode failed");
 
   std::for_each(col_idx_it + 0, col_idx_it + num_columns, [&](auto col_idx) {
     out_buffers[col_idx].null_count() =
diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu
index bf883986c84..d0d077d2611 100644
--- a/cpp/src/io/orc/stripe_data.cu
+++ b/cpp/src/io/orc/stripe_data.cu
@@ -1378,7 +1378,8 @@ __global__ void __launch_bounds__(block_size)
                          device_2dspan<RowGroup> row_groups,
                          size_t first_row,
                          uint32_t rowidx_stride,
-                         size_t level)
+                         size_t level,
+                         size_type* error_count)
 {
   __shared__ __align__(16) orcdec_state_s state_g;
   using block_reduce = cub::BlockReduce<uint64_t, block_size>;
@@ -1410,6 +1411,12 @@ __global__ void __launch_bounds__(block_size)
   if (t == 0 and is_valid) {
     // If we have an index, seek to the initial run and update row positions
     if (num_rowgroups > 0) {
+      if (s->top.data.index.strm_offset[0] > s->chunk.strm_len[CI_DATA]) {
+        atomicAdd(error_count, 1);
+      }
+      if (s->top.data.index.strm_offset[1] > s->chunk.strm_len[CI_DATA2]) {
+        atomicAdd(error_count, 1);
+      }
       uint32_t ofs0 = min(s->top.data.index.strm_offset[0], s->chunk.strm_len[CI_DATA]);
       uint32_t ofs1 = min(s->top.data.index.strm_offset[1], s->chunk.strm_len[CI_DATA2]);
       uint32_t rowgroup_rowofs =
@@ -1884,6 +1891,7 @@ void __host__ DecodeOrcColumnData(ColumnDesc* chunks,
                                   uint32_t num_rowgroups,
                                   uint32_t rowidx_stride,
                                   size_t level,
+                                  size_type* error_count,
                                   rmm::cuda_stream_view stream)
 {
   uint32_t num_chunks = num_columns * num_stripes;
@@ -1891,7 +1899,7 @@ void __host__ DecodeOrcColumnData(ColumnDesc* chunks,
   dim3 dim_grid((num_rowgroups > 0) ? num_columns : num_chunks,
                 (num_rowgroups > 0) ? num_rowgroups : 1);
   gpuDecodeOrcColumnData<block_size><<<dim_grid, dim_block, 0, stream.value()>>>(
-    chunks, global_dictionary, tz_table, row_groups, first_row, rowidx_stride, level);
+    chunks, global_dictionary, tz_table, row_groups, first_row, rowidx_stride, level, error_count);
 }
 
 }  // namespace gpu
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index ffb32ee7588..312f49967c9 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -1848,3 +1848,16 @@ def test_reader_empty_stripe(datadir, fname):
     expected = pd.read_orc(path)
     got = cudf.read_orc(path)
     assert_eq(expected, got)
+
+
+@pytest.mark.xfail(
+    reason="https://github.com/rapidsai/cudf/issues/11890", raises=RuntimeError
+)
+def test_reader_unsupported_offsets():
+    # needs enough data for more than one row group
+    expected = cudf.DataFrame({"str": ["*"] * 10001}, dtype="string")
+
+    buffer = BytesIO()
+    expected.to_pandas().to_orc(buffer)
+    got = cudf.read_orc(buffer)
+    assert_eq(expected, got)