Fix out-of-bounds access in ORC writer (#7902)

Avoid out-of-bounds access due to streams holding column ids as `index + 1`, and the first index stream using zero for its column id. In a few places the corresponding column is accessed as `[column_id - 1]`, even when the id is zero. Other changes: Small refactoring of ORC streams creation. and stream offset computation. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - https://github.com/nvdbaranec - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) URL: #7902
rapidsai · Apr 9, 2021 · 348ad4d · 348ad4d
1 parent 80afff3
commit 348ad4d
Show file tree

Hide file tree

Showing 4 changed files with 119 additions and 79 deletions.
diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp
@@ -104,8 +104,9 @@ void ProtobufReader::read(StripeFooter &s, size_t maxlen)
 
 void ProtobufReader::read(Stream &s, size_t maxlen)
 {
-  auto op = std::make_tuple(
-    make_field_reader(1, s.kind), make_field_reader(2, s.column), make_field_reader(3, s.length));
+  auto op = std::make_tuple(make_field_reader(1, s.kind),
+                            make_field_reader(2, s.column_id),
+                            make_field_reader(3, s.length));
   function_builder(s, maxlen, op);
 }
 
@@ -318,7 +319,7 @@ size_t ProtobufWriter::write(const Stream &s)
 {
   ProtobufFieldWriter w(this);
   w.field_uint(1, s.kind);
-  w.field_uint(2, s.column);
+  if (s.column_id) w.field_uint(2, *s.column_id);
   w.field_uint(3, s.length);
   return w.value();
 }

diff --git a/cpp/src/io/orc/orc.h b/cpp/src/io/orc/orc.h
@@ -16,20 +16,22 @@
 
 #pragma once
 
+#include "orc_common.h"
+
+#include <io/comp/io_uncomp.h>
+#include <cudf/io/datasource.hpp>
+#include <cudf/io/orc_metadata.hpp>
+#include <cudf/utilities/error.hpp>
+
+#include <thrust/optional.h>
+
 #include <stddef.h>
 #include <stdint.h>
 #include <algorithm>
 #include <memory>
 #include <string>
 #include <vector>
 
-#include <cudf/utilities/error.hpp>
-
-#include <io/comp/io_uncomp.h>
-#include <cudf/io/datasource.hpp>
-#include <cudf/io/orc_metadata.hpp>
-#include "orc_common.h"
-
 namespace cudf {
 namespace io {
 namespace orc {
@@ -80,8 +82,15 @@ struct FileFooter {
 
 struct Stream {
   StreamKind kind = INVALID_STREAM_KIND;
-  uint32_t column = ~0;  // the column id
-  uint64_t length = 0;   // the number of bytes in the file
+  std::optional<uint32_t> column_id;  // ORC column id (different from column index in the table!)
+  uint64_t length = 0;                // the number of bytes in the file
+
+  // Returns index of the column in the table, if any
+  // Stream of the 'column 0' does not have a corresponding column in the table
+  thrust::optional<uint32_t> column_index() const noexcept
+  {
+    return column_id.value_or(0) > 0 ? thrust::optional<uint32_t>{*column_id - 1} : thrust::nullopt;
+  }
 };
 
 struct ColumnEncoding {
@@ -227,6 +236,15 @@ class ProtobufReader {
     return encode_field_number_base<typename T::element_type>(field_number);
   }
 
+  // optional fields don't change the field number encoding
+  template <typename T,
+            typename std::enable_if_t<std::is_same<T, std::optional<typename T::value_type>>::value>
+              * = nullptr>
+  int static constexpr encode_field_number(int field_number) noexcept
+  {
+    return encode_field_number_base<typename T::value_type>(field_number);
+  }
+
   uint32_t read_field_size(const uint8_t *end);
 
   template <typename T, typename std::enable_if_t<std::is_integral<T>::value> * = nullptr>
@@ -279,6 +297,16 @@ class ProtobufReader {
     value = std::make_unique<typename T::element_type>(std::move(contained_value));
   }
 
+  template <typename T,
+            typename std::enable_if_t<std::is_same<T, std::optional<typename T::value_type>>::value>
+              * = nullptr>
+  void read_field(T &value, const uint8_t *end)
+  {
+    typename T::value_type contained_value;
+    read_field(contained_value, end);
+    value = std::optional<typename T::value_type>{std::move(contained_value)};
+  }
+
   template <typename T>
   auto read_field(T &value, const uint8_t *end) -> decltype(read(value, 0))
   {

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
@@ -162,17 +162,18 @@ size_t gather_stream_info(const size_t stripe_index,
   uint64_t src_offset    = 0;
   uint64_t dst_offset    = 0;
   for (const auto &stream : stripefooter->streams) {
-    if (stream.column >= orc2gdf.size()) {
+    if (!stream.column_id || *stream.column_id >= orc2gdf.size()) {
       dst_offset += stream.length;
       continue;
     }
 
-    auto col = orc2gdf[stream.column];
+    auto const column_id = *stream.column_id;
+    auto col             = orc2gdf[column_id];
     if (col == -1) {
       // A struct-type column has no data itself, but rather child columns
       // for each of its fields. There is only a PRESENT stream, which
       // needs to be included for the reader.
-      const auto schema_type = types[stream.column];
+      const auto schema_type = types[column_id];
       if (schema_type.subtypes.size() != 0) {
         if (schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) {
           for (const auto &idx : schema_type.subtypes) {
@@ -192,16 +193,16 @@ size_t gather_stream_info(const size_t stripe_index,
         // NOTE: skip_count field is temporarily used to track index ordering
         auto &chunk = chunks[stripe_index * num_columns + col];
         const auto idx =
-          get_index_type_and_pos(stream.kind, chunk.skip_count, col == orc2gdf[stream.column]);
+          get_index_type_and_pos(stream.kind, chunk.skip_count, col == orc2gdf[column_id]);
         if (idx.first < gpu::CI_NUM_STREAMS) {
           chunk.strm_id[idx.first]  = stream_info.size();
           chunk.strm_len[idx.first] = stream.length;
           chunk.skip_count          = idx.second;
 
           if (idx.first == gpu::CI_DICTIONARY) {
             chunk.dictionary_start = *num_dictionary_entries;
-            chunk.dict_len         = stripefooter->columns[stream.column].dictionarySize;
-            *num_dictionary_entries += stripefooter->columns[stream.column].dictionarySize;
+            chunk.dict_len         = stripefooter->columns[column_id].dictionarySize;
+            *num_dictionary_entries += stripefooter->columns[column_id].dictionarySize;
           }
         }
       }