Skip to content

Commit

Permalink
Fix out-of-bounds access in ORC writer (#7902)
Browse files Browse the repository at this point in the history
Avoid out-of-bounds access due to streams holding column ids as `index + 1`, and the first index stream using zero for its column id. In a few places the corresponding column is accessed as `[column_id - 1]`, even when the id is zero.

Other changes:
Small refactoring of ORC streams creation. and stream offset computation.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - https://github.com/nvdbaranec
  - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)

URL: #7902
  • Loading branch information
vuule authored Apr 9, 2021
1 parent 80afff3 commit 348ad4d
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 79 deletions.
7 changes: 4 additions & 3 deletions cpp/src/io/orc/orc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,9 @@ void ProtobufReader::read(StripeFooter &s, size_t maxlen)

void ProtobufReader::read(Stream &s, size_t maxlen)
{
auto op = std::make_tuple(
make_field_reader(1, s.kind), make_field_reader(2, s.column), make_field_reader(3, s.length));
auto op = std::make_tuple(make_field_reader(1, s.kind),
make_field_reader(2, s.column_id),
make_field_reader(3, s.length));
function_builder(s, maxlen, op);
}

Expand Down Expand Up @@ -318,7 +319,7 @@ size_t ProtobufWriter::write(const Stream &s)
{
ProtobufFieldWriter w(this);
w.field_uint(1, s.kind);
w.field_uint(2, s.column);
if (s.column_id) w.field_uint(2, *s.column_id);
w.field_uint(3, s.length);
return w.value();
}
Expand Down
46 changes: 37 additions & 9 deletions cpp/src/io/orc/orc.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,22 @@

#pragma once

#include "orc_common.h"

#include <io/comp/io_uncomp.h>
#include <cudf/io/datasource.hpp>
#include <cudf/io/orc_metadata.hpp>
#include <cudf/utilities/error.hpp>

#include <thrust/optional.h>

#include <stddef.h>
#include <stdint.h>
#include <algorithm>
#include <memory>
#include <string>
#include <vector>

#include <cudf/utilities/error.hpp>

#include <io/comp/io_uncomp.h>
#include <cudf/io/datasource.hpp>
#include <cudf/io/orc_metadata.hpp>
#include "orc_common.h"

namespace cudf {
namespace io {
namespace orc {
Expand Down Expand Up @@ -80,8 +82,15 @@ struct FileFooter {

struct Stream {
StreamKind kind = INVALID_STREAM_KIND;
uint32_t column = ~0; // the column id
uint64_t length = 0; // the number of bytes in the file
std::optional<uint32_t> column_id; // ORC column id (different from column index in the table!)
uint64_t length = 0; // the number of bytes in the file

// Returns index of the column in the table, if any
// Stream of the 'column 0' does not have a corresponding column in the table
thrust::optional<uint32_t> column_index() const noexcept
{
return column_id.value_or(0) > 0 ? thrust::optional<uint32_t>{*column_id - 1} : thrust::nullopt;
}
};

struct ColumnEncoding {
Expand Down Expand Up @@ -227,6 +236,15 @@ class ProtobufReader {
return encode_field_number_base<typename T::element_type>(field_number);
}

// optional fields don't change the field number encoding
template <typename T,
typename std::enable_if_t<std::is_same<T, std::optional<typename T::value_type>>::value>
* = nullptr>
int static constexpr encode_field_number(int field_number) noexcept
{
return encode_field_number_base<typename T::value_type>(field_number);
}

uint32_t read_field_size(const uint8_t *end);

template <typename T, typename std::enable_if_t<std::is_integral<T>::value> * = nullptr>
Expand Down Expand Up @@ -279,6 +297,16 @@ class ProtobufReader {
value = std::make_unique<typename T::element_type>(std::move(contained_value));
}

template <typename T,
typename std::enable_if_t<std::is_same<T, std::optional<typename T::value_type>>::value>
* = nullptr>
void read_field(T &value, const uint8_t *end)
{
typename T::value_type contained_value;
read_field(contained_value, end);
value = std::optional<typename T::value_type>{std::move(contained_value)};
}

template <typename T>
auto read_field(T &value, const uint8_t *end) -> decltype(read(value, 0))
{
Expand Down
13 changes: 7 additions & 6 deletions cpp/src/io/orc/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -162,17 +162,18 @@ size_t gather_stream_info(const size_t stripe_index,
uint64_t src_offset = 0;
uint64_t dst_offset = 0;
for (const auto &stream : stripefooter->streams) {
if (stream.column >= orc2gdf.size()) {
if (!stream.column_id || *stream.column_id >= orc2gdf.size()) {
dst_offset += stream.length;
continue;
}

auto col = orc2gdf[stream.column];
auto const column_id = *stream.column_id;
auto col = orc2gdf[column_id];
if (col == -1) {
// A struct-type column has no data itself, but rather child columns
// for each of its fields. There is only a PRESENT stream, which
// needs to be included for the reader.
const auto schema_type = types[stream.column];
const auto schema_type = types[column_id];
if (schema_type.subtypes.size() != 0) {
if (schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) {
for (const auto &idx : schema_type.subtypes) {
Expand All @@ -192,16 +193,16 @@ size_t gather_stream_info(const size_t stripe_index,
// NOTE: skip_count field is temporarily used to track index ordering
auto &chunk = chunks[stripe_index * num_columns + col];
const auto idx =
get_index_type_and_pos(stream.kind, chunk.skip_count, col == orc2gdf[stream.column]);
get_index_type_and_pos(stream.kind, chunk.skip_count, col == orc2gdf[column_id]);
if (idx.first < gpu::CI_NUM_STREAMS) {
chunk.strm_id[idx.first] = stream_info.size();
chunk.strm_len[idx.first] = stream.length;
chunk.skip_count = idx.second;

if (idx.first == gpu::CI_DICTIONARY) {
chunk.dictionary_start = *num_dictionary_entries;
chunk.dict_len = stripefooter->columns[stream.column].dictionarySize;
*num_dictionary_entries += stripefooter->columns[stream.column].dictionarySize;
chunk.dict_len = stripefooter->columns[column_id].dictionarySize;
*num_dictionary_entries += stripefooter->columns[column_id].dictionarySize;
}
}
}
Expand Down
Loading

0 comments on commit 348ad4d

Please sign in to comment.