Skip to content

Commit

Permalink
Merge branch 'branch-22.02' into bug/include_optional_where_used
Browse files Browse the repository at this point in the history
  • Loading branch information
robertmaynard committed Jan 19, 2022
2 parents d2dffc0 + 8fd7dd2 commit 7c656a5
Show file tree
Hide file tree
Showing 29 changed files with 1,674 additions and 599 deletions.
65 changes: 38 additions & 27 deletions cpp/src/io/orc/orc.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -38,10 +38,10 @@ uint32_t ProtobufReader::read_field_size(const uint8_t* end)
void ProtobufReader::skip_struct_field(int t)
{
switch (t) {
case PB_TYPE_VARINT: get<uint32_t>(); break;
case PB_TYPE_FIXED64: skip_bytes(8); break;
case PB_TYPE_FIXEDLEN: skip_bytes(get<uint32_t>()); break;
case PB_TYPE_FIXED32: skip_bytes(4); break;
case ProtofType::VARINT: get<uint32_t>(); break;
case ProtofType::FIXED64: skip_bytes(8); break;
case ProtofType::FIXEDLEN: skip_bytes(get<uint32_t>()); break;
case ProtofType::FIXED32: skip_bytes(4); break;
default: break;
}
}
Expand Down Expand Up @@ -209,43 +209,54 @@ void ProtobufWriter::put_row_index_entry(int32_t present_blk,
int32_t data_ofs,
int32_t data2_blk,
int32_t data2_ofs,
TypeKind kind)
TypeKind kind,
ColStatsBlob const* stats)
{
size_t sz = 0, lpos;
putb(1 * 8 + PB_TYPE_FIXEDLEN); // 1:RowIndex.entry
put_uint(encode_field_number(1, ProtofType::FIXEDLEN)); // 1:RowIndex.entry
lpos = m_buf->size();
putb(0xcd); // sz+2
putb(1 * 8 + PB_TYPE_FIXEDLEN); // 1:positions[packed=true]
putb(0xcd); // sz
put_byte(0xcd); // sz+2
put_uint(encode_field_number(1, ProtofType::FIXEDLEN)); // 1:positions[packed=true]
put_byte(0xcd); // sz
if (present_blk >= 0) sz += put_uint(present_blk);
if (present_ofs >= 0) {
sz += put_uint(present_ofs) + 2;
putb(0); // run pos = 0
putb(0); // bit pos = 0
sz += put_uint(present_ofs);
sz += put_byte(0); // run pos = 0
sz += put_byte(0); // bit pos = 0
}
if (data_blk >= 0) { sz += put_uint(data_blk); }
if (data_ofs >= 0) {
sz += put_uint(data_ofs);
if (kind != STRING && kind != FLOAT && kind != DOUBLE && kind != DECIMAL) {
putb(0); // RLE run pos always zero (assumes RLE aligned with row index boundaries)
sz++;
// RLE run pos always zero (assumes RLE aligned with row index boundaries)
sz += put_byte(0);
if (kind == BOOLEAN) {
putb(0); // bit position in byte, always zero
sz++;
// bit position in byte, always zero
sz += put_byte(0);
}
}
}
if (kind !=
INT) // INT kind can be passed in to bypass 2nd stream index (dictionary length streams)
{
// INT kind can be passed in to bypass 2nd stream index (dictionary length streams)
if (kind != INT) {
if (data2_blk >= 0) { sz += put_uint(data2_blk); }
if (data2_ofs >= 0) {
sz += put_uint(data2_ofs) + 1;
putb(0); // RLE run pos always zero (assumes RLE aligned with row index boundaries)
sz += put_uint(data2_ofs);
// RLE run pos always zero (assumes RLE aligned with row index boundaries)
sz += put_byte(0);
}
}
m_buf->data()[lpos] = (uint8_t)(sz + 2);
// size of the field 1
m_buf->data()[lpos + 2] = (uint8_t)(sz);

if (stats != nullptr) {
sz += put_uint(encode_field_number<decltype(*stats)>(2)); // 2: statistics
// Statistics field contains its length as varint and dtype specific data (encoded on the GPU)
sz += put_uint(stats->size());
sz += put_bytes<typename ColStatsBlob::value_type>(*stats);
}

// size of the whole row index entry
m_buf->data()[lpos] = (uint8_t)(sz + 2);
}

size_t ProtobufWriter::write(const PostScript& s)
Expand All @@ -256,7 +267,7 @@ size_t ProtobufWriter::write(const PostScript& s)
if (s.compression != NONE) { w.field_uint(3, s.compressionBlockSize); }
w.field_packed_uint(4, s.version);
w.field_uint(5, s.metadataLength);
w.field_string(8000, s.magic);
w.field_blob(8000, s.magic);
return w.value();
}

Expand Down Expand Up @@ -300,8 +311,8 @@ size_t ProtobufWriter::write(const SchemaType& s)
size_t ProtobufWriter::write(const UserMetadataItem& s)
{
ProtobufFieldWriter w(this);
w.field_string(1, s.name);
w.field_string(2, s.value);
w.field_blob(1, s.name);
w.field_blob(2, s.value);
return w.value();
}

Expand All @@ -310,7 +321,7 @@ size_t ProtobufWriter::write(const StripeFooter& s)
ProtobufFieldWriter w(this);
w.field_repeated_struct(1, s.streams);
w.field_repeated_struct(2, s.columns);
if (s.writerTimezone != "") { w.field_string(3, s.writerTimezone); }
if (s.writerTimezone != "") { w.field_blob(3, s.writerTimezone); }
return w.value();
}

Expand Down
138 changes: 79 additions & 59 deletions cpp/src/io/orc/orc.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -131,6 +131,67 @@ struct Metadata {
std::vector<StripeStatistics> stripeStats;
};

/**
 * @brief Combines a field number and a Protobuf field type into a single encoded value.
 *
 * The field number is multiplied by 8 and the numeric value of the field type is added to it.
 *
 * @param field_number Number of the field within its message
 * @param field_type Protobuf type of the field
 * @return The encoded field number
 */
inline constexpr int encode_field_number(int field_number, ProtofType field_type) noexcept
{
  auto const type_bits = static_cast<int>(field_type);
  return field_number * 8 + type_bits;
}

// Helpers that pick the Protobuf field type matching a C++ type and encode it together with the
// field number.
//
// These are defined directly in the enclosing namespace rather than in an unnamed namespace
// (and without `static`): this is a header, and internal linkage would give every including
// translation unit its own copy of the helpers, which the externally linked
// `encode_field_number<T>` templates would then refer to differently in each translation unit.
// Function templates may be defined in a header without `inline`.

/**
 * @brief Encodes the field number for types that are neither arithmetic nor enumerations.
 *
 * @tparam base_t Type of the field
 * @param field_number Number of the field within its message
 * @return `field_number` encoded with `ProtofType::FIXEDLEN`
 */
template <typename base_t,
          typename std::enable_if_t<!std::is_arithmetic<base_t>::value and
                                    !std::is_enum<base_t>::value>* = nullptr>
constexpr int encode_field_number_base(int field_number) noexcept
{
  return encode_field_number(field_number, ProtofType::FIXEDLEN);
}

/**
 * @brief Encodes the field number for integral and enumeration types.
 *
 * @tparam base_t Type of the field
 * @param field_number Number of the field within its message
 * @return `field_number` encoded with `ProtofType::VARINT`
 */
template <typename base_t,
          typename std::enable_if_t<std::is_integral<base_t>::value or
                                    std::is_enum<base_t>::value>* = nullptr>
constexpr int encode_field_number_base(int field_number) noexcept
{
  return encode_field_number(field_number, ProtofType::VARINT);
}

/**
 * @brief Encodes the field number for `float` fields.
 *
 * @tparam base_t Type of the field; this overload participates only for `float`
 * @param field_number Number of the field within its message
 * @return `field_number` encoded with `ProtofType::FIXED32`
 */
template <typename base_t, typename std::enable_if_t<std::is_same_v<base_t, float>>* = nullptr>
constexpr int encode_field_number_base(int field_number) noexcept
{
  return encode_field_number(field_number, ProtofType::FIXED32);
}

/**
 * @brief Encodes the field number for `double` fields.
 *
 * @tparam base_t Type of the field; this overload participates only for `double`
 * @param field_number Number of the field within its message
 * @return `field_number` encoded with `ProtofType::FIXED64`
 */
template <typename base_t, typename std::enable_if_t<std::is_same_v<base_t, double>>* = nullptr>
constexpr int encode_field_number_base(int field_number) noexcept
{
  return encode_field_number(field_number, ProtofType::FIXED64);
}

/**
 * @brief Encodes the field number for non-class types and for `std::string`.
 *
 * The type is forwarded unchanged to `encode_field_number_base`.
 *
 * @tparam T Type of the field
 * @param field_number Number of the field within its message
 * @return The encoded field number
 */
template <typename T,
          typename std::enable_if_t<!std::is_class_v<T> || std::is_same_v<T, std::string>>* =
            nullptr>
constexpr auto encode_field_number(int field_number) noexcept -> int
{
  return encode_field_number_base<T>(field_number);
}

// Containers change the field number encoding: for a std::vector field the vector type itself,
// not its element type, is passed on to encode_field_number_base.
template <typename T,
          typename std::enable_if_t<
            std::is_same_v<T, std::vector<typename T::value_type>>>* = nullptr>
constexpr auto encode_field_number(int field_number) noexcept -> int
{
  return encode_field_number_base<T>(field_number);
}

// optional fields don't change the field number encoding
template <typename T,
typename std::enable_if_t<
std::is_same<T, std::optional<typename T::value_type>>::value>* = nullptr>
int constexpr encode_field_number(int field_number) noexcept
{
return encode_field_number_base<typename T::value_type>(field_number);
}

/**
* @brief Class for parsing Orc's Protocol Buffers encoded metadata
*/
Expand Down Expand Up @@ -181,60 +242,6 @@ class ProtobufReader {
template <typename T, typename... Operator>
void function_builder(T& s, size_t maxlen, std::tuple<Operator...>& op);

// Types that are neither arithmetic nor enumerations: field number times 8 plus PB_TYPE_FIXEDLEN.
template <typename base_t,
          typename std::enable_if_t<!std::is_arithmetic_v<base_t> &&
                                    !std::is_enum_v<base_t>>* = nullptr>
static constexpr int encode_field_number_base(int field_number) noexcept
{
  constexpr int type_bits = PB_TYPE_FIXEDLEN;
  return field_number * 8 + type_bits;
}

// Integral and enumeration types: field number times 8 plus PB_TYPE_VARINT.
template <typename base_t,
          typename std::enable_if_t<std::is_integral_v<base_t> ||
                                    std::is_enum_v<base_t>>* = nullptr>
static constexpr int encode_field_number_base(int field_number) noexcept
{
  constexpr int type_bits = PB_TYPE_VARINT;
  return field_number * 8 + type_bits;
}

// float fields: field number times 8 plus PB_TYPE_FIXED32.
template <typename base_t,
          typename std::enable_if_t<std::is_same<base_t, float>::value>* = nullptr>
static constexpr int encode_field_number_base(int field_number) noexcept
{
  constexpr int type_bits = PB_TYPE_FIXED32;
  return field_number * 8 + type_bits;
}

// double fields: field number times 8 plus PB_TYPE_FIXED64.
template <typename base_t,
          typename std::enable_if_t<std::is_same<base_t, double>::value>* = nullptr>
static constexpr int encode_field_number_base(int field_number) noexcept
{
  constexpr int type_bits = PB_TYPE_FIXED64;
  return field_number * 8 + type_bits;
}

// Non-class types and std::string are forwarded unchanged to encode_field_number_base.
template <typename T,
          typename std::enable_if_t<!std::is_class_v<T> || std::is_same_v<T, std::string>>* =
            nullptr>
static constexpr auto encode_field_number(int field_number) noexcept -> int
{
  return encode_field_number_base<T>(field_number);
}

// Containers change the field number encoding: for a std::vector field the vector type itself,
// not its element type, is passed on to encode_field_number_base.
template <typename T,
          typename std::enable_if_t<
            std::is_same_v<T, std::vector<typename T::value_type>>>* = nullptr>
static constexpr auto encode_field_number(int field_number) noexcept -> int
{
  return encode_field_number_base<T>(field_number);
}

// optional fields don't change the field number encoding
template <typename T,
typename std::enable_if_t<
std::is_same<T, std::optional<typename T::value_type>>::value>* = nullptr>
int static constexpr encode_field_number(int field_number) noexcept
{
return encode_field_number_base<typename T::value_type>(field_number);
}

uint32_t read_field_size(const uint8_t* end);

template <typename T, typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
Expand Down Expand Up @@ -470,16 +477,28 @@ class ProtobufWriter {
public:
ProtobufWriter() { m_buf = nullptr; }
ProtobufWriter(std::vector<uint8_t>* output) { m_buf = output; }
// Appends a single byte to the end of the output buffer.
void putb(uint8_t v)
{
  m_buf->emplace_back(v);
}
/**
 * @brief Appends a single byte to the end of the output buffer.
 *
 * @param v Byte to append
 * @return Number of bytes written, always 1
 */
uint32_t put_byte(uint8_t v)
{
  constexpr uint32_t bytes_written = 1;
  m_buf->emplace_back(v);
  return bytes_written;
}
/**
 * @brief Appends a sequence of single-byte values to the end of the output buffer.
 *
 * @tparam T Element type of the span; must be exactly one byte wide
 * @param values Values to append, in order
 * @return Number of bytes written
 */
template <typename T>
uint32_t put_bytes(host_span<T const> values)
{
  static_assert(sizeof(T) == 1, "put_bytes only accepts single-byte element types");
  // No explicit reserve(size() + n) here: requesting the exact size before every append makes
  // the capacity grow linearly, so repeated calls would reallocate (and copy the whole buffer)
  // each time. The range insert sizes the buffer itself and keeps the vector's amortized growth.
  m_buf->insert(m_buf->end(), values.begin(), values.end());
  // The return type is narrower than size_t; make the conversion explicit.
  return static_cast<uint32_t>(values.size());
}
uint32_t put_uint(uint64_t v)
{
int l = 1;
while (v > 0x7f) {
putb(static_cast<uint8_t>(v | 0x80));
put_byte(static_cast<uint8_t>(v | 0x80));
v >>= 7;
l++;
}
putb(static_cast<uint8_t>(v));
put_byte(static_cast<uint8_t>(v));
return l;
}
uint32_t put_int(int64_t v)
Expand All @@ -493,7 +512,8 @@ class ProtobufWriter {
int32_t data_ofs,
int32_t data2_blk,
int32_t data2_ofs,
TypeKind kind);
TypeKind kind,
ColStatsBlob const* stats);

public:
size_t write(const PostScript&);
Expand Down
20 changes: 10 additions & 10 deletions cpp/src/io/orc/orc_common.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -76,15 +76,15 @@ enum ColumnEncodingKind : int8_t {
DICTIONARY_V2 = 3, // the encoding is dictionary-based using RLE v2
};

enum : uint8_t { // Protobuf field types
PB_TYPE_VARINT = 0,
PB_TYPE_FIXED64 = 1,
PB_TYPE_FIXEDLEN = 2,
PB_TYPE_START_GROUP = 3, // deprecated
PB_TYPE_END_GROUP = 4, // deprecated
PB_TYPE_FIXED32 = 5,
PB_TYPE_INVALID_6 = 6,
PB_TYPE_INVALID_7 = 7,
// Protobuf field types. The numeric value is what gets added to (field number * 8) when a field
// number is encoded, so the values must not be changed.
enum ProtofType : uint8_t {
VARINT = 0, // variable-length integer
FIXED64 = 1, // fixed 8-byte value
FIXEDLEN = 2, // length-prefixed payload: a varint length followed by that many bytes
START_GROUP = 3, // deprecated
END_GROUP = 4, // deprecated
FIXED32 = 5, // fixed 4-byte value
INVALID_6 = 6, // not a valid field type
INVALID_7 = 7, // not a valid field type
};

} // namespace orc
Expand Down
Loading

0 comments on commit 7c656a5

Please sign in to comment.