From 184fe983ab3c9ac6d088a675dca5f1373787c54a Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 4 Dec 2023 18:13:01 -0800 Subject: [PATCH 01/12] read()s --- .../io/parquet/compact_protocol_reader.cpp | 90 ++++++++++--------- .../io/parquet/compact_protocol_reader.hpp | 42 ++++----- cpp/src/io/parquet/reader_impl_helpers.cpp | 2 +- cpp/tests/io/parquet_test.cpp | 25 +++--- 4 files changed, 79 insertions(+), 80 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 5a2b8aa8f2a..7e5d9a49acb 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -266,7 +266,9 @@ class parquet_field_struct : public parquet_field { inline bool operator()(CompactProtocolReader* cpr, int field_type) { - return (field_type != ST_FLD_STRUCT || !(cpr->read(&val))); + if (field_type != ST_FLD_STRUCT) { return true; } + cpr->read(&val); + return false; } }; @@ -332,7 +334,7 @@ struct parquet_field_struct_list : public parquet_field_list { parquet_field_struct_list(int f, std::vector& v) : parquet_field_list(f, v, ST_FLD_STRUCT) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { - if (not cpr->read(&this->val[i])) { return true; } + cpr->read(&this->val[i]); return false; }; this->bind_read_func(read_value); @@ -524,7 +526,7 @@ inline bool function_builder(CompactProtocolReader* cpr, std::tuple return true; } -bool CompactProtocolReader::read(FileMetaData* f) +void CompactProtocolReader::read(FileMetaData* f) { using optional_list_column_order = parquet_field_optional, parquet_field_struct_list>; @@ -535,10 +537,10 @@ bool CompactProtocolReader::read(FileMetaData* f) parquet_field_struct_list(5, f->key_value_metadata), parquet_field_string(6, f->created_by), optional_list_column_order(7, f->column_orders)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(SchemaElement* s) +void CompactProtocolReader::read(SchemaElement* s) { using optional_converted_type = parquet_field_optional>; @@ -554,10 +556,10 @@ bool CompactProtocolReader::read(SchemaElement* s) parquet_field_int32(8, s->decimal_precision), parquet_field_optional(9, s->field_id), optional_logical_type(10, s->logical_type)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(LogicalType* l) +void CompactProtocolReader::read(LogicalType* l) { auto op = std::make_tuple( parquet_field_union_enumerator(1, l->type), @@ -572,52 +574,52 @@ bool CompactProtocolReader::read(LogicalType* l) parquet_field_union_enumerator(11, l->type), parquet_field_union_enumerator(12, l->type), parquet_field_union_enumerator(13, l->type)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(DecimalType* d) +void CompactProtocolReader::read(DecimalType* d) { auto op = std::make_tuple(parquet_field_int32(1, d->scale), parquet_field_int32(2, d->precision)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(TimeType* t) +void CompactProtocolReader::read(TimeType* t) { auto op = std::make_tuple(parquet_field_bool(1, t->isAdjustedToUTC), parquet_field_struct(2, t->unit)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(TimestampType* t) +void CompactProtocolReader::read(TimestampType* t) { auto op = std::make_tuple(parquet_field_bool(1, t->isAdjustedToUTC), parquet_field_struct(2, t->unit)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(TimeUnit* u) +void CompactProtocolReader::read(TimeUnit* u) { auto op = std::make_tuple(parquet_field_union_enumerator(1, u->type), parquet_field_union_enumerator(2, u->type), parquet_field_union_enumerator(3, u->type)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(IntType* i) +void CompactProtocolReader::read(IntType* i) { auto op = std::make_tuple(parquet_field_int8(1, i->bitWidth), parquet_field_bool(2, i->isSigned)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(RowGroup* r) +void CompactProtocolReader::read(RowGroup* r) { auto op = std::make_tuple(parquet_field_struct_list(1, r->columns), parquet_field_int64(2, r->total_byte_size), parquet_field_int64(3, r->num_rows)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(ColumnChunk* c) +void CompactProtocolReader::read(ColumnChunk* c) { auto op = std::make_tuple(parquet_field_string(1, c->file_path), parquet_field_int64(2, c->file_offset), @@ -626,10 +628,10 @@ bool CompactProtocolReader::read(ColumnChunk* c) parquet_field_int32(5, c->offset_index_length), parquet_field_int64(6, c->column_index_offset), parquet_field_int32(7, c->column_index_length)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(ColumnChunkMetaData* c) +void CompactProtocolReader::read(ColumnChunkMetaData* c) { auto op = std::make_tuple(parquet_field_enum(1, c->type), parquet_field_enum_list(2, c->encodings), @@ -642,10 +644,10 @@ bool CompactProtocolReader::read(ColumnChunkMetaData* c) parquet_field_int64(10, c->index_page_offset), parquet_field_int64(11, c->dictionary_page_offset), parquet_field_struct(12, c->statistics)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(PageHeader* p) +void CompactProtocolReader::read(PageHeader* p) { auto op = std::make_tuple(parquet_field_enum(1, p->type), parquet_field_int32(2, p->uncompressed_page_size), @@ -653,26 +655,26 @@ bool CompactProtocolReader::read(PageHeader* p) parquet_field_struct(5, p->data_page_header), parquet_field_struct(7, p->dictionary_page_header), parquet_field_struct(8, p->data_page_header_v2)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(DataPageHeader* d) +void CompactProtocolReader::read(DataPageHeader* d) { auto op = std::make_tuple(parquet_field_int32(1, d->num_values), parquet_field_enum(2, d->encoding), parquet_field_enum(3, d->definition_level_encoding), parquet_field_enum(4, d->repetition_level_encoding)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(DictionaryPageHeader* d) +void CompactProtocolReader::read(DictionaryPageHeader* d) { auto op = std::make_tuple(parquet_field_int32(1, d->num_values), parquet_field_enum(2, d->encoding)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(DataPageHeaderV2* d) +void CompactProtocolReader::read(DataPageHeaderV2* d) { auto op = std::make_tuple(parquet_field_int32(1, d->num_values), parquet_field_int32(2, d->num_nulls), @@ -681,40 +683,40 @@ bool CompactProtocolReader::read(DataPageHeaderV2* d) parquet_field_int32(5, d->definition_levels_byte_length), parquet_field_int32(6, d->repetition_levels_byte_length), parquet_field_bool(7, d->is_compressed)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(KeyValue* k) +void CompactProtocolReader::read(KeyValue* k) { auto op = std::make_tuple(parquet_field_string(1, k->key), parquet_field_string(2, k->value)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(PageLocation* p) +void CompactProtocolReader::read(PageLocation* p) { auto op = std::make_tuple(parquet_field_int64(1, p->offset), parquet_field_int32(2, p->compressed_page_size), parquet_field_int64(3, p->first_row_index)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(OffsetIndex* o) +void CompactProtocolReader::read(OffsetIndex* o) { auto op = std::make_tuple(parquet_field_struct_list(1, o->page_locations)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(ColumnIndex* c) +void CompactProtocolReader::read(ColumnIndex* c) { auto op = std::make_tuple(parquet_field_bool_list(1, c->null_pages), parquet_field_binary_list(2, c->min_values), parquet_field_binary_list(3, c->max_values), parquet_field_enum(4, c->boundary_order), parquet_field_int64_list(5, c->null_counts)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(Statistics* s) +void CompactProtocolReader::read(Statistics* s) { using optional_binary = parquet_field_optional, parquet_field_binary>; using optional_int64 = parquet_field_optional; @@ -725,13 +727,13 @@ bool CompactProtocolReader::read(Statistics* s) optional_int64(4, s->distinct_count), optional_binary(5, s->max_value), optional_binary(6, s->min_value)); - return function_builder(this, op); + function_builder(this, op); } -bool CompactProtocolReader::read(ColumnOrder* c) +void CompactProtocolReader::read(ColumnOrder* c) { auto op = std::make_tuple(parquet_field_union_enumerator(1, c->type)); - return function_builder(this, op); + function_builder(this, op); } /** diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index cbb4161b138..5be72280251 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -98,27 +98,27 @@ class CompactProtocolReader { public: // Generate Thrift structure parsing routines - bool read(FileMetaData* f); - bool read(SchemaElement* s); - bool read(LogicalType* l); - bool read(DecimalType* d); - bool read(TimeType* t); - bool read(TimeUnit* u); - bool read(TimestampType* t); - bool read(IntType* t); - bool read(RowGroup* r); - bool read(ColumnChunk* c); - bool read(ColumnChunkMetaData* c); - bool read(PageHeader* p); - bool read(DataPageHeader* d); - bool read(DictionaryPageHeader* d); - bool read(DataPageHeaderV2* d); - bool read(KeyValue* k); - bool read(PageLocation* p); - bool read(OffsetIndex* o); - bool read(ColumnIndex* c); - bool read(Statistics* s); - bool read(ColumnOrder* c); + void read(FileMetaData* f); + void read(SchemaElement* s); + void read(LogicalType* l); + void read(DecimalType* d); + void read(TimeType* t); + void read(TimeUnit* u); + void read(TimestampType* t); + void read(IntType* t); + void read(RowGroup* r); + void read(ColumnChunk* c); + void read(ColumnChunkMetaData* c); + void read(PageHeader* p); + void read(DataPageHeader* d); + void read(DictionaryPageHeader* d); + void read(DataPageHeaderV2* d); + void read(KeyValue* k); + void read(PageLocation* p); + void read(OffsetIndex* o); + void read(ColumnIndex* c); + void read(Statistics* s); + void read(ColumnOrder* c); public: static int NumRequiredBits(uint32_t max_level) noexcept diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index a9c84143e1a..f583a779290 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -264,7 +264,7 @@ metadata::metadata(datasource* source) auto const buffer = source->host_read(len - ender->footer_len - ender_len, ender->footer_len); CompactProtocolReader cp(buffer->data(), ender->footer_len); - CUDF_EXPECTS(cp.read(this), "Cannot parse metadata"); + cp.read(this); CUDF_EXPECTS(cp.InitSchema(this), "Cannot initialize schema"); sanitize_schema(); } diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index fece83f891b..3774193096e 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -225,9 +225,7 @@ void read_footer(std::unique_ptr const& source, source->host_read(len - ender->footer_len - ender_len, ender->footer_len); cudf::io::parquet::detail::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len); - // returns true on success - bool res = cp.read(file_meta_data); - ASSERT_TRUE(res); + cp.read(file_meta_data); } // returns the number of bits used for dictionary encoding data at the given page location. @@ -242,8 +240,7 @@ int read_dict_bits(std::unique_ptr const& source, cudf::io::parquet::detail::PageHeader page_hdr; auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); - bool res = cp.read(&page_hdr); - CUDF_EXPECTS(res, "Cannot parse page header"); + cp.read(&page_hdr); // cp should be pointing at the start of page data now. the first byte // should be the encoding bit size @@ -263,8 +260,7 @@ cudf::io::parquet::detail::ColumnIndex read_column_index( cudf::io::parquet::detail::ColumnIndex colidx; auto const ci_buf = source->host_read(chunk.column_index_offset, chunk.column_index_length); cudf::io::parquet::detail::CompactProtocolReader cp(ci_buf->data(), ci_buf->size()); - bool res = cp.read(&colidx); - CUDF_EXPECTS(res, "Cannot parse column index"); + cp.read(&colidx); return colidx; } @@ -281,8 +277,7 @@ cudf::io::parquet::detail::OffsetIndex read_offset_index( cudf::io::parquet::detail::OffsetIndex offidx; auto const oi_buf = source->host_read(chunk.offset_index_offset, chunk.offset_index_length); cudf::io::parquet::detail::CompactProtocolReader cp(oi_buf->data(), oi_buf->size()); - bool res = cp.read(&offidx); - CUDF_EXPECTS(res, "Cannot parse offset index"); + cp.read(&offidx); return offidx; } @@ -306,8 +301,7 @@ cudf::io::parquet::detail::PageHeader read_page_header( cudf::io::parquet::detail::PageHeader page_hdr; auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); - bool res = cp.read(&page_hdr); - CUDF_EXPECTS(res, "Cannot parse page header"); + cp.read(&page_hdr); return page_hdr; } @@ -351,10 +345,12 @@ struct ParquetWriterSchemaTest : public ParquetWriterTest { }; template -struct ParquetReaderSourceTest : public ParquetReaderTest {}; +struct ParquetReaderSourceTest : public ParquetReaderTest { +}; template -struct ParquetWriterDeltaTest : public ParquetWriterTest {}; +struct ParquetWriterDeltaTest : public ParquetWriterTest { +}; // Declare typed test cases // TODO: Replace with `NumericTypes` when unsigned support is added. Issue #5352 @@ -6084,7 +6080,8 @@ TEST_F(ParquetReaderTest, FilterNamedExpression) // Test for Types - numeric, chrono, string. template -struct ParquetReaderPredicatePushdownTest : public ParquetReaderTest {}; +struct ParquetReaderPredicatePushdownTest : public ParquetReaderTest { +}; // These chrono types are not supported because parquet writer does not have a type to represent // them. From 400825d0b54efbcd17b5537305aaa2adac967c92 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 5 Dec 2023 12:26:11 -0800 Subject: [PATCH 02/12] skip and builder --- .../io/parquet/compact_protocol_reader.cpp | 32 +++++++++---------- .../io/parquet/compact_protocol_reader.hpp | 2 +- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 7e5d9a49acb..6ff70df703e 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -16,6 +16,8 @@ #include "compact_protocol_reader.hpp" +#include + #include #include #include @@ -440,7 +442,7 @@ class parquet_field_optional : public parquet_field { * * @return True if the struct type is recognized, false otherwise */ -bool CompactProtocolReader::skip_struct_field(int t, int depth) +void CompactProtocolReader::skip_struct_field(int t, int depth) { switch (t) { case ST_FLD_TRUE: @@ -454,7 +456,7 @@ bool CompactProtocolReader::skip_struct_field(int t, int depth) case ST_FLD_LIST: [[fallthrough]]; case ST_FLD_SET: { auto const [t, n] = get_listh(); - if (depth > 10) { return false; } + CUDF_EXPECTS(depth <= 10, "struct nesting too deep"); for (uint32_t i = 0; i < n; i++) { skip_struct_field(t, depth + 1); } @@ -465,7 +467,7 @@ bool CompactProtocolReader::skip_struct_field(int t, int depth) t = c & 0xf; if (c == 0) { break; } // end of struct if ((c & 0xf0) == 0) { get_i16(); } // field id is not a delta - if (depth > 10) { return false; } + CUDF_EXPECTS(depth <= 10, "struct nesting too deep"); skip_struct_field(t, depth + 1); } break; @@ -473,21 +475,20 @@ bool CompactProtocolReader::skip_struct_field(int t, int depth) // printf("unsupported skip for type %d\n", t); break; } - return true; } template struct FunctionSwitchImpl { template - static inline bool run(CompactProtocolReader* cpr, + static inline void run(CompactProtocolReader* cpr, int field_type, int const& field, std::tuple& ops) { if (field == std::get(ops).field()) { - return std::get(ops)(cpr, field_type); + std::get(ops)(cpr, field_type); } else { - return FunctionSwitchImpl::run(cpr, field_type, field, ops); + FunctionSwitchImpl::run(cpr, field_type, field, ops); } } }; @@ -495,35 +496,32 @@ struct FunctionSwitchImpl { template <> struct FunctionSwitchImpl<0> { template - static inline bool run(CompactProtocolReader* cpr, + static inline void run(CompactProtocolReader* cpr, int field_type, int const& field, std::tuple& ops) { if (field == std::get<0>(ops).field()) { - return std::get<0>(ops)(cpr, field_type); + std::get<0>(ops)(cpr, field_type); } else { cpr->skip_struct_field(field_type); - return false; } } }; template -inline bool function_builder(CompactProtocolReader* cpr, std::tuple& op) +inline void function_builder(CompactProtocolReader* cpr, std::tuple& op) { constexpr int index = std::tuple_size>::value - 1; int field = 0; while (true) { int const current_byte = cpr->getb(); if (!current_byte) { break; } - int const field_delta = current_byte >> 4; - int const field_type = current_byte & 0xf; - field = field_delta ? field + field_delta : cpr->get_i16(); - bool const exit_function = FunctionSwitchImpl::run(cpr, field_type, field, op); - if (exit_function) { return false; } + int const field_delta = current_byte >> 4; + int const field_type = current_byte & 0xf; + field = field_delta ? field + field_delta : cpr->get_i16(); + FunctionSwitchImpl::run(cpr, field_type, field, op); } - return true; } void CompactProtocolReader::read(FileMetaData* f) diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index 5be72280251..609f65a4f20 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -94,7 +94,7 @@ class CompactProtocolReader { return {t, sz}; } - bool skip_struct_field(int t, int depth = 0); + void skip_struct_field(int t, int depth = 0); public: // Generate Thrift structure parsing routines From 122c423271bf26abdc2e2667a8c0cda8d52ac1d9 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 5 Dec 2023 17:46:11 -0800 Subject: [PATCH 03/12] field functors --- .../io/parquet/compact_protocol_reader.cpp | 120 +++++++----------- 1 file changed, 48 insertions(+), 72 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 6ff70df703e..3404560494e 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -48,7 +48,7 @@ class parquet_field { template class parquet_field_list : public parquet_field { private: - using read_func_type = std::function; + using read_func_type = std::function; FieldType _expected_type; read_func_type _read_value; @@ -63,16 +63,15 @@ class parquet_field_list : public parquet_field { } public: - inline bool operator()(CompactProtocolReader* cpr, int field_type) + inline void operator()(CompactProtocolReader* cpr, int field_type) { - if (field_type != ST_FLD_LIST) { return true; } + CUDF_EXPECTS(field_type == ST_FLD_LIST, "expected list field"); auto const [t, n] = cpr->get_listh(); - if (t != _expected_type) { return true; } + CUDF_EXPECTS(t == _expected_type, "list type mismatch"); val.resize(n); for (uint32_t i = 0; i < n; i++) { - if (_read_value(i, cpr)) { return true; } + _read_value(i, cpr); } - return false; } }; @@ -89,11 +88,10 @@ class parquet_field_bool : public parquet_field { public: parquet_field_bool(int f, bool& v) : parquet_field(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) + inline void operator()(CompactProtocolReader* cpr, int field_type) { - if (field_type != ST_FLD_TRUE && field_type != ST_FLD_FALSE) { return true; } + CUDF_EXPECTS(field_type == ST_FLD_TRUE || field_type == ST_FLD_FALSE, "expected bool field"); val = field_type == ST_FLD_TRUE; - return false; } }; @@ -108,9 +106,9 @@ struct parquet_field_bool_list : public parquet_field_list { { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { auto const current_byte = cpr->getb(); - if (current_byte != ST_FLD_TRUE && current_byte != ST_FLD_FALSE) { return true; } + CUDF_EXPECTS(current_byte == ST_FLD_TRUE || current_byte == ST_FLD_FALSE, + "expected bool field"); this->val[i] = current_byte == ST_FLD_TRUE; - return false; }; bind_read_func(read_value); } @@ -132,14 +130,14 @@ class parquet_field_int : public parquet_field { public: parquet_field_int(int f, T& v) : parquet_field(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) + inline void operator()(CompactProtocolReader* cpr, int field_type) { if constexpr (is_byte) { val = cpr->getb(); } else { val = cpr->get_zigzag(); } - return (field_type != EXPECTED_TYPE); + CUDF_EXPECTS(field_type == EXPECTED_TYPE, "unexpected int field type"); } }; @@ -159,7 +157,6 @@ struct parquet_field_int_list : public parquet_field_list { { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { this->val[i] = cpr->get_zigzag(); - return false; }; this->bind_read_func(read_value); } @@ -179,17 +176,14 @@ class parquet_field_string : public parquet_field { public: parquet_field_string(int f, std::string& v) : parquet_field(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) + inline void operator()(CompactProtocolReader* cpr, int field_type) { - if (field_type != ST_FLD_BINARY) { return true; } + CUDF_EXPECTS(field_type == ST_FLD_BINARY, "expected binary field"); auto const n = cpr->get_u32(); - if (n < static_cast(cpr->m_end - cpr->m_cur)) { - val.assign(reinterpret_cast(cpr->m_cur), n); - cpr->m_cur += n; - return false; - } else { - return true; - } + CUDF_EXPECTS(n < static_cast(cpr->m_end - cpr->m_cur), "string length mismatch"); + + val.assign(reinterpret_cast(cpr->m_cur), n); + cpr->m_cur += n; } }; @@ -205,13 +199,10 @@ struct parquet_field_string_list : public parquet_field_list { { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { auto const l = cpr->get_u32(); - if (l < static_cast(cpr->m_end - cpr->m_cur)) { - this->val[i].assign(reinterpret_cast(cpr->m_cur), l); - cpr->m_cur += l; - } else { - return true; - } - return false; + CUDF_EXPECTS(l < static_cast(cpr->m_end - cpr->m_cur), "string length mismatch"); + + this->val[i].assign(reinterpret_cast(cpr->m_cur), l); + cpr->m_cur += l; }; bind_read_func(read_value); } @@ -228,10 +219,10 @@ class parquet_field_enum : public parquet_field { public: parquet_field_enum(int f, Enum& v) : parquet_field(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) + inline void operator()(CompactProtocolReader* cpr, int field_type) { val = static_cast(cpr->get_i32()); - return (field_type != ST_FLD_I32); + CUDF_EXPECTS(field_type == ST_FLD_I32, "unexpected enum field type"); } }; @@ -247,7 +238,6 @@ struct parquet_field_enum_list : public parquet_field_list { { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { this->val[i] = static_cast(cpr->get_i32()); - return false; }; this->bind_read_func(read_value); } @@ -266,11 +256,10 @@ class parquet_field_struct : public parquet_field { public: parquet_field_struct(int f, T& v) : parquet_field(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) + inline void operator()(CompactProtocolReader* cpr, int field_type) { - if (field_type != ST_FLD_STRUCT) { return true; } + CUDF_EXPECTS(field_type == ST_FLD_STRUCT, "expected struct field"); cpr->read(&val); - return false; } }; @@ -290,15 +279,12 @@ class parquet_field_union_struct : public parquet_field { { } - inline bool operator()(CompactProtocolReader* cpr, int field_type) + inline void operator()(CompactProtocolReader* cpr, int field_type) { T v; - bool const res = parquet_field_struct(field(), v).operator()(cpr, field_type); - if (!res) { - val = v; - enum_val = static_cast(field()); - } - return res; + parquet_field_struct(field(), v).operator()(cpr, field_type); + val = v; + enum_val = static_cast(field()); } }; @@ -316,12 +302,11 @@ class parquet_field_union_enumerator : public parquet_field { public: parquet_field_union_enumerator(int f, E& v) : parquet_field(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) + inline void operator()(CompactProtocolReader* cpr, int field_type) { - if (field_type != ST_FLD_STRUCT) { return true; } + CUDF_EXPECTS(field_type == ST_FLD_STRUCT, "expected struct field"); cpr->skip_struct_field(field_type); val = static_cast(field()); - return false; } }; @@ -337,7 +322,6 @@ struct parquet_field_struct_list : public parquet_field_list { { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { cpr->read(&this->val[i]); - return false; }; this->bind_read_func(read_value); } @@ -355,18 +339,15 @@ class parquet_field_binary : public parquet_field { public: parquet_field_binary(int f, std::vector& v) : parquet_field(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) + inline void operator()(CompactProtocolReader* cpr, int field_type) { - if (field_type != ST_FLD_BINARY) { return true; } + CUDF_EXPECTS(field_type == ST_FLD_BINARY, "expected binary field"); auto const n = cpr->get_u32(); - if (n <= static_cast(cpr->m_end - cpr->m_cur)) { - val.resize(n); - val.assign(cpr->m_cur, cpr->m_cur + n); - cpr->m_cur += n; - return false; - } else { - return true; - } + CUDF_EXPECTS(n <= static_cast(cpr->m_end - cpr->m_cur), "binary length mismatch"); + + val.resize(n); + val.assign(cpr->m_cur, cpr->m_cur + n); + cpr->m_cur += n; } }; @@ -382,14 +363,11 @@ struct parquet_field_binary_list : public parquet_field_listget_u32(); - if (l <= static_cast(cpr->m_end - cpr->m_cur)) { - val[i].resize(l); - val[i].assign(cpr->m_cur, cpr->m_cur + l); - cpr->m_cur += l; - } else { - return true; - } - return false; + CUDF_EXPECTS(l <= static_cast(cpr->m_end - cpr->m_cur), "binary length mismatch"); + + val[i].resize(l); + val[i].assign(cpr->m_cur, cpr->m_cur + l); + cpr->m_cur += l; }; bind_read_func(read_value); } @@ -405,13 +383,12 @@ class parquet_field_struct_blob : public parquet_field { public: parquet_field_struct_blob(int f, std::vector& v) : parquet_field(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) + inline void operator()(CompactProtocolReader* cpr, int field_type) { - if (field_type != ST_FLD_STRUCT) { return true; } + CUDF_EXPECTS(field_type == ST_FLD_STRUCT, "expected struct type"); uint8_t const* const start = cpr->m_cur; cpr->skip_struct_field(field_type); if (cpr->m_cur > start) { val.assign(start, cpr->m_cur - 1); } - return false; } }; @@ -425,12 +402,11 @@ class parquet_field_optional : public parquet_field { public: parquet_field_optional(int f, thrust::optional& v) : parquet_field(f), val(v) {} - inline bool operator()(CompactProtocolReader* cpr, int field_type) + inline void operator()(CompactProtocolReader* cpr, int field_type) { T v; - bool const res = FieldFunctor(field(), v).operator()(cpr, field_type); - if (!res) { val = v; } - return res; + FieldFunctor(field(), v).operator()(cpr, field_type); + val = v; } }; From 572534a82cc6a3b656a8115d93aa715a51e68b80 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 5 Dec 2023 17:55:45 -0800 Subject: [PATCH 04/12] style --- cpp/tests/io/parquet_test.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 3774193096e..f0b8d3f21ad 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -345,12 +345,10 @@ struct ParquetWriterSchemaTest : public ParquetWriterTest { }; template -struct ParquetReaderSourceTest : public ParquetReaderTest { -}; +struct ParquetReaderSourceTest : public ParquetReaderTest {}; template -struct ParquetWriterDeltaTest : public ParquetWriterTest { -}; +struct ParquetWriterDeltaTest : public ParquetWriterTest {}; // Declare typed test cases // TODO: Replace with `NumericTypes` when unsigned support is added. Issue #5352 @@ -6080,8 +6078,7 @@ TEST_F(ParquetReaderTest, FilterNamedExpression) // Test for Types - numeric, chrono, string. template -struct ParquetReaderPredicatePushdownTest : public ParquetReaderTest { -}; +struct ParquetReaderPredicatePushdownTest : public ParquetReaderTest {}; // These chrono types are not supported because parquet writer does not have a type to represent // them. From ffa31893d81304cd51aa605c666382a32788f8b3 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 6 Dec 2023 13:49:11 -0800 Subject: [PATCH 05/12] fail early --- cpp/src/io/parquet/compact_protocol_reader.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 3404560494e..cafdc025d74 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -132,12 +132,12 @@ class parquet_field_int : public parquet_field { inline void operator()(CompactProtocolReader* cpr, int field_type) { + CUDF_EXPECTS(field_type == EXPECTED_TYPE, "unexpected int field type"); if constexpr (is_byte) { val = cpr->getb(); } else { val = cpr->get_zigzag(); } - CUDF_EXPECTS(field_type == EXPECTED_TYPE, "unexpected int field type"); } }; @@ -221,8 +221,8 @@ class parquet_field_enum : public parquet_field { parquet_field_enum(int f, Enum& v) : parquet_field(f), val(v) {} inline void operator()(CompactProtocolReader* cpr, int field_type) { - val = static_cast(cpr->get_i32()); CUDF_EXPECTS(field_type == ST_FLD_I32, "unexpected enum field type"); + val = static_cast(cpr->get_i32()); } }; From bc50e6cffff2be6902d78726c43f2c8d266ff75a Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 6 Dec 2023 14:46:53 -0800 Subject: [PATCH 06/12] finish merge --- cpp/src/io/parquet/compact_protocol_reader.cpp | 2 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 7b3443e5005..8392548b09a 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -686,7 +686,7 @@ void CompactProtocolReader::read(OffsetIndex* o) function_builder(this, op); } -bool CompactProtocolReader::read(SizeStatistics* s) +void CompactProtocolReader::read(SizeStatistics* s) { using optional_i64 = parquet_field_optional; using optional_list_i64 = parquet_field_optional, parquet_field_int64_list>; diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 80f932f15bf..ef51f373b24 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -275,7 +275,7 @@ metadata::metadata(datasource* source) source->host_read(col.column_index_offset, col.column_index_length); cp.init(col_idx_buf->data(), col_idx_buf->size()); ColumnIndex ci; - CUDF_EXPECTS(cp.read(&ci), "Cannot parse column index"); + cp.read(&ci); col.column_index = std::move(ci); } if (col.offset_index_length > 0 && col.offset_index_offset > 0) { @@ -283,7 +283,7 @@ metadata::metadata(datasource* source) source->host_read(col.offset_index_offset, col.offset_index_length); cp.init(off_idx_buf->data(), off_idx_buf->size()); OffsetIndex oi; - CUDF_EXPECTS(cp.read(&oi), "Cannot parse offset index"); + cp.read(&oi); col.offset_index = std::move(oi); } } From fa724005f3089472feb8230669c571122abd4f7e Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 12 Dec 2023 16:07:49 -0800 Subject: [PATCH 07/12] print actual type in parquet_field_int --- cpp/src/io/parquet/compact_protocol_reader.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 8392548b09a..8601c05d220 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -127,12 +127,24 @@ class parquet_field_int : public parquet_field { T& val; + std::string type_string() + { + if constexpr (EXPECTED_TYPE == ST_FLD_BYTE) { + return "byte"; + } else if constexpr (EXPECTED_TYPE == ST_FLD_I32) { + return "int32"; + } else if constexpr (EXPECTED_TYPE == ST_FLD_I64) { + return "int64"; + } + return "unknown(EXPECTED_TYPE=" + std::to_string(EXPECTED_TYPE) + ")"; + } + public: parquet_field_int(int f, T& v) : parquet_field(f), val(v) {} inline void operator()(CompactProtocolReader* cpr, int field_type) { - CUDF_EXPECTS(field_type == EXPECTED_TYPE, "unexpected int field type"); + CUDF_EXPECTS(field_type != EXPECTED_TYPE, "expected " + type_string() + " field"); if constexpr (is_byte) { val = cpr->getb(); } else { From dfbc03d1171dbd8bef9fc7b833f6f504e9bc0450 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 12 Dec 2023 18:34:09 -0800 Subject: [PATCH 08/12] unbreak my heart --- cpp/src/io/parquet/compact_protocol_reader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 8601c05d220..d73c5094e4a 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -144,7 +144,7 @@ class parquet_field_int : public parquet_field { inline void operator()(CompactProtocolReader* cpr, int field_type) { - CUDF_EXPECTS(field_type != EXPECTED_TYPE, "expected " + type_string() + " field"); + CUDF_EXPECTS(field_type == EXPECTED_TYPE, "expected " + type_string() + " field"); if constexpr (is_byte) { val = cpr->getb(); } else { From d51530b92ffe77d2776898a60374dfbaec8e60fa Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 13 Dec 2023 12:17:32 -0800 Subject: [PATCH 09/12] centralize field type checking --- .../io/parquet/compact_protocol_reader.cpp | 57 ++++++++++++------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index d73c5094e4a..d7497e4bcf5 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -42,6 +42,31 @@ class parquet_field { int field() const { return _field_val; } }; +std::string field_type_string(FieldType type) +{ + switch (type) { + case ST_FLD_TRUE: return "bool(true)"; + case ST_FLD_FALSE: return "bool(false)"; + case ST_FLD_BYTE: return "int8"; + case ST_FLD_I16: return "int16"; + case ST_FLD_I32: return "int32"; + case ST_FLD_I64: return "int64"; + case ST_FLD_DOUBLE: return "double"; + case ST_FLD_BINARY: return "binary"; + case ST_FLD_STRUCT: return "struct"; + case ST_FLD_LIST: return "list"; + case ST_FLD_SET: return "set"; + default: return "unknown(" + std::to_string(type) + ")"; + } +} + +void assert_field_type(int type, FieldType expected) +{ + CUDF_EXPECTS(type != expected, + "expected " + field_type_string(expected) + " field, got " + + field_type_string(static_cast(200)) + " field instead"); +} + /** * @brief Abstract base class for list functors. */ @@ -65,9 +90,9 @@ class parquet_field_list : public parquet_field { public: inline void operator()(CompactProtocolReader* cpr, int field_type) { - CUDF_EXPECTS(field_type == ST_FLD_LIST, "expected list field"); + assert_field_type(field_type, ST_FLD_LIST); auto const [t, n] = cpr->get_listh(); - CUDF_EXPECTS(t == _expected_type, "list type mismatch"); + assert_field_type(t, _expected_type); val.resize(n); for (uint32_t i = 0; i < n; i++) { _read_value(i, cpr); @@ -121,30 +146,18 @@ struct parquet_field_bool_list : public parquet_field_list { * * @return True if there is a type mismatch */ -template +template class parquet_field_int : public parquet_field { static constexpr bool is_byte = std::is_same_v; T& val; - std::string type_string() - { - if constexpr (EXPECTED_TYPE == ST_FLD_BYTE) { - return "byte"; - } else if constexpr (EXPECTED_TYPE == ST_FLD_I32) { - return "int32"; - } else if constexpr (EXPECTED_TYPE == ST_FLD_I64) { - return "int64"; - } - return "unknown(EXPECTED_TYPE=" + std::to_string(EXPECTED_TYPE) + ")"; - } - public: parquet_field_int(int f, T& v) : parquet_field(f), val(v) {} inline void operator()(CompactProtocolReader* cpr, int field_type) { - CUDF_EXPECTS(field_type == EXPECTED_TYPE, "expected " + type_string() + " field"); + assert_field_type(field_type, EXPECTED_TYPE); if constexpr (is_byte) { val = cpr->getb(); } else { @@ -190,7 +203,7 @@ class parquet_field_string : public parquet_field { inline void operator()(CompactProtocolReader* cpr, int field_type) { - CUDF_EXPECTS(field_type == ST_FLD_BINARY, "expected binary field"); + assert_field_type(field_type, ST_FLD_BINARY); auto const n = cpr->get_u32(); CUDF_EXPECTS(n < static_cast(cpr->m_end - cpr->m_cur), "string length mismatch"); @@ -233,7 +246,7 @@ class parquet_field_enum : public parquet_field { parquet_field_enum(int f, Enum& v) : parquet_field(f), val(v) {} inline void operator()(CompactProtocolReader* cpr, int field_type) { - CUDF_EXPECTS(field_type == ST_FLD_I32, "unexpected enum field type"); + assert_field_type(field_type, ST_FLD_I32); val = static_cast(cpr->get_i32()); } }; @@ -270,7 +283,7 @@ class parquet_field_struct : public parquet_field { inline void operator()(CompactProtocolReader* cpr, int field_type) { - CUDF_EXPECTS(field_type == ST_FLD_STRUCT, "expected struct field"); + assert_field_type(field_type, ST_FLD_STRUCT); cpr->read(&val); } }; @@ -316,7 +329,7 @@ class parquet_field_union_enumerator : public parquet_field { inline void operator()(CompactProtocolReader* cpr, int field_type) { - CUDF_EXPECTS(field_type == ST_FLD_STRUCT, "expected struct field"); + assert_field_type(field_type, ST_FLD_STRUCT); cpr->skip_struct_field(field_type); val = static_cast(field()); } @@ -353,7 +366,7 @@ class parquet_field_binary : public parquet_field { inline void operator()(CompactProtocolReader* cpr, int field_type) { - CUDF_EXPECTS(field_type == ST_FLD_BINARY, "expected binary field"); + assert_field_type(field_type, ST_FLD_BINARY); auto const n = cpr->get_u32(); CUDF_EXPECTS(n <= static_cast(cpr->m_end - cpr->m_cur), "binary length mismatch"); @@ -397,7 +410,7 @@ class parquet_field_struct_blob : public parquet_field { parquet_field_struct_blob(int f, std::vector& v) : parquet_field(f), val(v) {} inline void operator()(CompactProtocolReader* cpr, int field_type) { - CUDF_EXPECTS(field_type == ST_FLD_STRUCT, "expected struct type"); + assert_field_type(field_type, ST_FLD_STRUCT); uint8_t const* const start = cpr->m_cur; cpr->skip_struct_field(field_type); if (cpr->m_cur > start) { val.assign(start, cpr->m_cur - 1); } From a105f9a66c6253d436337d19d042367a975b717b Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 13 Dec 2023 12:18:02 -0800 Subject: [PATCH 10/12] undo another debug version --- cpp/src/io/parquet/compact_protocol_reader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index d7497e4bcf5..c9c9430fdc9 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -62,7 +62,7 @@ std::string field_type_string(FieldType type) void assert_field_type(int type, FieldType expected) { - CUDF_EXPECTS(type != expected, + CUDF_EXPECTS(type == expected, "expected " + field_type_string(expected) + " field, got " + field_type_string(static_cast(200)) + " field instead"); } From 485af3d9092cab5c972f0c6cbe0f1966a71aa7a6 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 13 Dec 2023 13:05:37 -0800 Subject: [PATCH 11/12] param to template param --- .../io/parquet/compact_protocol_reader.cpp | 36 ++++++++----------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index c9c9430fdc9..0b354f97890 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -70,11 +70,10 @@ void assert_field_type(int type, FieldType expected) /** * @brief Abstract base class for list functors. */ -template +template class parquet_field_list : public parquet_field { private: using read_func_type = std::function; - FieldType _expected_type; read_func_type _read_value; protected: @@ -82,17 +81,14 @@ class parquet_field_list : public parquet_field { void bind_read_func(read_func_type fn) { _read_value = fn; } - parquet_field_list(int f, std::vector& v, FieldType t) - : parquet_field(f), _expected_type(t), val(v) - { - } + parquet_field_list(int f, std::vector& v) : parquet_field(f), val(v) {} public: inline void operator()(CompactProtocolReader* cpr, int field_type) { assert_field_type(field_type, ST_FLD_LIST); auto const [t, n] = cpr->get_listh(); - assert_field_type(t, _expected_type); + assert_field_type(t, EXPECTED_ELEM_TYPE); val.resize(n); for (uint32_t i = 0; i < n; i++) { _read_value(i, cpr); @@ -126,8 +122,8 @@ class parquet_field_bool : public parquet_field { * @return True if field types mismatch or if the process of reading a * bool fails */ -struct parquet_field_bool_list : public parquet_field_list { - parquet_field_bool_list(int f, std::vector& v) : parquet_field_list(f, v, ST_FLD_TRUE) +struct parquet_field_bool_list : public parquet_field_list { + parquet_field_bool_list(int f, std::vector& v) : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { auto const current_byte = cpr->getb(); @@ -177,8 +173,8 @@ using parquet_field_int64 = parquet_field_int; * integer fails */ template -struct parquet_field_int_list : public parquet_field_list { - parquet_field_int_list(int f, std::vector& v) : parquet_field_list(f, v, EXPECTED_TYPE) +struct parquet_field_int_list : public parquet_field_list { + parquet_field_int_list(int f, std::vector& v) : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { this->val[i] = cpr->get_zigzag(); @@ -218,9 +214,8 @@ class parquet_field_string : public parquet_field { * @return True if field types mismatch or if the process of reading a * string fails */ -struct parquet_field_string_list : public parquet_field_list { - parquet_field_string_list(int f, std::vector& v) - : parquet_field_list(f, v, ST_FLD_BINARY) +struct parquet_field_string_list : public parquet_field_list { + parquet_field_string_list(int f, std::vector& v) : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { auto const l = cpr->get_u32(); @@ -258,8 +253,8 @@ class parquet_field_enum : public parquet_field { * enum fails */ template -struct parquet_field_enum_list : public parquet_field_list { - parquet_field_enum_list(int f, std::vector& v) : parquet_field_list(f, v, ST_FLD_I32) +struct parquet_field_enum_list : public parquet_field_list { + parquet_field_enum_list(int f, std::vector& v) : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { this->val[i] = static_cast(cpr->get_i32()); @@ -342,8 +337,8 @@ class parquet_field_union_enumerator : public parquet_field { * struct fails */ template -struct parquet_field_struct_list : public parquet_field_list { - parquet_field_struct_list(int f, std::vector& v) : parquet_field_list(f, v, ST_FLD_STRUCT) +struct parquet_field_struct_list : public parquet_field_list { + parquet_field_struct_list(int f, std::vector& v) : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { cpr->read(&this->val[i]); @@ -382,9 +377,8 @@ class parquet_field_binary : public parquet_field { * @return True if field types mismatch or if the process of reading a * binary fails */ -struct parquet_field_binary_list : public parquet_field_list> { - parquet_field_binary_list(int f, std::vector>& v) - : parquet_field_list(f, v, ST_FLD_BINARY) +struct parquet_field_binary_list : public parquet_field_list, ST_FLD_BINARY> { + parquet_field_binary_list(int f, std::vector>& v) : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { auto const l = cpr->get_u32(); From ec2ef6442d87357ab7875e9589bce350377029bb Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 14 Dec 2023 10:51:30 -0800 Subject: [PATCH 12/12] more debug mess reverted --- cpp/src/io/parquet/compact_protocol_reader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 7e96fdf739a..55848802f12 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -64,7 +64,7 @@ void assert_field_type(int type, FieldType expected) { CUDF_EXPECTS(type == expected, "expected " + field_type_string(expected) + " field, got " + - field_type_string(static_cast(200)) + " field instead"); + field_type_string(static_cast(type)) + " field instead"); } /**