From 433bdc32eeecf8fa2fcd900214a9005eb4a07992 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 9 Jan 2024 14:54:16 -0800 Subject: [PATCH] Convert `FieldType` to scoped enum (#14642) Switch to scoped enum (`enum class`); they are better because, well, values now have a scope. Another benefit in this case - values are now named consistently with compact protocol. De-duplicated some code, now that more static_casts are required and duplication stands out more. Authors: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) Approvers: - Nghia Truong (https://github.com/ttnghia) - MithunR (https://github.com/mythrocks) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14642 --- .../io/parquet/compact_protocol_reader.cpp | 110 ++++++++++-------- .../io/parquet/compact_protocol_writer.cpp | 43 +++---- .../io/parquet/compact_protocol_writer.hpp | 11 +- cpp/src/io/parquet/page_enc.cu | 49 ++++---- cpp/src/io/parquet/page_hdr.cu | 34 +++--- cpp/src/io/parquet/parquet_common.hpp | 29 ++--- 6 files changed, 153 insertions(+), 123 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 55848802f12..d39d832c18c 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -45,28 +45,37 @@ class parquet_field { std::string field_type_string(FieldType type) { switch (type) { - case ST_FLD_TRUE: return "bool(true)"; - case ST_FLD_FALSE: return "bool(false)"; - case ST_FLD_BYTE: return "int8"; - case ST_FLD_I16: return "int16"; - case ST_FLD_I32: return "int32"; - case ST_FLD_I64: return "int64"; - case ST_FLD_DOUBLE: return "double"; - case ST_FLD_BINARY: return "binary"; - case ST_FLD_STRUCT: return "struct"; - case ST_FLD_LIST: return "list"; - case ST_FLD_SET: return "set"; - default: return "unknown(" + std::to_string(type) + ")"; + case FieldType::BOOLEAN_TRUE: return "bool(true)"; + case FieldType::BOOLEAN_FALSE: return "bool(false)"; + case FieldType::I8: return "int8"; + case FieldType::I16: return "int16"; + case FieldType::I32: return "int32"; + case FieldType::I64: return "int64"; + case FieldType::DOUBLE: return "double"; + case FieldType::BINARY: return "binary"; + case FieldType::LIST: return "list"; + case FieldType::SET: return "set"; + case FieldType::MAP: return "map"; + case FieldType::STRUCT: return "struct"; + case FieldType::UUID: return "UUID"; + default: return "unknown(" + std::to_string(static_cast(type)) + ")"; } } void assert_field_type(int type, FieldType expected) { - CUDF_EXPECTS(type == expected, + CUDF_EXPECTS(type == static_cast(expected), "expected " + field_type_string(expected) + " field, got " + field_type_string(static_cast(type)) + " field instead"); } +void assert_bool_field_type(int type) +{ + auto const field_type = static_cast(type); + CUDF_EXPECTS(field_type == FieldType::BOOLEAN_TRUE || field_type == FieldType::BOOLEAN_FALSE, + "expected bool field, got " + field_type_string(field_type) + " field instead"); +} + /** * @brief Abstract base class for list functors. */ @@ -86,7 +95,7 @@ class parquet_field_list : public parquet_field { public: inline void operator()(CompactProtocolReader* cpr, int field_type) { - assert_field_type(field_type, ST_FLD_LIST); + assert_field_type(field_type, FieldType::LIST); auto const [t, n] = cpr->get_listh(); assert_field_type(t, EXPECTED_ELEM_TYPE); val.resize(n); @@ -111,8 +120,8 @@ class parquet_field_bool : public parquet_field { inline void operator()(CompactProtocolReader* cpr, int field_type) { - CUDF_EXPECTS(field_type == ST_FLD_TRUE || field_type == ST_FLD_FALSE, "expected bool field"); - val = field_type == ST_FLD_TRUE; + assert_bool_field_type(field_type); + val = field_type == static_cast(FieldType::BOOLEAN_TRUE); } }; @@ -122,14 +131,13 @@ class parquet_field_bool : public parquet_field { * @return True if field types mismatch or if the process of reading a * bool fails */ -struct parquet_field_bool_list : public parquet_field_list { +struct parquet_field_bool_list : public parquet_field_list { parquet_field_bool_list(int f, std::vector& v) : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { auto const current_byte = cpr->getb(); - CUDF_EXPECTS(current_byte == ST_FLD_TRUE || current_byte == ST_FLD_FALSE, - "expected bool field"); - this->val[i] = current_byte == ST_FLD_TRUE; + assert_bool_field_type(current_byte); + this->val[i] = current_byte == static_cast(FieldType::BOOLEAN_TRUE); }; bind_read_func(read_value); } @@ -162,9 +170,9 @@ class parquet_field_int : public parquet_field { } }; -using parquet_field_int8 = parquet_field_int; -using parquet_field_int32 = parquet_field_int; -using parquet_field_int64 = parquet_field_int; +using parquet_field_int8 = parquet_field_int; +using parquet_field_int32 = parquet_field_int; +using parquet_field_int64 = parquet_field_int; /** * @brief Functor to read a vector of integers from CompactProtocolReader @@ -183,7 +191,7 @@ struct parquet_field_int_list : public parquet_field_list { } }; -using parquet_field_int64_list = parquet_field_int_list; +using parquet_field_int64_list = parquet_field_int_list; /** * @brief Functor to read a string from CompactProtocolReader @@ -199,7 +207,7 @@ class parquet_field_string : public parquet_field { inline void operator()(CompactProtocolReader* cpr, int field_type) { - assert_field_type(field_type, ST_FLD_BINARY); + assert_field_type(field_type, FieldType::BINARY); auto const n = cpr->get_u32(); CUDF_EXPECTS(n < static_cast(cpr->m_end - cpr->m_cur), "string length mismatch"); @@ -214,7 +222,7 @@ class parquet_field_string : public parquet_field { * @return True if field types mismatch or if the process of reading a * string fails */ -struct parquet_field_string_list : public parquet_field_list { +struct parquet_field_string_list : public parquet_field_list { parquet_field_string_list(int f, std::vector& v) : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { @@ -241,7 +249,7 @@ class parquet_field_enum : public parquet_field { parquet_field_enum(int f, Enum& v) : parquet_field(f), val(v) {} inline void operator()(CompactProtocolReader* cpr, int field_type) { - assert_field_type(field_type, ST_FLD_I32); + assert_field_type(field_type, FieldType::I32); val = static_cast(cpr->get_i32()); } }; @@ -253,8 +261,9 @@ class parquet_field_enum : public parquet_field { * enum fails */ template -struct parquet_field_enum_list : public parquet_field_list { - parquet_field_enum_list(int f, std::vector& v) : parquet_field_list(f, v) +struct parquet_field_enum_list : public parquet_field_list { + parquet_field_enum_list(int f, std::vector& v) + : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { this->val[i] = static_cast(cpr->get_i32()); @@ -278,7 +287,7 @@ class parquet_field_struct : public parquet_field { inline void operator()(CompactProtocolReader* cpr, int field_type) { - assert_field_type(field_type, ST_FLD_STRUCT); + assert_field_type(field_type, FieldType::STRUCT); cpr->read(&val); } }; @@ -324,7 +333,7 @@ class parquet_field_union_enumerator : public parquet_field { inline void operator()(CompactProtocolReader* cpr, int field_type) { - assert_field_type(field_type, ST_FLD_STRUCT); + assert_field_type(field_type, FieldType::STRUCT); cpr->skip_struct_field(field_type); val = static_cast(field()); } @@ -337,8 +346,9 @@ class parquet_field_union_enumerator : public parquet_field { * struct fails */ template -struct parquet_field_struct_list : public parquet_field_list { - parquet_field_struct_list(int f, std::vector& v) : parquet_field_list(f, v) +struct parquet_field_struct_list : public parquet_field_list { + parquet_field_struct_list(int f, std::vector& v) + : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { cpr->read(&this->val[i]); @@ -361,7 +371,7 @@ class parquet_field_binary : public parquet_field { inline void operator()(CompactProtocolReader* cpr, int field_type) { - assert_field_type(field_type, ST_FLD_BINARY); + assert_field_type(field_type, FieldType::BINARY); auto const n = cpr->get_u32(); CUDF_EXPECTS(n <= static_cast(cpr->m_end - cpr->m_cur), "binary length mismatch"); @@ -377,7 +387,8 @@ class parquet_field_binary : public parquet_field { * @return True if field types mismatch or if the process of reading a * binary fails */ -struct parquet_field_binary_list : public parquet_field_list, ST_FLD_BINARY> { +struct parquet_field_binary_list + : public parquet_field_list, FieldType::BINARY> { parquet_field_binary_list(int f, std::vector>& v) : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { @@ -404,7 +415,7 @@ class parquet_field_struct_blob : public parquet_field { parquet_field_struct_blob(int f, std::vector& v) : parquet_field(f), val(v) {} inline void operator()(CompactProtocolReader* cpr, int field_type) { - assert_field_type(field_type, ST_FLD_STRUCT); + assert_field_type(field_type, FieldType::STRUCT); uint8_t const* const start = cpr->m_cur; cpr->skip_struct_field(field_type); if (cpr->m_cur > start) { val.assign(start, cpr->m_cur - 1); } @@ -439,24 +450,25 @@ class parquet_field_optional : public parquet_field { */ void CompactProtocolReader::skip_struct_field(int t, int depth) { - switch (t) { - case ST_FLD_TRUE: - case ST_FLD_FALSE: break; - case ST_FLD_I16: - case ST_FLD_I32: - case ST_FLD_I64: get_u64(); break; - case ST_FLD_BYTE: skip_bytes(1); break; - case ST_FLD_DOUBLE: skip_bytes(8); break; - case ST_FLD_BINARY: skip_bytes(get_u32()); break; - case ST_FLD_LIST: [[fallthrough]]; - case ST_FLD_SET: { + auto const t_enum = static_cast(t); + switch (t_enum) { + case FieldType::BOOLEAN_TRUE: + case FieldType::BOOLEAN_FALSE: break; + case FieldType::I16: + case FieldType::I32: + case FieldType::I64: get_u64(); break; + case FieldType::I8: skip_bytes(1); break; + case FieldType::DOUBLE: skip_bytes(8); break; + case FieldType::BINARY: skip_bytes(get_u32()); break; + case FieldType::LIST: + case FieldType::SET: { auto const [t, n] = get_listh(); CUDF_EXPECTS(depth <= 10, "struct nesting too deep"); for (uint32_t i = 0; i < n; i++) { skip_struct_field(t, depth + 1); } } break; - case ST_FLD_STRUCT: + case FieldType::STRUCT: for (;;) { int const c = getb(); t = c & 0xf; diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index f857b75f707..d610ec6c546 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -268,39 +268,40 @@ uint32_t CompactProtocolFieldWriter::put_int(int64_t v) return put_uint(((v ^ -s) << 1) + s); } -void CompactProtocolFieldWriter::put_field_header(int f, int cur, int t) +void CompactProtocolFieldWriter::put_field_header(int f, int cur, FieldType t) { if (f > cur && f <= cur + 15) - put_byte(((f - cur) << 4) | t); + put_packed_type_byte(f - cur, t); else { - put_byte(t); + put_byte(static_cast(t)); put_int(f); } } inline void CompactProtocolFieldWriter::field_bool(int field, bool b) { - put_field_header(field, current_field_value, b ? ST_FLD_TRUE : ST_FLD_FALSE); + put_field_header( + field, current_field_value, b ? FieldType::BOOLEAN_TRUE : FieldType::BOOLEAN_FALSE); current_field_value = field; } inline void CompactProtocolFieldWriter::field_int8(int field, int8_t val) { - put_field_header(field, current_field_value, ST_FLD_BYTE); + put_field_header(field, current_field_value, FieldType::I8); put_byte(val); current_field_value = field; } inline void CompactProtocolFieldWriter::field_int(int field, int32_t val) { - put_field_header(field, current_field_value, ST_FLD_I32); + put_field_header(field, current_field_value, FieldType::I32); put_int(val); current_field_value = field; } inline void CompactProtocolFieldWriter::field_int(int field, int64_t val) { - put_field_header(field, current_field_value, ST_FLD_I64); + put_field_header(field, current_field_value, FieldType::I64); put_int(val); current_field_value = field; } @@ -309,8 +310,8 @@ template <> inline void CompactProtocolFieldWriter::field_int_list(int field, std::vector const& val) { - put_field_header(field, current_field_value, ST_FLD_LIST); - put_byte(static_cast((std::min(val.size(), 0xfUL) << 4) | ST_FLD_I64)); + put_field_header(field, current_field_value, FieldType::LIST); + put_packed_type_byte(val.size(), FieldType::I64); if (val.size() >= 0xfUL) { put_uint(val.size()); } for (auto const v : val) { put_int(v); @@ -321,8 +322,8 @@ inline void CompactProtocolFieldWriter::field_int_list(int field, template inline void CompactProtocolFieldWriter::field_int_list(int field, std::vector const& val) { - put_field_header(field, current_field_value, ST_FLD_LIST); - put_byte(static_cast((std::min(val.size(), 0xfUL) << 4) | ST_FLD_I32)); + put_field_header(field, current_field_value, FieldType::LIST); + put_packed_type_byte(val.size(), FieldType::I32); if (val.size() >= 0xfUL) { put_uint(val.size()); } for (auto const& v : val) { put_int(static_cast(v)); @@ -333,7 +334,7 @@ inline void CompactProtocolFieldWriter::field_int_list(int field, std::vector inline void CompactProtocolFieldWriter::field_struct(int field, T const& val) { - put_field_header(field, current_field_value, ST_FLD_STRUCT); + put_field_header(field, current_field_value, FieldType::STRUCT); if constexpr (not std::is_empty_v) { writer.write(val); // write the struct if it's not empty } else { @@ -344,7 +345,7 @@ inline void CompactProtocolFieldWriter::field_struct(int field, T const& val) inline void CompactProtocolFieldWriter::field_empty_struct(int field) { - put_field_header(field, current_field_value, ST_FLD_STRUCT); + put_field_header(field, current_field_value, FieldType::STRUCT); put_byte(0); // add a stop field current_field_value = field; } @@ -352,8 +353,8 @@ inline void CompactProtocolFieldWriter::field_empty_struct(int field) template inline void CompactProtocolFieldWriter::field_struct_list(int field, std::vector const& val) { - put_field_header(field, current_field_value, ST_FLD_LIST); - put_byte((uint8_t)((std::min(val.size(), (size_t)0xfu) << 4) | ST_FLD_STRUCT)); + put_field_header(field, current_field_value, FieldType::LIST); + put_packed_type_byte(val.size(), FieldType::STRUCT); if (val.size() >= 0xf) put_uint(val.size()); for (auto& v : val) { writer.write(v); @@ -370,7 +371,7 @@ inline size_t CompactProtocolFieldWriter::value() inline void CompactProtocolFieldWriter::field_struct_blob(int field, std::vector const& val) { - put_field_header(field, current_field_value, ST_FLD_STRUCT); + put_field_header(field, current_field_value, FieldType::STRUCT); put_byte(val.data(), static_cast(val.size())); put_byte(0); current_field_value = field; @@ -378,7 +379,7 @@ inline void CompactProtocolFieldWriter::field_struct_blob(int field, inline void CompactProtocolFieldWriter::field_binary(int field, std::vector const& val) { - put_field_header(field, current_field_value, ST_FLD_BINARY); + put_field_header(field, current_field_value, FieldType::BINARY); put_uint(val.size()); put_byte(val.data(), static_cast(val.size())); current_field_value = field; @@ -386,7 +387,7 @@ inline void CompactProtocolFieldWriter::field_binary(int field, std::vector(val.data()), static_cast(val.size())); @@ -396,8 +397,8 @@ inline void CompactProtocolFieldWriter::field_string(int field, std::string cons inline void CompactProtocolFieldWriter::field_string_list(int field, std::vector const& val) { - put_field_header(field, current_field_value, ST_FLD_LIST); - put_byte((uint8_t)((std::min(val.size(), (size_t)0xfu) << 4) | ST_FLD_BINARY)); + put_field_header(field, current_field_value, FieldType::LIST); + put_packed_type_byte(val.size(), FieldType::BINARY); if (val.size() >= 0xf) put_uint(val.size()); for (auto& v : val) { put_uint(v.size()); diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index a2ed0f1f4dc..2ed7c078f8b 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -78,7 +78,14 @@ class CompactProtocolFieldWriter { uint32_t put_int(int64_t v); - void put_field_header(int f, int cur, int t); + template + void put_packed_type_byte(T high_bits, FieldType t) + { + uint8_t const clamped_high_bits = std::min(std::max(high_bits, T{0}), T{0xf}); + put_byte((clamped_high_bits << 4) | static_cast(t)); + } + + void put_field_header(int f, int cur, FieldType t); inline void field_bool(int field, bool b); diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 8e1c0682ffd..e16551024d1 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -2207,13 +2207,14 @@ inline __device__ uint8_t* cpw_put_int64(uint8_t* p, int64_t v) return cpw_put_uint64(p, (v ^ -s) * 2 + s); } -inline __device__ uint8_t* cpw_put_fldh(uint8_t* p, int f, int cur, int t) +inline __device__ uint8_t* cpw_put_fldh(uint8_t* p, int f, int cur, FieldType t) { + auto const t_num = static_cast(t); if (f > cur && f <= cur + 15) { - *p++ = ((f - cur) << 4) | t; + *p++ = ((f - cur) << 4) | t_num; return p; } else { - *p++ = t; + *p++ = t_num; return cpw_put_int32(p, f); } } @@ -2231,7 +2232,7 @@ class header_encoder { inline __device__ void field_struct_begin(int field) { current_header_ptr = - cpw_put_fldh(current_header_ptr, field, current_field_index, ST_FLD_STRUCT); + cpw_put_fldh(current_header_ptr, field, current_field_index, FieldType::STRUCT); current_field_index = 0; } @@ -2241,11 +2242,13 @@ class header_encoder { current_field_index = field; } - inline __device__ void field_list_begin(int field, size_t len, int type) + inline __device__ void field_list_begin(int field, size_t len, FieldType type) { - current_header_ptr = cpw_put_fldh(current_header_ptr, field, current_field_index, ST_FLD_LIST); + current_header_ptr = + cpw_put_fldh(current_header_ptr, field, current_field_index, FieldType::LIST); + auto const t_num = static_cast(type); current_header_ptr = cpw_put_uint8( - current_header_ptr, static_cast((std::min(len, size_t{0xfu}) << 4) | type)); + current_header_ptr, static_cast((std::min(len, size_t{0xfu}) << 4) | t_num)); if (len >= 0xf) { current_header_ptr = cpw_put_uint32(current_header_ptr, len); } current_field_index = 0; } @@ -2254,7 +2257,9 @@ class header_encoder { inline __device__ void put_bool(bool value) { - current_header_ptr = cpw_put_uint8(current_header_ptr, value ? ST_FLD_TRUE : ST_FLD_FALSE); + auto const type_byte = + static_cast(value ? FieldType::BOOLEAN_TRUE : FieldType::BOOLEAN_FALSE); + current_header_ptr = cpw_put_uint8(current_header_ptr, type_byte); } inline __device__ void put_binary(void const* value, uint32_t length) @@ -2272,15 +2277,18 @@ class header_encoder { inline __device__ void field_bool(int field, bool value) { - current_header_ptr = cpw_put_fldh( - current_header_ptr, field, current_field_index, value ? ST_FLD_TRUE : ST_FLD_FALSE); + current_header_ptr = cpw_put_fldh(current_header_ptr, + field, + current_field_index, + value ? FieldType::BOOLEAN_TRUE : FieldType::BOOLEAN_FALSE); current_field_index = field; } template inline __device__ void field_int32(int field, T value) { - current_header_ptr = cpw_put_fldh(current_header_ptr, field, current_field_index, ST_FLD_I32); + current_header_ptr = + cpw_put_fldh(current_header_ptr, field, current_field_index, FieldType::I32); current_header_ptr = cpw_put_int32(current_header_ptr, static_cast(value)); current_field_index = field; } @@ -2288,7 +2296,8 @@ class header_encoder { template inline __device__ void field_int64(int field, T value) { - current_header_ptr = cpw_put_fldh(current_header_ptr, field, current_field_index, ST_FLD_I64); + current_header_ptr = + cpw_put_fldh(current_header_ptr, field, current_field_index, FieldType::I64); current_header_ptr = cpw_put_int64(current_header_ptr, static_cast(value)); current_field_index = field; } @@ -2296,7 +2305,7 @@ class header_encoder { inline __device__ void field_binary(int field, void const* value, uint32_t length) { current_header_ptr = - cpw_put_fldh(current_header_ptr, field, current_field_index, ST_FLD_BINARY); + cpw_put_fldh(current_header_ptr, field, current_field_index, FieldType::BINARY); current_header_ptr = cpw_put_uint32(current_header_ptr, length); memcpy(current_header_ptr, value, length); current_header_ptr += length; @@ -2868,13 +2877,13 @@ __global__ void __launch_bounds__(1) : align8(ck_g->column_index_blob + ck_g->column_index_size - column_index_truncate_length); // null_pages - encoder.field_list_begin(1, num_data_pages, ST_FLD_TRUE); + encoder.field_list_begin(1, num_data_pages, FieldType::BOOLEAN_TRUE); for (uint32_t page = first_data_page; page < num_pages; page++) { encoder.put_bool(column_stats[pageidx + page].non_nulls == 0); } encoder.field_list_end(1); // min_values - encoder.field_list_begin(2, num_data_pages, ST_FLD_BINARY); + encoder.field_list_begin(2, num_data_pages, FieldType::BINARY); for (uint32_t page = first_data_page; page < num_pages; page++) { auto const [min_ptr, min_size] = get_extremum(&column_stats[pageidx + page].min_value, col_g.stats_dtype, @@ -2885,7 +2894,7 @@ __global__ void __launch_bounds__(1) } encoder.field_list_end(2); // max_values - encoder.field_list_begin(3, num_data_pages, ST_FLD_BINARY); + encoder.field_list_begin(3, num_data_pages, FieldType::BINARY); for (uint32_t page = first_data_page; page < num_pages; page++) { auto const [max_ptr, max_size] = get_extremum(&column_stats[pageidx + page].max_value, col_g.stats_dtype, @@ -2902,7 +2911,7 @@ __global__ void __launch_bounds__(1) col_g.converted_type, num_pages - first_data_page)); // null_counts - encoder.field_list_begin(5, num_data_pages, ST_FLD_I64); + encoder.field_list_begin(5, num_data_pages, FieldType::I64); for (uint32_t page = first_data_page; page < num_pages; page++) { encoder.put_int64(column_stats[pageidx + page].null_count); } @@ -2918,7 +2927,7 @@ __global__ void __launch_bounds__(1) // optionally encode histograms and sum var_bytes. if (cd->max_rep_level > REP_LVL_HIST_CUTOFF) { - encoder.field_list_begin(6, num_data_pages * (cd->max_rep_level + 1), ST_FLD_I64); + encoder.field_list_begin(6, num_data_pages * (cd->max_rep_level + 1), FieldType::I64); thrust::for_each(thrust::seq, page_start, page_end, [&] __device__(auto const& page) { for (int i = 0; i < cd->max_rep_level + 1; i++) { encoder.put_int64(page.rep_histogram[i]); @@ -2929,7 +2938,7 @@ __global__ void __launch_bounds__(1) } if (cd->max_def_level > DEF_LVL_HIST_CUTOFF) { - encoder.field_list_begin(7, num_data_pages * (cd->max_def_level + 1), ST_FLD_I64); + encoder.field_list_begin(7, num_data_pages * (cd->max_def_level + 1), FieldType::I64); thrust::for_each(thrust::seq, page_start, page_end, [&] __device__(auto const& page) { for (int i = 0; i < cd->max_def_level + 1; i++) { encoder.put_int64(page.def_histogram[i]); diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 36157f725e3..cc3f584422d 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -114,28 +114,28 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) field_type = c & 0xf; if (!(c & 0xf0)) get_i32(bs); } - switch (field_type) { - case ST_FLD_TRUE: - case ST_FLD_FALSE: break; - case ST_FLD_I16: - case ST_FLD_I32: - case ST_FLD_I64: get_u32(bs); break; - case ST_FLD_BYTE: skip_bytes(bs, 1); break; - case ST_FLD_DOUBLE: skip_bytes(bs, 8); break; - case ST_FLD_BINARY: skip_bytes(bs, get_u32(bs)); break; - case ST_FLD_LIST: - case ST_FLD_SET: { // NOTE: skipping a list of lists is not handled + switch (static_cast(field_type)) { + case FieldType::BOOLEAN_TRUE: + case FieldType::BOOLEAN_FALSE: break; + case FieldType::I16: + case FieldType::I32: + case FieldType::I64: get_u32(bs); break; + case FieldType::I8: skip_bytes(bs, 1); break; + case FieldType::DOUBLE: skip_bytes(bs, 8); break; + case FieldType::BINARY: skip_bytes(bs, get_u32(bs)); break; + case FieldType::LIST: + case FieldType::SET: { // NOTE: skipping a list of lists is not handled auto const c = getb(bs); int n = c >> 4; if (n == 0xf) { n = get_u32(bs); } field_type = c & 0xf; - if (field_type == ST_FLD_STRUCT) { + if (static_cast(field_type) == FieldType::STRUCT) { struct_depth += n; } else { rep_cnt = n; } } break; - case ST_FLD_STRUCT: struct_depth++; break; + case FieldType::STRUCT: struct_depth++; break; } } while (rep_cnt || struct_depth); } @@ -180,7 +180,7 @@ struct ParquetFieldInt32 { inline __device__ bool operator()(byte_stream_s* bs, int field_type) { val = get_i32(bs); - return (field_type != ST_FLD_I32); + return (static_cast(field_type) != FieldType::I32); } }; @@ -199,7 +199,7 @@ struct ParquetFieldEnum { inline __device__ bool operator()(byte_stream_s* bs, int field_type) { val = static_cast(get_i32(bs)); - return (field_type != ST_FLD_I32); + return (static_cast(field_type) != FieldType::I32); } }; @@ -218,7 +218,7 @@ struct ParquetFieldStruct { inline __device__ bool operator()(byte_stream_s* bs, int field_type) { - return ((field_type != ST_FLD_STRUCT) || !op(bs)); + return ((static_cast(field_type) != FieldType::STRUCT) || !op(bs)); } }; diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 50736197eb9..a680e44f360 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -140,19 +140,20 @@ enum BoundaryOrder { /** * @brief Thrift compact protocol struct field types */ -enum FieldType { - ST_FLD_TRUE = 1, - ST_FLD_FALSE = 2, - ST_FLD_BYTE = 3, - ST_FLD_I16 = 4, - ST_FLD_I32 = 5, - ST_FLD_I64 = 6, - ST_FLD_DOUBLE = 7, - ST_FLD_BINARY = 8, - ST_FLD_LIST = 9, - ST_FLD_SET = 10, - ST_FLD_MAP = 11, - ST_FLD_STRUCT = 12, +enum class FieldType : uint8_t { + BOOLEAN_TRUE = 1, + BOOLEAN_FALSE = 2, + I8 = 3, + I16 = 4, + I32 = 5, + I64 = 6, + DOUBLE = 7, + BINARY = 8, + LIST = 9, + SET = 10, + MAP = 11, + STRUCT = 12, + UUID = 13, }; } // namespace cudf::io::parquet::detail