From ee3a931ac1a6048b06ef6cbec98993d4b527bfba Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 13 May 2021 14:55:09 -0700 Subject: [PATCH 1/3] use optional and variant --- cpp/include/cudf/io/orc_metadata.hpp | 106 +++++++-------------------- cpp/src/io/functions.cpp | 87 +++++----------------- cpp/src/io/orc/orc.cpp | 34 ++++----- cpp/src/io/orc/orc.h | 20 ++--- cpp/tests/io/orc_test.cpp | 54 ++++++-------- 5 files changed, 92 insertions(+), 209 deletions(-) diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index 906df3f1005..b2e0768b328 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -23,6 +23,8 @@ #include +#include +#include #include namespace cudf { @@ -61,20 +63,9 @@ struct raw_orc_statistics { raw_orc_statistics read_raw_orc_statistics(source_info const& src_info); /** - * @brief Enumerator for types of column statistics that can be included in `column_statistics`. - * - * The statistics type depends on the column data type. + * @brief Monostate type for the statistics variant. */ -enum class statistics_type { - NONE, - INT, - DOUBLE, - STRING, - BUCKET, - DECIMAL, - DATE, - BINARY, - TIMESTAMP, +struct no_statistics { }; /** @@ -84,13 +75,8 @@ enum class statistics_type { */ template struct minmax_statistics { - std::unique_ptr _minimum; - std::unique_ptr _maximum; - - auto has_minimum() const { return _minimum != nullptr; } - auto has_maximum() const { return _maximum != nullptr; } - auto minimum() const { return _minimum.get(); } - auto maximum() const { return _maximum.get(); } + std::optional minimum; + std::optional maximum; }; /** @@ -100,24 +86,19 @@ struct minmax_statistics { */ template struct sum_statistics { - std::unique_ptr _sum; - - auto has_sum() const { return _sum != nullptr; } - auto sum() const { return _sum.get(); } + std::optional sum; }; /** * @brief Statistics for integral columns. */ struct integer_statistics : minmax_statistics, sum_statistics { - static constexpr statistics_type type = statistics_type::INT; }; /** * @brief Statistics for floating point columns. */ struct double_statistics : minmax_statistics, sum_statistics { - static constexpr statistics_type type = statistics_type::DOUBLE; }; /** @@ -128,7 +109,6 @@ struct double_statistics : minmax_statistics, sum_statistics { * Note: According to ORC specs, the sum should be signed, but pyarrow uses unsigned value */ struct string_statistics : minmax_statistics, sum_statistics { - static constexpr statistics_type type = statistics_type::STRING; }; /** @@ -137,34 +117,26 @@ struct string_statistics : minmax_statistics, sum_statistics _count; - - auto count(size_t index) const { return &_count.at(index); } + std::vector count; }; /** * @brief Statistics for decimal columns. */ struct decimal_statistics : minmax_statistics, sum_statistics { - static constexpr statistics_type type = statistics_type::DECIMAL; }; /** * @brief Statistics for date(time) columns. */ -struct date_statistics : minmax_statistics { - static constexpr statistics_type type = statistics_type::DATE; -}; +using date_statistics = minmax_statistics; /** * @brief Statistics for binary columns. * * The `sum` is the total number of bytes across all elements. */ -struct binary_statistics : sum_statistics { - static constexpr statistics_type type = statistics_type::BINARY; -}; +using binary_statistics = sum_statistics; /** * @brief Statistics for timestamp columns. @@ -173,14 +145,8 @@ struct binary_statistics : sum_statistics { * the UNIX epoch. The `minimum_utc` and `maximum_utc` are the same values adjusted to UTC. */ struct timestamp_statistics : minmax_statistics { - static constexpr statistics_type type = statistics_type::TIMESTAMP; - std::unique_ptr _minimum_utc; - std::unique_ptr _maximum_utc; - - auto has_minimum_utc() const { return _minimum_utc != nullptr; } - auto has_maximum_utc() const { return _maximum_utc != nullptr; } - auto minimum_utc() const { return _minimum_utc.get(); } - auto maximum_utc() const { return _maximum_utc.get(); } + std::optional minimum_utc; + std::optional maximum_utc; }; namespace orc { @@ -196,40 +162,20 @@ struct column_statistics; * All columns can have the `number_of_values` statistics. Depending on the data type, a column can * have additional statistics, accessible through `type_specific_stats` accessor. */ -class column_statistics { - private: - std::unique_ptr _number_of_values; - statistics_type _type = statistics_type::NONE; - void* _type_specific_stats = nullptr; - - public: - column_statistics() = default; - column_statistics(cudf::io::orc::column_statistics&& other); - - column_statistics& operator=(column_statistics&&) noexcept; - column_statistics(column_statistics&&) noexcept; - - auto has_number_of_values() const { return _number_of_values != nullptr; } - auto number_of_values() const { return _number_of_values.get(); } - - auto type() const { return _type; } - - /** - * @brief Returns a non-owning pointer to the type-specific statistics of the given type. - * - * Returns null if the requested statistics type does not match the type of the currently held - * type-specific statistics. - * - * @tparam T the statistics type - */ - template - T const* type_specific_stats() const - { - if (T::type != _type) return nullptr; - return static_cast(_type_specific_stats); - } - - ~column_statistics(); +struct column_statistics { + std::optional number_of_values; + std::variant + type_specific_stats; + + column_statistics(cudf::io::orc::column_statistics&& detail_statistics); }; /** diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 1b92e863029..bf51012211c 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -237,76 +237,23 @@ raw_orc_statistics read_raw_orc_statistics(source_info const& src_info) column_statistics::column_statistics(cudf::io::orc::column_statistics&& cs) { - _number_of_values = std::move(cs.number_of_values); - if (cs.int_stats.get()) { - _type = statistics_type::INT; - _type_specific_stats = cs.int_stats.release(); - } else if (cs.double_stats.get()) { - _type = statistics_type::DOUBLE; - _type_specific_stats = cs.double_stats.release(); - } else if (cs.string_stats.get()) { - _type = statistics_type::STRING; - _type_specific_stats = cs.string_stats.release(); - } else if (cs.bucket_stats.get()) { - _type = statistics_type::BUCKET; - _type_specific_stats = cs.bucket_stats.release(); - } else if (cs.decimal_stats.get()) { - _type = statistics_type::DECIMAL; - _type_specific_stats = cs.decimal_stats.release(); - } else if (cs.date_stats.get()) { - _type = statistics_type::DATE; - _type_specific_stats = cs.date_stats.release(); - } else if (cs.binary_stats.get()) { - _type = statistics_type::BINARY; - _type_specific_stats = cs.binary_stats.release(); - } else if (cs.timestamp_stats.get()) { - _type = statistics_type::TIMESTAMP; - _type_specific_stats = cs.timestamp_stats.release(); - } -} - -column_statistics& column_statistics::operator=(column_statistics&& other) noexcept -{ - _number_of_values = std::move(other._number_of_values); - _type = other._type; - _type_specific_stats = other._type_specific_stats; - - other._type = statistics_type::NONE; - other._type_specific_stats = nullptr; - - return *this; -} - -column_statistics::column_statistics(column_statistics&& other) noexcept -{ - *this = std::move(other); -} - -column_statistics::~column_statistics() -{ - switch (_type) { - case statistics_type::NONE: // error state, but can't throw from a destructor. - break; - case statistics_type::INT: delete static_cast(_type_specific_stats); break; - case statistics_type::DOUBLE: - delete static_cast(_type_specific_stats); - break; - case statistics_type::STRING: - delete static_cast(_type_specific_stats); - break; - case statistics_type::BUCKET: - delete static_cast(_type_specific_stats); - break; - case statistics_type::DECIMAL: - delete static_cast(_type_specific_stats); - break; - case statistics_type::DATE: delete static_cast(_type_specific_stats); break; - case statistics_type::BINARY: - delete static_cast(_type_specific_stats); - break; - case statistics_type::TIMESTAMP: - delete static_cast(_type_specific_stats); - break; + number_of_values = cs.number_of_values; + if (cs.int_stats) { + type_specific_stats = *cs.int_stats; + } else if (cs.double_stats) { + type_specific_stats = *cs.double_stats; + } else if (cs.string_stats) { + type_specific_stats = *cs.string_stats; + } else if (cs.bucket_stats) { + type_specific_stats = *cs.bucket_stats; + } else if (cs.decimal_stats) { + type_specific_stats = *cs.decimal_stats; + } else if (cs.date_stats) { + type_specific_stats = *cs.date_stats; + } else if (cs.binary_stats) { + type_specific_stats = *cs.binary_stats; + } else if (cs.timestamp_stats) { + type_specific_stats = *cs.timestamp_stats; } } diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index bef6bd56cba..fd11c6e3bcc 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -118,60 +118,56 @@ void ProtobufReader::read(ColumnEncoding &s, size_t maxlen) void ProtobufReader::read(integer_statistics &s, size_t maxlen) { - auto op = std::make_tuple(make_field_reader(1, s._minimum), - make_field_reader(2, s._maximum), - make_field_reader(3, s._sum)); + auto op = std::make_tuple( + make_field_reader(1, s.minimum), make_field_reader(2, s.maximum), make_field_reader(3, s.sum)); function_builder(s, maxlen, op); } void ProtobufReader::read(double_statistics &s, size_t maxlen) { - auto op = std::make_tuple(make_field_reader(1, s._minimum), - make_field_reader(2, s._maximum), - make_field_reader(3, s._sum)); + auto op = std::make_tuple( + make_field_reader(1, s.minimum), make_field_reader(2, s.maximum), make_field_reader(3, s.sum)); function_builder(s, maxlen, op); } void ProtobufReader::read(string_statistics &s, size_t maxlen) { - auto op = std::make_tuple(make_field_reader(1, s._minimum), - make_field_reader(2, s._maximum), - make_field_reader(3, s._sum)); + auto op = std::make_tuple( + make_field_reader(1, s.minimum), make_field_reader(2, s.maximum), make_field_reader(3, s.sum)); function_builder(s, maxlen, op); } void ProtobufReader::read(bucket_statistics &s, size_t maxlen) { - auto op = std::make_tuple(make_packed_field_reader(1, s._count)); + auto op = std::make_tuple(make_packed_field_reader(1, s.count)); function_builder(s, maxlen, op); } void ProtobufReader::read(decimal_statistics &s, size_t maxlen) { - auto op = std::make_tuple(make_field_reader(1, s._minimum), - make_field_reader(2, s._maximum), - make_field_reader(3, s._sum)); + auto op = std::make_tuple( + make_field_reader(1, s.minimum), make_field_reader(2, s.maximum), make_field_reader(3, s.sum)); function_builder(s, maxlen, op); } void ProtobufReader::read(date_statistics &s, size_t maxlen) { - auto op = std::make_tuple(make_field_reader(1, s._minimum), make_field_reader(2, s._maximum)); + auto op = std::make_tuple(make_field_reader(1, s.minimum), make_field_reader(2, s.maximum)); function_builder(s, maxlen, op); } void ProtobufReader::read(binary_statistics &s, size_t maxlen) { - auto op = std::make_tuple(make_field_reader(1, s._sum)); + auto op = std::make_tuple(make_field_reader(1, s.sum)); function_builder(s, maxlen, op); } void ProtobufReader::read(timestamp_statistics &s, size_t maxlen) { - auto op = std::make_tuple(make_field_reader(1, s._minimum), - make_field_reader(2, s._maximum), - make_field_reader(3, s._minimum_utc), - make_field_reader(4, s._maximum_utc)); + auto op = std::make_tuple(make_field_reader(1, s.minimum), + make_field_reader(2, s.maximum), + make_field_reader(3, s.minimum_utc), + make_field_reader(4, s.maximum_utc)); function_builder(s, maxlen, op); } diff --git a/cpp/src/io/orc/orc.h b/cpp/src/io/orc/orc.h index c26141f6295..4306ddb8ccb 100644 --- a/cpp/src/io/orc/orc.h +++ b/cpp/src/io/orc/orc.h @@ -107,18 +107,18 @@ struct StripeFooter { /** * @brief Contains per-column ORC statistics. * - * At most one of the `***_statistics` members has a non-null value. + * At most one of the `***_statistics` members has a value. */ struct column_statistics { - std::unique_ptr number_of_values; - std::unique_ptr int_stats; - std::unique_ptr double_stats; - std::unique_ptr string_stats; - std::unique_ptr bucket_stats; - std::unique_ptr decimal_stats; - std::unique_ptr date_stats; - std::unique_ptr binary_stats; - std::unique_ptr timestamp_stats; + std::optional number_of_values; + std::optional int_stats; + std::optional double_stats; + std::optional string_stats; + std::optional bucket_stats; + std::optional decimal_stats; + std::optional date_stats; + std::optional binary_stats; + std::optional timestamp_stats; // TODO: hasNull (issue #7087) }; diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 108befa80a7..8f41d86d404 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -978,47 +978,41 @@ TEST_F(OrcStatisticsTest, Basic) auto validate_statistics = [&](std::vector const& stats) { auto& s0 = stats[0]; - EXPECT_EQ(s0.type(), cudf_io::statistics_type::NONE); - EXPECT_EQ(*s0.number_of_values(), 9ul); + EXPECT_EQ(*s0.number_of_values, 9ul); auto& s1 = stats[1]; - EXPECT_EQ(s1.type(), cudf_io::statistics_type::INT); - EXPECT_EQ(*s1.number_of_values(), 4ul); - auto ts1 = s1.type_specific_stats(); - EXPECT_EQ(*ts1->minimum(), 1); - EXPECT_EQ(*ts1->maximum(), 7); - EXPECT_EQ(*ts1->sum(), 16); + EXPECT_EQ(*s1.number_of_values, 4ul); + auto& ts1 = std::get(s1.type_specific_stats); + EXPECT_EQ(*ts1.minimum, 1); + EXPECT_EQ(*ts1.maximum, 7); + EXPECT_EQ(*ts1.sum, 16); auto& s2 = stats[2]; - EXPECT_EQ(s2.type(), cudf_io::statistics_type::DOUBLE); - EXPECT_EQ(*s2.number_of_values(), 4ul); - auto ts2 = s2.type_specific_stats(); - EXPECT_EQ(*ts2->minimum(), 1.); - EXPECT_EQ(*ts2->maximum(), 7.); + EXPECT_EQ(*s2.number_of_values, 4ul); + auto& ts2 = std::get(s2.type_specific_stats); + EXPECT_EQ(*ts2.minimum, 1.); + EXPECT_EQ(*ts2.maximum, 7.); // No sum ATM, filed #7087 - EXPECT_EQ(ts2->sum(), nullptr); + ASSERT_FALSE(ts2.sum); auto& s3 = stats[3]; - EXPECT_EQ(s3.type(), cudf_io::statistics_type::STRING); - EXPECT_EQ(*s3.number_of_values(), 9ul); - auto ts3 = s3.type_specific_stats(); - EXPECT_EQ(*ts3->minimum(), "Friday"); - EXPECT_EQ(*ts3->maximum(), "Wednesday"); - EXPECT_EQ(*ts3->sum(), 58ul); + EXPECT_EQ(*s3.number_of_values, 9ul); + auto& ts3 = std::get(s3.type_specific_stats); + EXPECT_EQ(*ts3.minimum, "Friday"); + EXPECT_EQ(*ts3.maximum, "Wednesday"); + EXPECT_EQ(*ts3.sum, 58ul); auto& s4 = stats[4]; - EXPECT_EQ(s4.type(), cudf_io::statistics_type::BUCKET); - EXPECT_EQ(*s4.number_of_values(), 9ul); - EXPECT_EQ(*s4.type_specific_stats()->count(0), 8ul); + EXPECT_EQ(*s4.number_of_values, 9ul); + EXPECT_EQ(std::get(s4.type_specific_stats).count[0], 8ul); auto& s5 = stats[5]; - EXPECT_EQ(s5.type(), cudf_io::statistics_type::TIMESTAMP); - EXPECT_EQ(*s5.number_of_values(), 4ul); - auto ts5 = s5.type_specific_stats(); - EXPECT_EQ(*ts5->minimum_utc(), 1000); - EXPECT_EQ(*ts5->maximum_utc(), 7000); - EXPECT_EQ(ts5->minimum(), nullptr); - EXPECT_EQ(ts5->maximum(), nullptr); + EXPECT_EQ(*s5.number_of_values, 4ul); + auto& ts5 = std::get(s5.type_specific_stats); + EXPECT_EQ(*ts5.minimum_utc, 1000); + EXPECT_EQ(*ts5.maximum_utc, 7000); + ASSERT_FALSE(ts5.minimum); + ASSERT_FALSE(ts5.maximum); }; validate_statistics(stats.file_stats); From 1a68d2e5020c5794c42aa950903a8d08fc3d7c5a Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 13 May 2021 15:04:18 -0700 Subject: [PATCH 2/3] remove now unused proocol buffer support for unique_ptr --- cpp/src/io/orc/orc.h | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/cpp/src/io/orc/orc.h b/cpp/src/io/orc/orc.h index 4306ddb8ccb..55f8197e728 100644 --- a/cpp/src/io/orc/orc.h +++ b/cpp/src/io/orc/orc.h @@ -23,12 +23,11 @@ #include #include -#include - #include #include #include #include +#include #include #include @@ -87,9 +86,10 @@ struct Stream { // Returns index of the column in the table, if any // Stream of the 'column 0' does not have a corresponding column in the table - thrust::optional column_index() const noexcept + std::optional column_index() const noexcept { - return column_id.value_or(0) > 0 ? thrust::optional{*column_id - 1} : thrust::nullopt; + return column_id.value_or(0) > 0 ? std::optional{*column_id - 1} + : std::optional{}; } }; @@ -227,15 +227,6 @@ class ProtobufReader { return encode_field_number_base(field_number); } - // optional fields don't change the field number encoding - template >::value> * = nullptr> - int static constexpr encode_field_number(int field_number) noexcept - { - return encode_field_number_base(field_number); - } - // optional fields don't change the field number encoding template >::value> @@ -287,16 +278,6 @@ class ProtobufReader { read(value.back(), size); } - template >::value> * = nullptr> - void read_field(T &value, const uint8_t *end) - { - typename T::element_type contained_value; - read_field(contained_value, end); - value = std::make_unique(std::move(contained_value)); - } - template >::value> * = nullptr> From b82fff4b775f86c0226f32ed4570c9a4421d9321 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 13 May 2021 15:35:06 -0700 Subject: [PATCH 3/3] use the actual monostate --- cpp/include/cudf/io/orc_metadata.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index b2e0768b328..807fab2e85c 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -63,10 +63,9 @@ struct raw_orc_statistics { raw_orc_statistics read_raw_orc_statistics(source_info const& src_info); /** - * @brief Monostate type for the statistics variant. + * @brief Monostate type alias for the statistics variant. */ -struct no_statistics { -}; +using no_statistics = std::monostate; /** * @brief Base class for column statistics that include optional minimum and maximum.