diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp
index 23ed0153f3f..9d6a83e8730 100644
--- a/cpp/include/cudf/io/types.hpp
+++ b/cpp/include/cudf/io/types.hpp
@@ -124,7 +124,10 @@ struct table_metadata {
   std::vector<std::string> column_names;  //!< Names of columns contained in the table
   std::vector<column_name_info>
     schema_info;  //!< Detailed name information for the entire output hierarchy
-  std::map<std::string, std::string> user_data;  //!< Format-dependent metadata as key-values pairs
+  std::map<std::string, std::string> user_data;  //!< Format-dependent metadata of the first input
+                                                 //!< file as key-value pairs (deprecated)
+  std::vector<std::unordered_map<std::string, std::string>>
+    per_file_user_data;  //!< Per-file format-dependent metadata as key-value pairs
 };
 
 /**
diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu
index 556ca6b9d80..f39fba0d33b 100644
--- a/cpp/src/io/avro/reader_impl.cu
+++ b/cpp/src/io/avro/reader_impl.cu
@@ -574,7 +574,8 @@ table_with_metadata read_avro(std::unique_ptr<cudf::io::datasource>&& source,
     metadata_out.column_names[i] = selected_columns[i].second;
   }
   // Return user metadata
-  metadata_out.user_data = meta.user_data;
+  metadata_out.user_data          = meta.user_data;
+  metadata_out.per_file_user_data = {{meta.user_data.begin(), meta.user_data.end()}};
 
   return {std::make_unique<table>(std::move(out_columns)), std::move(metadata_out)};
 }
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 139eb28d1a1..f64ba6f0566 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -1262,11 +1262,23 @@ table_with_metadata reader::impl::read(size_type skip_rows,
 
   out_metadata.schema_info = std::move(schema_info);
 
-  for (const auto& meta : _metadata.per_file_metadata) {
-    for (const auto& kv : meta.ff.metadata) {
-      out_metadata.user_data.insert({kv.name, kv.value});
-    }
-  }
+  // Collect the key-value metadata of each input file into its own map
+  std::transform(_metadata.per_file_metadata.cbegin(),
+                 _metadata.per_file_metadata.cend(),
+                 std::back_inserter(out_metadata.per_file_user_data),
+                 [](auto const& meta) {
+                   std::unordered_map<std::string, std::string> kv_map;
+                   std::transform(meta.ff.metadata.cbegin(),
+                                  meta.ff.metadata.cend(),
+                                  std::inserter(kv_map, kv_map.end()),
+                                  [](auto const& kv) {
+                                    return std::pair{kv.name, kv.value};
+                                  });
+                   return kv_map;
+                 });
+  // Deprecated `user_data` only exposes the metadata of the first input file
+  out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(),
+                            out_metadata.per_file_user_data[0].end()};
 
   return {std::make_unique<table>(std::move(out_columns)), std::move(out_metadata)};
 }
diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu
index f165bd5ec3b..c3537833908 100644
--- a/cpp/src/io/parquet/reader_impl.cu
+++ b/cpp/src/io/parquet/reader_impl.cu
@@ -317,10 +317,10 @@ struct metadata : public FileMetaData {
 };
 
 class aggregate_reader_metadata {
-  std::vector<metadata> const per_file_metadata;
-  std::map<std::string, std::string> const agg_keyval_map;
-  size_type const num_rows;
-  size_type const num_row_groups;
+  std::vector<metadata> per_file_metadata;
+  std::vector<std::unordered_map<std::string, std::string>> keyval_maps;
+  size_type num_rows;
+  size_type num_row_groups;
   /**
    * @brief Create a metadata object from each element in the source vector
    */
@@ -335,18 +335,26 @@ class aggregate_reader_metadata {
   }
 
   /**
-   * @brief Merge the keyvalue maps from each per-file metadata object into a single map.
+   * @brief Collect the keyvalue maps from each per-file metadata object into a vector of maps.
    */
-  auto merge_keyval_metadata()
+  [[nodiscard]] auto collect_keyval_metadata()
   {
-    std::map<std::string, std::string> merged;
-    // merge key/value maps TODO: warn/throw if there are mismatches?
-    for (auto const& pfm : per_file_metadata) {
-      for (auto const& kv : pfm.key_value_metadata) {
-        merged[kv.key] = kv.value;
-      }
-    }
-    return merged;
+    std::vector<std::unordered_map<std::string, std::string>> kv_maps;
+    std::transform(per_file_metadata.cbegin(),
+                   per_file_metadata.cend(),
+                   std::back_inserter(kv_maps),
+                   [](auto const& pfm) {
+                     std::unordered_map<std::string, std::string> kv_map;
+                     std::transform(pfm.key_value_metadata.cbegin(),
+                                    pfm.key_value_metadata.cend(),
+                                    std::inserter(kv_map, kv_map.end()),
+                                    [](auto const& kv) {
+                                      return std::pair{kv.key, kv.value};
+                                    });
+                     return kv_map;
+                   });
+
+    return kv_maps;
   }
 
   /**
@@ -374,7 +382,7 @@ class aggregate_reader_metadata {
  public:
   aggregate_reader_metadata(std::vector<std::unique_ptr<datasource>> const& sources)
     : per_file_metadata(metadatas_from_sources(sources)),
-      agg_keyval_map(merge_keyval_metadata()),
+      keyval_maps(collect_keyval_metadata()),
       num_rows(calc_num_rows()),
       num_row_groups(calc_num_row_groups())
   {
@@ -425,7 +433,7 @@ class aggregate_reader_metadata {
     return per_file_metadata[0].schema[schema_idx];
   }
 
-  [[nodiscard]] auto const& get_key_value_metadata() const { return agg_keyval_map; }
+  [[nodiscard]] auto const& get_key_value_metadata() const { return keyval_maps; }
 
   /**
    * @brief Gets the concrete nesting depth of output cudf columns
@@ -461,8 +469,10 @@ class aggregate_reader_metadata {
    */
   [[nodiscard]] std::string get_pandas_index() const
   {
-    auto it = agg_keyval_map.find("pandas");
-    if (it != agg_keyval_map.end()) {
+    // Assumes that all input files have the same metadata
+    // TODO: verify this assumption
+    auto it = keyval_maps[0].find("pandas");
+    if (it != keyval_maps[0].end()) {
       // Captures a list of quoted strings found inside square brackets after `"index_columns":`
       // Inside quotes supports newlines, brackets, escaped quotes, etc.
       // One-liner regex:
@@ -1759,7 +1769,10 @@ table_with_metadata reader::impl::read(size_type skip_rows,
   }
 
   // Return user metadata
-  out_metadata.user_data = _metadata->get_key_value_metadata();
+  // Deprecated `user_data` only exposes the metadata of the first input file
+  out_metadata.per_file_user_data = _metadata->get_key_value_metadata();
+  out_metadata.user_data          = {out_metadata.per_file_user_data[0].begin(),
+                                     out_metadata.per_file_user_data[0].end()};
 
   return {std::make_unique<table>(std::move(out_columns)), std::move(out_metadata)};
 }