Skip to content

Commit

Permalink
Return per-file metadata from readers (#10782)
Browse files Browse the repository at this point in the history
Issue #10775 

C++ side of the fix for the issue above.
Adds `per_file_user_data` to `table_metadata` so that readers can return one map per file instead of merging the maps from multiple input files into a single map, which overwrote elements that had the same key.

The original `user_data` member now holds the metadata from the first input file, instead of trying (and failing) to merge the maps.
It will be removed in the future. There is no good way to deprecate it, as the `table_metadata` struct does not have encapsulation :(

"breaking" label because the logic of `user_data` changed. Not expected to impact any working code.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Karthikeyan (https://github.com/karthikeyann)

URL: #10782
  • Loading branch information
vuule authored May 9, 2022
1 parent 6280ef0 commit c4ed468
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 26 deletions.
5 changes: 4 additions & 1 deletion cpp/include/cudf/io/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,10 @@ struct table_metadata {
std::vector<std::string> column_names; //!< Names of columns contained in the table
std::vector<column_name_info>
schema_info; //!< Detailed name information for the entire output hierarchy
std::map<std::string, std::string> user_data; //!< Format-dependent metadata as key-values pairs
std::map<std::string, std::string> user_data; //!< Format-dependent metadata of the first input
//!< file as key-values pairs (deprecated)
std::vector<std::unordered_map<std::string, std::string>>
per_file_user_data; //!< Per file format-dependent metadata as key-values pairs
};

/**
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/io/avro/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -574,7 +574,8 @@ table_with_metadata read_avro(std::unique_ptr<cudf::io::datasource>&& source,
metadata_out.column_names[i] = selected_columns[i].second;
}
// Return user metadata
metadata_out.user_data = meta.user_data;
metadata_out.user_data = meta.user_data;
metadata_out.per_file_user_data = {{meta.user_data.begin(), meta.user_data.end()}};

return {std::make_unique<table>(std::move(out_columns)), std::move(metadata_out)};
}
Expand Down
20 changes: 15 additions & 5 deletions cpp/src/io/orc/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1262,11 +1262,21 @@ table_with_metadata reader::impl::read(size_type skip_rows,

out_metadata.schema_info = std::move(schema_info);

for (const auto& meta : _metadata.per_file_metadata) {
for (const auto& kv : meta.ff.metadata) {
out_metadata.user_data.insert({kv.name, kv.value});
}
}
std::transform(_metadata.per_file_metadata.cbegin(),
_metadata.per_file_metadata.cend(),
std::back_inserter(out_metadata.per_file_user_data),
[](auto& meta) {
std::unordered_map<std::string, std::string> kv_map;
std::transform(meta.ff.metadata.cbegin(),
meta.ff.metadata.cend(),
std::inserter(kv_map, kv_map.end()),
[](auto const& kv) {
return std::pair{kv.name, kv.value};
});
return kv_map;
});
out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(),
out_metadata.per_file_user_data[0].end()};

return {std::make_unique<table>(std::move(out_columns)), std::move(out_metadata)};
}
Expand Down
50 changes: 31 additions & 19 deletions cpp/src/io/parquet/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -317,10 +317,10 @@ struct metadata : public FileMetaData {
};

class aggregate_reader_metadata {
std::vector<metadata> const per_file_metadata;
std::map<std::string, std::string> const agg_keyval_map;
size_type const num_rows;
size_type const num_row_groups;
std::vector<metadata> per_file_metadata;
std::vector<std::unordered_map<std::string, std::string>> keyval_maps;
size_type num_rows;
size_type num_row_groups;
/**
* @brief Create a metadata object from each element in the source vector
*/
Expand All @@ -335,18 +335,26 @@ class aggregate_reader_metadata {
}

/**
* @brief Merge the keyvalue maps from each per-file metadata object into a single map.
* @brief Collect the keyvalue maps from each per-file metadata object into a vector of maps.
*/
auto merge_keyval_metadata()
[[nodiscard]] auto collect_keyval_metadata()
{
std::map<std::string, std::string> merged;
// merge key/value maps TODO: warn/throw if there are mismatches?
for (auto const& pfm : per_file_metadata) {
for (auto const& kv : pfm.key_value_metadata) {
merged[kv.key] = kv.value;
}
}
return merged;
std::vector<std::unordered_map<std::string, std::string>> kv_maps;
std::transform(per_file_metadata.cbegin(),
per_file_metadata.cend(),
std::back_inserter(kv_maps),
[](auto const& pfm) {
std::unordered_map<std::string, std::string> kv_map;
std::transform(pfm.key_value_metadata.cbegin(),
pfm.key_value_metadata.cend(),
std::inserter(kv_map, kv_map.end()),
[](auto const& kv) {
return std::pair{kv.key, kv.value};
});
return kv_map;
});

return kv_maps;
}

/**
Expand Down Expand Up @@ -374,7 +382,7 @@ class aggregate_reader_metadata {
public:
aggregate_reader_metadata(std::vector<std::unique_ptr<datasource>> const& sources)
: per_file_metadata(metadatas_from_sources(sources)),
agg_keyval_map(merge_keyval_metadata()),
keyval_maps(collect_keyval_metadata()),
num_rows(calc_num_rows()),
num_row_groups(calc_num_row_groups())
{
Expand Down Expand Up @@ -425,7 +433,7 @@ class aggregate_reader_metadata {
return per_file_metadata[0].schema[schema_idx];
}

[[nodiscard]] auto const& get_key_value_metadata() const { return agg_keyval_map; }
[[nodiscard]] auto const& get_key_value_metadata() const { return keyval_maps; }

/**
* @brief Gets the concrete nesting depth of output cudf columns
Expand Down Expand Up @@ -461,8 +469,10 @@ class aggregate_reader_metadata {
*/
[[nodiscard]] std::string get_pandas_index() const
{
auto it = agg_keyval_map.find("pandas");
if (it != agg_keyval_map.end()) {
// Assumes that all input files have the same metadata
// TODO: verify this assumption
auto it = keyval_maps[0].find("pandas");
if (it != keyval_maps[0].end()) {
// Captures a list of quoted strings found inside square brackets after `"index_columns":`
// Inside quotes supports newlines, brackets, escaped quotes, etc.
// One-liner regex:
Expand Down Expand Up @@ -1759,7 +1769,9 @@ table_with_metadata reader::impl::read(size_type skip_rows,
}

// Return user metadata
out_metadata.user_data = _metadata->get_key_value_metadata();
out_metadata.per_file_user_data = _metadata->get_key_value_metadata();
out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(),
out_metadata.per_file_user_data[0].end()};

return {std::make_unique<table>(std::move(out_columns)), std::move(out_metadata)};
}
Expand Down

0 comments on commit c4ed468

Please sign in to comment.