Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Return per-file metadata from readers #10782

Merged
merged 5 commits into from
May 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion cpp/include/cudf/io/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,10 @@ struct table_metadata {
std::vector<std::string> column_names; //!< Names of columns contained in the table
std::vector<column_name_info>
schema_info; //!< Detailed name information for the entire output hierarchy
std::map<std::string, std::string> user_data; //!< Format-dependent metadata as key-values pairs
std::map<std::string, std::string> user_data; //!< Format-dependent metadata of the first input
//!< file as key-value pairs (deprecated)
std::vector<std::unordered_map<std::string, std::string>>
per_file_user_data; //!< Per-file format-dependent metadata as key-value pairs
};

/**
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/io/avro/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -574,7 +574,8 @@ table_with_metadata read_avro(std::unique_ptr<cudf::io::datasource>&& source,
metadata_out.column_names[i] = selected_columns[i].second;
}
// Return user metadata
metadata_out.user_data = meta.user_data;
metadata_out.user_data = meta.user_data;
metadata_out.per_file_user_data = {{meta.user_data.begin(), meta.user_data.end()}};

return {std::make_unique<table>(std::move(out_columns)), std::move(metadata_out)};
}
Expand Down
20 changes: 15 additions & 5 deletions cpp/src/io/orc/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1262,11 +1262,21 @@ table_with_metadata reader::impl::read(size_type skip_rows,

out_metadata.schema_info = std::move(schema_info);

for (const auto& meta : _metadata.per_file_metadata) {
for (const auto& kv : meta.ff.metadata) {
out_metadata.user_data.insert({kv.name, kv.value});
}
}
std::transform(_metadata.per_file_metadata.cbegin(),
_metadata.per_file_metadata.cend(),
std::back_inserter(out_metadata.per_file_user_data),
[](auto& meta) {
std::unordered_map<std::string, std::string> kv_map;
std::transform(meta.ff.metadata.cbegin(),
meta.ff.metadata.cend(),
std::inserter(kv_map, kv_map.end()),
[](auto const& kv) {
return std::pair{kv.name, kv.value};
});
return kv_map;
});
out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(),
out_metadata.per_file_user_data[0].end()};

return {std::make_unique<table>(std::move(out_columns)), std::move(out_metadata)};
}
Expand Down
50 changes: 31 additions & 19 deletions cpp/src/io/parquet/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -317,10 +317,10 @@ struct metadata : public FileMetaData {
};

class aggregate_reader_metadata {
std::vector<metadata> const per_file_metadata;
std::map<std::string, std::string> const agg_keyval_map;
size_type const num_rows;
size_type const num_row_groups;
std::vector<metadata> per_file_metadata;
std::vector<std::unordered_map<std::string, std::string>> keyval_maps;
size_type num_rows;
size_type num_row_groups;
/**
* @brief Create a metadata object from each element in the source vector
*/
Expand All @@ -335,18 +335,26 @@ class aggregate_reader_metadata {
}

/**
* @brief Merge the keyvalue maps from each per-file metadata object into a single map.
* @brief Collect the key-value maps from each per-file metadata object into a vector of maps.
*/
auto merge_keyval_metadata()
[[nodiscard]] auto collect_keyval_metadata()
{
std::map<std::string, std::string> merged;
// merge key/value maps TODO: warn/throw if there are mismatches?
for (auto const& pfm : per_file_metadata) {
for (auto const& kv : pfm.key_value_metadata) {
merged[kv.key] = kv.value;
}
}
return merged;
std::vector<std::unordered_map<std::string, std::string>> kv_maps;
std::transform(per_file_metadata.cbegin(),
per_file_metadata.cend(),
std::back_inserter(kv_maps),
[](auto const& pfm) {
std::unordered_map<std::string, std::string> kv_map;
std::transform(pfm.key_value_metadata.cbegin(),
pfm.key_value_metadata.cend(),
std::inserter(kv_map, kv_map.end()),
[](auto const& kv) {
return std::pair{kv.key, kv.value};
});
return kv_map;
});

return kv_maps;
}

/**
Expand Down Expand Up @@ -374,7 +382,7 @@ class aggregate_reader_metadata {
public:
aggregate_reader_metadata(std::vector<std::unique_ptr<datasource>> const& sources)
: per_file_metadata(metadatas_from_sources(sources)),
agg_keyval_map(merge_keyval_metadata()),
keyval_maps(collect_keyval_metadata()),
num_rows(calc_num_rows()),
num_row_groups(calc_num_row_groups())
{
Expand Down Expand Up @@ -425,7 +433,7 @@ class aggregate_reader_metadata {
return per_file_metadata[0].schema[schema_idx];
}

[[nodiscard]] auto const& get_key_value_metadata() const { return agg_keyval_map; }
[[nodiscard]] auto const& get_key_value_metadata() const { return keyval_maps; }

/**
* @brief Gets the concrete nesting depth of output cudf columns
Expand Down Expand Up @@ -461,8 +469,10 @@ class aggregate_reader_metadata {
*/
[[nodiscard]] std::string get_pandas_index() const
{
auto it = agg_keyval_map.find("pandas");
if (it != agg_keyval_map.end()) {
// Assumes that all input files have the same metadata
// TODO: verify this assumption
auto it = keyval_maps[0].find("pandas");
if (it != keyval_maps[0].end()) {
Comment on lines +472 to +475
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure how this works if the first file is written by pandas (and thus has index_columns) while the second one is written by cudf (and has no index_columns). To be verified once the Python changes are finished.

// Captures a list of quoted strings found inside square brackets after `"index_columns":`
// The quoted strings may contain newlines, brackets, escaped quotes, etc.
// One-liner regex:
Expand Down Expand Up @@ -1759,7 +1769,9 @@ table_with_metadata reader::impl::read(size_type skip_rows,
}

// Return user metadata
out_metadata.user_data = _metadata->get_key_value_metadata();
out_metadata.per_file_user_data = _metadata->get_key_value_metadata();
out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(),
out_metadata.per_file_user_data[0].end()};

return {std::make_unique<table>(std::move(out_columns)), std::move(out_metadata)};
}
Expand Down