Skip to content

Commit

Permalink
Merge pull request #5584 from codereport/cuio-modernize
Browse files Browse the repository at this point in the history
[REVIEW] Refactor `CompactProtocolReader::InitSchema`
  • Loading branch information
codereport authored Jun 27, 2020
2 parents aeb6cda + e906f03 commit 1e21c83
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 28 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@
- PR #5559 Java APIs for missing date/time operators
- PR #5582 Add support for axis and other parameters to `DataFrame.sort_index` and fix other bunch of issues.
- PR #5562 Add missing join type for java
- PR #5584 Refactor `CompactProtocolReader::InitSchema`
- PR #5591 Add `__arrow_array__` protocol and raise a descriptive error message

## Bug Fixes
Expand Down
55 changes: 27 additions & 28 deletions cpp/src/io/parquet/parquet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

#include "parquet.h"

#include <algorithm>

namespace cudf {
namespace io {
namespace parquet {
Expand Down Expand Up @@ -287,34 +289,31 @@ PARQUET_END_STRUCT()
**/
bool CompactProtocolReader::InitSchema(FileMetaData *md)
{
int final_pos = WalkSchema(md->schema);
if (final_pos != md->schema.size()) { return false; }

// Map columns to schema
for (size_t i = 0; i < md->row_groups.size(); i++) {
RowGroup *g = &md->row_groups[i];
int cur = 0;
for (size_t j = 0; j < g->columns.size(); j++) {
ColumnChunk *col = &g->columns[j];
int parent = 0; // root of schema
for (size_t k = 0; k < col->meta_data.path_in_schema.size(); k++) {
bool found = false;
int pos = cur + 1, maxpos = (int)md->schema.size();
for (int l = maxpos; l > 0; --l) {
if (pos >= maxpos) {
pos = 0; // wrap around
}
if (md->schema[pos].parent_idx == parent &&
md->schema[pos].name == col->meta_data.path_in_schema[k]) {
cur = pos;
found = true;
break;
}
pos++;
}
if (!found) { return false; }
col->schema_idx = cur;
parent = cur;
if (WalkSchema(md->schema) != md->schema.size()) return false;

/* Inside FileMetaData, there is a std::vector of RowGroups and each RowGroup contains a
* a std::vector of ColumnChunks. Each ColumnChunk has a member ColumnMetaData, which contains
* a std::vector of std::strings representing paths. The purpose of the code below is to set the
* schema_idx of each column of each row to it corresonding row_group. This is effectively
* mapping the columns to the schema.
*/
for (auto &row_group : md->row_groups) {
int current_row_group = 0;
for (auto &column : row_group.columns) {
int parent = 0; // root of schema
for (auto const &path : column.meta_data.path_in_schema) {
auto const it = [&] {
// find_if starting at (current_row_group + 1) and then wrapping
auto schema = [&](auto const &e) { return e.parent_idx == parent && e.name == path; };
auto mid = md->schema.cbegin() + current_row_group + 1;
auto it = std::find_if(mid, md->schema.cend(), schema);
if (it != md->schema.cend()) return it;
return std::find_if(md->schema.cbegin(), mid, schema);
}();
if (it == md->schema.cend()) return false;
current_row_group = std::distance(md->schema.cbegin(), it);
column.schema_idx = current_row_group;
parent = current_row_group;
}
}
}
Expand Down

0 comments on commit 1e21c83

Please sign in to comment.