Skip to content

Commit

Permalink
PARQUET-566: Add method to retrieve the full column path
Browse files Browse the repository at this point in the history
This is based on the idea of `org.apache.parquet.hadoop.metadata.ColumnPath`.

Author: Uwe L. Korn <[email protected]>

Closes apache#82 from xhochy/parquet-566 and squashes the following commits:

dd48b01 [Uwe L. Korn] Move friend declaration into protected section
43f51d1 [Uwe L. Korn] Construct ColumnPath only on request
bb30ab3 [Uwe L. Korn] Initialise parent with nullptr
98cf302 [Uwe L. Korn] Add parent node reference
799f553 [Uwe L. Korn] Deactivate C++11 lint checks
032de01 [Uwe L. Korn] Use stringstream for readability
ad887a9 [Uwe L. Korn] Adhere to Google naming conventions
ec3c008 [Uwe L. Korn] PARQUET-566: Add method to retrieve the full column path

Change-Id: Ic4453fa5ba13113a5cead1ef304c7feb238580ee
  • Loading branch information
xhochy authored and wesm committed Mar 27, 2016
1 parent ba454a4 commit 3953dc1
Show file tree
Hide file tree
Showing 7 changed files with 137 additions and 4 deletions.
17 changes: 17 additions & 0 deletions cpp/src/parquet/schema/descriptor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@

namespace parquet_cpp {

using schema::ColumnPath;
using schema::Node;
using schema::NodePtr;
using schema::PrimitiveNode;
using schema::GroupNode;
Expand Down Expand Up @@ -96,4 +98,19 @@ int ColumnDescriptor::type_length() const {
return primitive_node_->type_length();
}

const std::shared_ptr<ColumnPath> ColumnDescriptor::path() const {
// Build the path in reverse order as we traverse the nodes to the top
std::vector<std::string> rpath_;
const Node* node = primitive_node_;
// The schema node is not part of the ColumnPath
while (node->parent()) {
rpath_.push_back(node->name());
node = node->parent();
}

// Build ColumnPath in correct order
std::vector<std::string> path_(rpath_.crbegin(), rpath_.crend());
return std::make_shared<ColumnPath>(std::move(path_));
}

} // namespace parquet_cpp
5 changes: 4 additions & 1 deletion cpp/src/parquet/schema/descriptor.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ class SchemaDescriptor;
class ColumnDescriptor {
public:
ColumnDescriptor(const schema::NodePtr& node, int16_t max_definition_level,
int16_t max_repetition_level, const SchemaDescriptor* schema_descr = nullptr);
int16_t max_repetition_level,
const SchemaDescriptor* schema_descr = nullptr);

int16_t max_definition_level() const {
return max_definition_level_;
Expand All @@ -61,6 +62,8 @@ class ColumnDescriptor {
return primitive_node_->name();
}

const std::shared_ptr<schema::ColumnPath> path() const;

int type_length() const;

int type_precision() const;
Expand Down
21 changes: 21 additions & 0 deletions cpp/src/parquet/schema/schema-converter-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,23 @@ class TestSchemaConverter : public ::testing::Test {
std::unique_ptr<Node> node_;
};

bool check_for_parent_consistency(const GroupNode* node) {
// Each node should have the group as parent
for (int i = 0; i < node->field_count(); i++) {
const NodePtr& field = node->field(i);
if (field->parent() != node) {
return false;
}
if (field->is_group()) {
const GroupNode* group = static_cast<GroupNode*>(field.get());
if (!check_for_parent_consistency(group)) {
return false;
}
}
}
return true;
}

TEST_F(TestSchemaConverter, NestedExample) {
SchemaElement elt;
std::vector<SchemaElement> elements;
Expand Down Expand Up @@ -96,6 +113,10 @@ TEST_F(TestSchemaConverter, NestedExample) {
NodePtr schema = GroupNode::Make(name_, Repetition::REPEATED, fields);

ASSERT_TRUE(schema->Equals(group_));

// Check that the parent relationship in each node is consistent
ASSERT_EQ(group_->parent(), nullptr);
ASSERT_TRUE(check_for_parent_consistency(group_));
}

TEST_F(TestSchemaConverter, InvalidRoot) {
Expand Down
9 changes: 8 additions & 1 deletion cpp/src/parquet/schema/schema-descriptor-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ TEST_F(TestSchemaDescriptor, BuildTree) {
// optional group bag 1 0
// repeated group records 2 1
// required int64 item1 2 1
// optional boolean item1 3 1
// optional boolean item2 3 1
// repeated int32 item3 3 2
int16_t ex_max_def_levels[6] = {0, 1, 1, 2, 3, 3};
int16_t ex_max_rep_levels[6] = {0, 0, 1, 1, 1, 2};
Expand All @@ -117,6 +117,13 @@ TEST_F(TestSchemaDescriptor, BuildTree) {
EXPECT_EQ(ex_max_rep_levels[i], col->max_repetition_level()) << i;
}

ASSERT_EQ(descr_.Column(0)->path()->ToDotString(), "a");
ASSERT_EQ(descr_.Column(1)->path()->ToDotString(), "b");
ASSERT_EQ(descr_.Column(2)->path()->ToDotString(), "c");
ASSERT_EQ(descr_.Column(3)->path()->ToDotString(), "bag.records.item1");
ASSERT_EQ(descr_.Column(4)->path()->ToDotString(), "bag.records.item2");
ASSERT_EQ(descr_.Column(5)->path()->ToDotString(), "bag.records.item3");

// Init clears the leaves
descr_.Init(schema);
ASSERT_EQ(nleaves, descr_.num_columns());
Expand Down
15 changes: 15 additions & 0 deletions cpp/src/parquet/schema/schema-types-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,21 @@ namespace parquet_cpp {

namespace schema {

// ----------------------------------------------------------------------
// ColumnPath

// Exercises ColumnPath construction, dot-string parsing/printing and extend().
TEST(TestColumnPath, TestAttrs) {
  // A path built directly from its components prints them joined by dots.
  ColumnPath from_parts(std::vector<std::string>({"toplevel", "leaf"}));
  ASSERT_EQ(from_parts.ToDotString(), "toplevel.leaf");

  // Parsing a dotted string and printing it again yields the same string.
  auto parsed = ColumnPath::FromDotString("toplevel.leaf");
  ASSERT_EQ(parsed->ToDotString(), "toplevel.leaf");

  // extend() produces a path with one additional trailing component.
  auto longer = parsed->extend("anotherlevel");
  ASSERT_EQ(longer->ToDotString(), "toplevel.leaf.anotherlevel");
}

// ----------------------------------------------------------------------
// Primitive node

Expand Down
39 changes: 39 additions & 0 deletions cpp/src/parquet/schema/types.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include "parquet/schema/types.h"

#include <algorithm>
#include <memory>

#include "parquet/exception.h"
Expand All @@ -27,6 +28,40 @@ namespace parquet_cpp {

namespace schema {

// ----------------------------------------------------------------------
// ColumnPath

std::shared_ptr<ColumnPath> ColumnPath::FromDotString(const std::string& dotstring) {
std::stringstream ss(dotstring);
std::string item;
std::vector<std::string> path;
while (std::getline(ss, item, '.')) {
path.push_back(item);
}
return std::shared_ptr<ColumnPath>(new ColumnPath(std::move(path)));
}

std::shared_ptr<ColumnPath> ColumnPath::extend(const std::string& node_name) const {
std::vector<std::string> path;
path.reserve(path_.size() + 1);
path.resize(path_.size() + 1);
std::copy(path_.cbegin(), path_.cend(), path.begin());
path[path_.size()] = node_name;

return std::shared_ptr<ColumnPath>(new ColumnPath(std::move(path)));
}

// Renders the path as its components joined by '.', with no leading or
// trailing separator. An empty path renders as an empty string.
std::string ColumnPath::ToDotString() const {
  std::stringstream out;
  bool is_first = true;
  for (const std::string& component : path_) {
    // The separator goes between components, so skip it before the first one.
    if (!is_first) {
      out << ".";
    }
    out << component;
    is_first = false;
  }
  return out.str();
}

// ----------------------------------------------------------------------
// Base node

Expand All @@ -37,6 +72,10 @@ bool Node::EqualsInternal(const Node* other) const {
logical_type_ == other->logical_type_;
}

// Records `parent` as this node's parent; the pointer is stored as given and
// is what parent() subsequently returns. In the code shown, GroupNode's
// constructor calls this for each of its fields.
void Node::SetParent(const Node* parent) {
parent_ = parent;
}

// ----------------------------------------------------------------------
// Primitive node

Expand Down
35 changes: 33 additions & 2 deletions cpp/src/parquet/schema/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,23 @@ struct DecimalMetadata {
int32_t precision;
};

// An ordered list of node names that identifies a column within a nested
// schema, e.g. {"bag", "records", "item1"}. It can be parsed from and rendered
// to a dot-separated string.
class ColumnPath {
 public:
  // Creates an empty path.
  ColumnPath() : path_() {}

  // Creates a path by copying the given components.
  explicit ColumnPath(const std::vector<std::string>& path) : path_(path) {}

  // Creates a path by taking over the given components. `path` is a named
  // rvalue reference and therefore an lvalue inside this constructor, so it
  // must be moved explicitly; otherwise the vector would be copied.
  explicit ColumnPath(std::vector<std::string>&& path) : path_(std::move(path)) {}

  // Builds a path from a dot-separated string such as "toplevel.leaf".
  static std::shared_ptr<ColumnPath> FromDotString(const std::string& dotstring);

  // Returns a new path equal to this one with `node_name` appended.
  std::shared_ptr<ColumnPath> extend(const std::string& node_name) const;

  // Returns the components joined by '.'.
  std::string ToDotString() const;

 protected:
  // Path components in order from the outermost node to the leaf.
  std::vector<std::string> path_;
};

class GroupNode;

// Base class for logical schema types. A type has a name, repetition level,
// and optionally a logical type (ConvertedType in Parquet metadata parlance)
class Node {
Expand All @@ -95,7 +112,8 @@ class Node {
name_(name),
repetition_(repetition),
logical_type_(logical_type),
id_(id) {}
id_(id),
parent_(nullptr) {}

virtual ~Node() {}

Expand Down Expand Up @@ -141,6 +159,10 @@ class Node {
return id_;
}

const Node* parent() const {
return parent_;
}

// Node::Visitor abstract class for walking schemas with the visitor pattern
class Visitor {
public:
Expand All @@ -152,13 +174,18 @@ class Node {
virtual void Visit(Visitor* visitor) = 0;

protected:
friend class GroupNode;

Node::type type_;
std::string name_;
Repetition::type repetition_;
LogicalType::type logical_type_;
int id_;
// Nodes should not be shared, they have a single parent.
const Node* parent_;

bool EqualsInternal(const Node* other) const;
void SetParent(const Node* p_parent);
};

// Save our breath all over the place with these typedefs
Expand Down Expand Up @@ -259,7 +286,11 @@ class GroupNode : public Node {
LogicalType::type logical_type = LogicalType::NONE,
int id = -1) :
Node(Node::GROUP, name, repetition, logical_type, id),
fields_(fields) {}
fields_(fields) {
for (NodePtr& field : fields_) {
field->SetParent(this);
}
}

NodeVector fields_;
bool EqualsInternal(const GroupNode* other) const;
Expand Down

0 comments on commit 3953dc1

Please sign in to comment.