From 1e08a2e7bfab06be625f2efde5159b78eba24bcd Mon Sep 17 00:00:00 2001 From: gibber9809 <devinbook1@gmail.com> Date: Thu, 23 May 2024 18:49:44 +0000 Subject: [PATCH 01/11] Implement serialization for structurized arrays --- components/core/src/clp_s/JsonSerializer.hpp | 14 +- components/core/src/clp_s/SchemaReader.cpp | 251 ++++++++++++++++++- components/core/src/clp_s/SchemaReader.hpp | 24 ++ 3 files changed, 284 insertions(+), 5 deletions(-) diff --git a/components/core/src/clp_s/JsonSerializer.hpp b/components/core/src/clp_s/JsonSerializer.hpp index 4b7ada80d..b5fe866b5 100644 --- a/components/core/src/clp_s/JsonSerializer.hpp +++ b/components/core/src/clp_s/JsonSerializer.hpp @@ -23,7 +23,9 @@ class JsonSerializer { AddStringValue, AddNullValue, BeginArray, - EndArray + EndArray, + BeginDocument, + BeginArrayDocument, }; static int64_t const cReservedLength = 4096; @@ -76,19 +78,25 @@ class JsonSerializer { void end_document() { m_json_string[m_json_string.size() - 1] = '}'; } void end_object() { - if (m_op_list[m_op_list_index - 2] != BeginObject) { + if (m_op_list[m_op_list_index - 2] != BeginObject + && m_op_list[m_op_list_index - 2] != BeginDocument) + { m_json_string.pop_back(); } m_json_string += "},"; } + void begin_array_document() { m_json_string += "["; } + void begin_array() { append_key(); m_json_string += "["; } void end_array() { - if (m_op_list[m_op_list_index - 2] != BeginArray) { + if (m_op_list[m_op_list_index - 2] != BeginArray + && m_op_list[m_op_list_index - 2] != BeginArrayDocument) + { m_json_string.pop_back(); } m_json_string += "],"; diff --git a/components/core/src/clp_s/SchemaReader.cpp b/components/core/src/clp_s/SchemaReader.cpp index dfe3b1934..5ca6674d8 100644 --- a/components/core/src/clp_s/SchemaReader.cpp +++ b/components/core/src/clp_s/SchemaReader.cpp @@ -72,6 +72,10 @@ void SchemaReader::generate_json_string() { m_json_serializer.end_object(); break; } + case JsonSerializer::Op::BeginDocument: { + m_json_serializer.begin_document(); + break; + } case JsonSerializer::Op::BeginArray: { m_json_serializer.begin_array(); break; @@ -80,6 +84,10 @@ void SchemaReader::generate_json_string() { m_json_serializer.end_array(); break; } + case JsonSerializer::Op::BeginArrayDocument: { + m_json_serializer.begin_array_document(); + break; + } case JsonSerializer::Op::AddIntField: { column = m_reordered_columns[column_id_index++]; auto const& name = m_global_schema_tree->get_node(column->get_id()).get_key_name(); @@ -294,6 +302,236 @@ int32_t SchemaReader::get_first_column_in_span(std::span<int32_t> schema) { return -1; } +void SchemaReader::find_intersection_and_fix_brackets( + int32_t cur_root, + int32_t next_root, + std::vector<int32_t>& path_to_intersection +) { + auto const* cur_node = &m_global_schema_tree->get_node(cur_root); + auto const* next_node = &m_global_schema_tree->get_node(next_root); + while (cur_node->get_parent_id() != next_node->get_parent_id()) { + if (cur_node->get_depth() > next_node->get_depth()) { + cur_root = cur_node->get_parent_id(); + cur_node = &m_global_schema_tree->get_node(cur_root); + m_json_serializer.add_op(JsonSerializer::Op::EndObject); + } else if (cur_node->get_depth() < next_node->get_depth()) { + path_to_intersection.push_back(next_root); + next_root = next_node->get_parent_id(); + next_node = &m_global_schema_tree->get_node(next_root); + } else { + cur_root = cur_node->get_parent_id(); + cur_node = &m_global_schema_tree->get_node(cur_root); + m_json_serializer.add_op(JsonSerializer::Op::EndObject); + path_to_intersection.push_back(next_root); + next_root = next_node->get_parent_id(); + next_node = &m_global_schema_tree->get_node(next_root); + } + } + + for (auto it = path_to_intersection.rbegin(); it != path_to_intersection.rend(); ++it) { + auto const& node = m_global_schema_tree->get_node(*it); + bool no_name = true; + if (false == node.get_key_name().empty()) { + m_json_serializer.add_special_key(node.get_key_name()); + no_name = false; + } + if (NodeType::Object == node.get_type()) { + m_json_serializer.add_op( + no_name ? JsonSerializer::Op::BeginDocument : JsonSerializer::Op::BeginObject + ); + } else if (NodeType::StructuredArray == node.get_type()) { + m_json_serializer.add_op( + no_name ? JsonSerializer::Op::BeginArrayDocument + : JsonSerializer::Op::BeginArray + ); + } + } + path_to_intersection.clear(); +} + +size_t SchemaReader::generate_structured_array_template( + int32_t array_root, + size_t column_start, + std::span<int32_t> schema +) { + size_t column_idx = column_start; + std::vector<int32_t> path_to_intersection; + int32_t depth = m_global_schema_tree->get_node(array_root).get_depth(); + + for (size_t i = 0; i < schema.size(); ++i) { + int32_t global_column_id = schema[i]; + if (Schema::schema_entry_is_unordered_object(global_column_id)) { + auto type = Schema::get_unordered_object_type(global_column_id); + size_t length = Schema::get_unordered_object_length(global_column_id); + auto sub_object_schema = schema.subspan(i + 1, length); + if (NodeType::StructuredArray == type) { + int32_t sub_array_root + = m_global_schema_tree->find_matching_subtree_root_in_subtree( + array_root, + get_first_column_in_span(sub_object_schema), + NodeType::StructuredArray + ); + m_json_serializer.add_op(JsonSerializer::Op::BeginArrayDocument); + column_idx = generate_structured_array_template( + sub_array_root, + column_idx, + sub_object_schema + ); + m_json_serializer.add_op(JsonSerializer::Op::EndArray); + } else if (NodeType::Object == type) { + int32_t object_root = m_global_schema_tree->find_matching_subtree_root_in_subtree( + array_root, + get_first_column_in_span(sub_object_schema), + NodeType::Object + ); + m_json_serializer.add_op(JsonSerializer::Op::BeginDocument); + column_idx = generate_structured_object_template( + object_root, + column_idx, + sub_object_schema + ); + m_json_serializer.add_op(JsonSerializer::Op::EndObject); + } + i += length; + } else { + auto const& node = m_global_schema_tree->get_node(global_column_id); + switch (node.get_type()) { + case NodeType::Object: { + find_intersection_and_fix_brackets( + array_root, + node.get_id(), + path_to_intersection + ); + for (int j = 0; j < (node.get_depth() - depth); ++j) { + m_json_serializer.add_op(JsonSerializer::Op::EndObject); + } + break; + } + case NodeType::StructuredArray: { + m_json_serializer.add_op(JsonSerializer::Op::BeginArrayDocument); + m_json_serializer.add_op(JsonSerializer::Op::EndArray); + break; + } + case NodeType::Integer: { + m_json_serializer.add_op(JsonSerializer::Op::AddIntValue); + m_reordered_columns.push_back(m_columns[column_idx++]); + break; + } + case NodeType::Float: { + m_json_serializer.add_op(JsonSerializer::Op::AddFloatValue); + m_reordered_columns.push_back(m_columns[column_idx++]); + break; + } + case NodeType::Boolean: { + m_json_serializer.add_op(JsonSerializer::Op::AddBoolValue); + m_reordered_columns.push_back(m_columns[column_idx++]); + break; + } + case NodeType::ClpString: + case NodeType::VarString: { + m_json_serializer.add_op(JsonSerializer::Op::AddStringValue); + m_reordered_columns.push_back(m_columns[column_idx++]); + break; + } + case NodeType::NullValue: { + m_json_serializer.add_op(JsonSerializer::Op::AddNullValue); + break; + } + case NodeType::DateString: + case NodeType::UnstructuredArray: + case NodeType::Unknown: + break; + } + } + } + return column_idx; +} + +size_t SchemaReader::generate_structured_object_template( + int32_t object_root, + size_t column_start, + std::span<int32_t> schema +) { + int32_t root = object_root; + size_t column_idx = column_start; + std::vector<int32_t> path_to_intersection; + + for (size_t i = 0; i < schema.size(); ++i) { + int32_t global_column_id = schema[i]; + if (Schema::schema_entry_is_unordered_object(global_column_id)) { + // It should only be possible to encounter arrays inside of structured objects + size_t array_length = Schema::get_unordered_object_length(global_column_id); + auto array_schema = schema.subspan(i + 1, array_length); + // we can guarantee that the last array we hit on the path to object root must be the + // right one because otherwise we'd be inside the structured array generator + int32_t array_root = m_global_schema_tree->find_matching_subtree_root_in_subtree( + object_root, + get_first_column_in_span(array_schema), + NodeType::StructuredArray + ); + + find_intersection_and_fix_brackets(root, array_root, path_to_intersection); + column_idx = generate_structured_array_template(array_root, column_idx, array_schema); + m_json_serializer.add_op(JsonSerializer::Op::EndArray); + i += array_length; + // root is parent of the array object since we close the array bracket above + auto const& node = m_global_schema_tree->get_node(array_root); + root = node.get_parent_id(); + } else { + auto const& node = m_global_schema_tree->get_node(global_column_id); + int32_t next_root = node.get_parent_id(); + find_intersection_and_fix_brackets(root, next_root, path_to_intersection); + root = next_root; + switch (node.get_type()) { + case NodeType::Object: { + m_json_serializer.add_op(JsonSerializer::Op::BeginObject); + m_json_serializer.add_special_key(node.get_key_name()); + m_json_serializer.add_op(JsonSerializer::Op::EndObject); + break; + } + case NodeType::StructuredArray: { + m_json_serializer.add_op(JsonSerializer::Op::BeginArray); + m_json_serializer.add_special_key(node.get_key_name()); + m_json_serializer.add_op(JsonSerializer::Op::EndArray); + break; + } + case NodeType::Integer: { + m_json_serializer.add_op(JsonSerializer::Op::AddIntField); + m_reordered_columns.push_back(m_columns[column_idx++]); + break; + } + case NodeType::Float: { + m_json_serializer.add_op(JsonSerializer::Op::AddFloatField); + m_reordered_columns.push_back(m_columns[column_idx++]); + break; + } + case NodeType::Boolean: { + m_json_serializer.add_op(JsonSerializer::Op::AddBoolField); + m_reordered_columns.push_back(m_columns[column_idx++]); + break; + } + case NodeType::ClpString: + case NodeType::VarString: { + m_json_serializer.add_op(JsonSerializer::Op::AddStringField); + m_reordered_columns.push_back(m_columns[column_idx++]); + break; + } + case NodeType::NullValue: { + m_json_serializer.add_op(JsonSerializer::Op::AddNullField); + m_json_serializer.add_special_key(node.get_key_name()); + break; + } + case NodeType::DateString: + case NodeType::UnstructuredArray: + case NodeType::Unknown: + break; + } + } + } + find_intersection_and_fix_brackets(root, object_root, path_to_intersection); + return column_idx; +} + void SchemaReader::initialize_serializer() { if (m_serializer_initialized) { return; @@ -339,10 +577,19 @@ void SchemaReader::generate_json_template(int32_t id) { break; } case NodeType::StructuredArray: { - // Note: Marshalling structured arrays is left intentionally stubbed out so that we - // can split up the PR for supporting structurized arrays. m_json_serializer.add_op(JsonSerializer::Op::BeginArray); m_json_serializer.add_special_key(key); + int32_t global_child_id = m_local_id_to_global_id[child_id]; + auto structured_it = m_global_id_to_unordered_object.find(global_child_id); + if (m_global_id_to_unordered_object.end() != structured_it) { + size_t column_start = structured_it->second.first; + std::span<int32_t> structured_schema = structured_it->second.second; + generate_structured_array_template( + global_child_id, + column_start, + structured_schema + ); + } m_json_serializer.add_op(JsonSerializer::Op::EndArray); break; } diff --git a/components/core/src/clp_s/SchemaReader.hpp b/components/core/src/clp_s/SchemaReader.hpp index 6ea5f57df..1e5e6a349 100644 --- a/components/core/src/clp_s/SchemaReader.hpp +++ b/components/core/src/clp_s/SchemaReader.hpp @@ -193,6 +193,24 @@ class SchemaReader { */ void generate_json_template(int32_t id); + /** + * Generates a json template for a structured array + * @param id + * @param column_start + * @param schema + */ + size_t + generate_structured_array_template(int32_t id, size_t column_start, std::span<int32_t> schema); + + /** + * Generates a json template for a structured object + * @param id + * @param column_start + * @param schema + */ + size_t + generate_structured_object_template(int32_t id, size_t column_start, std::span<int32_t> schema); + /** * @param schema * @return the first column ID found in the given schema, or -1 if the schema contains no @@ -200,6 +218,12 @@ class SchemaReader { */ static inline int32_t get_first_column_in_span(std::span<int32_t> schema); + void find_intersection_and_fix_brackets( + int32_t cur_root, + int32_t next_root, + std::vector<int32_t>& path_to_intersection + ); + /** * Generates a json string from the extracted values */ From a906592f0f1ae1ff47a02f03c20b69f87c0420be Mon Sep 17 00:00:00 2001 From: gibber9809 <devinbook1@gmail.com> Date: Fri, 24 May 2024 20:06:48 +0000 Subject: [PATCH 02/11] Rename some JsonSerializer ops --- components/core/src/clp_s/JsonSerializer.hpp | 8 ++++---- components/core/src/clp_s/SchemaReader.cpp | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/components/core/src/clp_s/JsonSerializer.hpp b/components/core/src/clp_s/JsonSerializer.hpp index b5fe866b5..01a8a1e74 100644 --- a/components/core/src/clp_s/JsonSerializer.hpp +++ b/components/core/src/clp_s/JsonSerializer.hpp @@ -24,8 +24,8 @@ class JsonSerializer { AddNullValue, BeginArray, EndArray, - BeginDocument, - BeginArrayDocument, + BeginUnnamedObject, + BeginUnnamedArray, }; static int64_t const cReservedLength = 4096; @@ -79,7 +79,7 @@ class JsonSerializer { void end_object() { if (m_op_list[m_op_list_index - 2] != BeginObject - && m_op_list[m_op_list_index - 2] != BeginDocument) + && m_op_list[m_op_list_index - 2] != BeginUnnamedObject) { m_json_string.pop_back(); } @@ -95,7 +95,7 @@ class JsonSerializer { void end_array() { if (m_op_list[m_op_list_index - 2] != BeginArray - && m_op_list[m_op_list_index - 2] != BeginArrayDocument) + && m_op_list[m_op_list_index - 2] != BeginUnnamedArray) { m_json_string.pop_back(); } diff --git a/components/core/src/clp_s/SchemaReader.cpp b/components/core/src/clp_s/SchemaReader.cpp index 5ca6674d8..2b63b0ee7 100644 --- a/components/core/src/clp_s/SchemaReader.cpp +++ b/components/core/src/clp_s/SchemaReader.cpp @@ -72,7 +72,7 @@ void SchemaReader::generate_json_string() { m_json_serializer.end_object(); break; } - case JsonSerializer::Op::BeginDocument: { + case JsonSerializer::Op::BeginUnnamedObject: { m_json_serializer.begin_document(); break; } @@ -84,7 +84,7 @@ void SchemaReader::generate_json_string() { m_json_serializer.end_array(); break; } - case JsonSerializer::Op::BeginArrayDocument: { + case JsonSerializer::Op::BeginUnnamedArray: { m_json_serializer.begin_array_document(); break; } @@ -337,12 +337,12 @@ void SchemaReader::find_intersection_and_fix_brackets( } if (NodeType::Object == node.get_type()) { m_json_serializer.add_op( - no_name ? JsonSerializer::Op::BeginDocument : JsonSerializer::Op::BeginObject + no_name ? JsonSerializer::Op::BeginUnnamedObject + : JsonSerializer::Op::BeginObject ); } else if (NodeType::StructuredArray == node.get_type()) { m_json_serializer.add_op( - no_name ? JsonSerializer::Op::BeginArrayDocument - : JsonSerializer::Op::BeginArray + no_name ? JsonSerializer::Op::BeginUnnamedArray : JsonSerializer::Op::BeginArray ); } } @@ -371,7 +371,7 @@ size_t SchemaReader::generate_structured_array_template( get_first_column_in_span(sub_object_schema), NodeType::StructuredArray ); - m_json_serializer.add_op(JsonSerializer::Op::BeginArrayDocument); + m_json_serializer.add_op(JsonSerializer::Op::BeginUnnamedArray); column_idx = generate_structured_array_template( sub_array_root, column_idx, @@ -384,7 +384,7 @@ size_t SchemaReader::generate_structured_array_template( get_first_column_in_span(sub_object_schema), NodeType::Object ); - m_json_serializer.add_op(JsonSerializer::Op::BeginDocument); + m_json_serializer.add_op(JsonSerializer::Op::BeginUnnamedObject); column_idx = generate_structured_object_template( object_root, column_idx, @@ -408,7 +408,7 @@ size_t SchemaReader::generate_structured_array_template( break; } case NodeType::StructuredArray: { - m_json_serializer.add_op(JsonSerializer::Op::BeginArrayDocument); + m_json_serializer.add_op(JsonSerializer::Op::BeginUnnamedArray); m_json_serializer.add_op(JsonSerializer::Op::EndArray); break; } From 9d14226031e37edb072d83344fe1c4de66c6909f Mon Sep 17 00:00:00 2001 From: gibber9809 <devinbook1@gmail.com> Date: Wed, 29 May 2024 17:24:53 +0000 Subject: [PATCH 03/11] Fix a bug where empty structured arrays do not get marshalled --- components/core/src/clp_s/ArchiveReader.cpp | 33 +++++++++++++-------- components/core/src/clp_s/ArchiveReader.hpp | 4 +-- components/core/src/clp_s/SchemaReader.hpp | 14 ++++----- 3 files changed, 30 insertions(+), 21 deletions(-) diff --git a/components/core/src/clp_s/ArchiveReader.cpp b/components/core/src/clp_s/ArchiveReader.cpp index 084593639..cc3734a8a 100644 --- a/components/core/src/clp_s/ArchiveReader.cpp +++ b/components/core/src/clp_s/ArchiveReader.cpp @@ -143,11 +143,10 @@ BaseColumnReader* ArchiveReader::append_reader_column(SchemaReader& reader, int3 void ArchiveReader::append_unordered_reader_columns( SchemaReader& reader, - NodeType unordered_object_type, + int32_t mst_subtree_root_node_id, std::span<int32_t> schema_ids, bool should_marshal_records ) { - int32_t mst_subtree_root_node_id = INT32_MAX; size_t object_begin_pos = reader.get_column_size(); for (int32_t column_id : schema_ids) { if (Schema::schema_entry_is_unordered_object(column_id)) { @@ -155,13 +154,6 @@ void ArchiveReader::append_unordered_reader_columns( } BaseColumnReader* column_reader = nullptr; auto const& node = m_schema_tree->get_node(column_id); - if (INT32_MAX == mst_subtree_root_node_id) { - mst_subtree_root_node_id = m_schema_tree->find_matching_subtree_root_in_subtree( - -1, - column_id, - unordered_object_type - ); - } switch (node.get_type()) { case NodeType::Integer: column_reader = new Int64ColumnReader(column_id); @@ -214,19 +206,36 @@ SchemaReader& ArchiveReader::create_schema_reader( should_marshal_records ); auto timestamp_column_ids = m_timestamp_dict->get_authoritative_timestamp_column_ids(); - for (size_t i = 0; i < schema.size(); ++i) { int32_t column_id = schema[i]; if (Schema::schema_entry_is_unordered_object(column_id)) { size_t length = Schema::get_unordered_object_length(column_id); + + auto sub_schema = schema.get_view(i + 1, length); + auto mst_subtree_root_node_id = m_schema_tree->find_matching_subtree_root_in_subtree( + -1, + SchemaReader::get_first_column_in_span(sub_schema), + Schema::get_unordered_object_type(column_id) + ); append_unordered_reader_columns( m_schema_reader, - Schema::get_unordered_object_type(column_id), - schema.get_view(i + 1, length), + mst_subtree_root_node_id, + sub_schema, should_marshal_records ); i += length; continue; + } else if (i >= schema.get_num_ordered()) { + // Length one unordered object that doesn't have a tag. This is only allowed when the + // column id is the root of the unordered object, so we can pass it directly to + // append_unordered_reader_columns. + append_unordered_reader_columns( + m_schema_reader, + column_id, + schema.get_view(i, 0), + should_marshal_records + ); + continue; } BaseColumnReader* column_reader = append_reader_column(m_schema_reader, column_id); diff --git a/components/core/src/clp_s/ArchiveReader.hpp b/components/core/src/clp_s/ArchiveReader.hpp index 6ce881e91..54eb42698 100644 --- a/components/core/src/clp_s/ArchiveReader.hpp +++ b/components/core/src/clp_s/ArchiveReader.hpp @@ -149,13 +149,13 @@ class ArchiveReader { /** * Appends columns for the entire schema of an unordered object. * @param reader - * @param unordered_object_type + * @param mst_subtree_root_node_id * @param schema_ids * @param should_marshal_records */ void append_unordered_reader_columns( SchemaReader& reader, - NodeType unordered_object_type, + int32_t mst_subtree_root_node_id, std::span<int32_t> schema_ids, bool should_marshal_records ); diff --git a/components/core/src/clp_s/SchemaReader.hpp b/components/core/src/clp_s/SchemaReader.hpp index 1e5e6a349..1df5cbf1b 100644 --- a/components/core/src/clp_s/SchemaReader.hpp +++ b/components/core/src/clp_s/SchemaReader.hpp @@ -178,6 +178,13 @@ class SchemaReader { int32_t get_schema_id() const { return m_schema_id; } + /** + * @param schema + * @return the first column ID found in the given schema, or -1 if the schema contains no + * columns + */ + static int32_t get_first_column_in_span(std::span<int32_t> schema); + private: /** * Merges the current local schema tree with the section of the global schema tree corresponding @@ -211,13 +218,6 @@ class SchemaReader { size_t generate_structured_object_template(int32_t id, size_t column_start, std::span<int32_t> schema); - /** - * @param schema - * @return the first column ID found in the given schema, or -1 if the schema contains no - * columns - */ - static inline int32_t get_first_column_in_span(std::span<int32_t> schema); - void find_intersection_and_fix_brackets( int32_t cur_root, int32_t next_root, From a9aff7ca08bcdfdec3ebf00ed4a09f4c26d9e881 Mon Sep 17 00:00:00 2001 From: gibber9809 <devinbook1@gmail.com> Date: Wed, 29 May 2024 17:48:54 +0000 Subject: [PATCH 04/11] Fix bug where find_intersection_and_fix_brackets can sometimes miss one level of bracket fixing --- components/core/src/clp_s/SchemaReader.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/components/core/src/clp_s/SchemaReader.cpp b/components/core/src/clp_s/SchemaReader.cpp index 2b63b0ee7..e6f423b9f 100644 --- a/components/core/src/clp_s/SchemaReader.cpp +++ b/components/core/src/clp_s/SchemaReader.cpp @@ -328,6 +328,14 @@ void SchemaReader::find_intersection_and_fix_brackets( } } + // The loop above ends when the parent of next node and cur node matches. When these two nodes + // have the same parent but are different nodes we need to close the last bracket for the + // previous node, and add the first key for next node. + if (cur_node != next_node) { + m_json_serializer.add_op(JsonSerializer::Op::EndObject); + path_to_intersection.push_back(next_node->get_id()); + } + for (auto it = path_to_intersection.rbegin(); it != path_to_intersection.rend(); ++it) { auto const& node = m_global_schema_tree->get_node(*it); bool no_name = true; From 1fe417f7aa1cd8999539568d9c9440ec755f1789 Mon Sep 17 00:00:00 2001 From: gibber9809 <devinbook1@gmail.com> Date: Wed, 29 May 2024 18:32:29 +0000 Subject: [PATCH 05/11] Improve comments in SchemaReader.hpp --- components/core/src/clp_s/SchemaReader.hpp | 29 ++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp_s/SchemaReader.hpp b/components/core/src/clp_s/SchemaReader.hpp index 1df5cbf1b..7c96ac57e 100644 --- a/components/core/src/clp_s/SchemaReader.hpp +++ b/components/core/src/clp_s/SchemaReader.hpp @@ -203,8 +203,9 @@ class SchemaReader { /** * Generates a json template for a structured array * @param id - * @param column_start + * @param column_start the index of the first reader in m_columns belonging to this array * @param schema + * @return the index of the next reader in m_columns after those consumed by this array */ size_t generate_structured_array_template(int32_t id, size_t column_start, std::span<int32_t> schema); @@ -212,12 +213,36 @@ class SchemaReader { /** * Generates a json template for a structured object * @param id - * @param column_start + * @param column_start the index of the first reader in m_columns belonging to this object * @param schema + * @return the index of the next reader in m_columns after those consumed by this object */ size_t generate_structured_object_template(int32_t id, size_t column_start, std::span<int32_t> schema); + /** + * Finds the common root of the subtree containing cur_root and next_root, and adds brackets + * and keys to m_json_serializer as necessary so that the json object is correct between the + * previous field which is a child of cur_root, and the next field which is a child of + * next_root. + * + * For example for the object {"a": {"b":"c"}, "d": {"e":{"f":"g"}} after appending "b" cur_root + * would be "a", and next_root would be "e". (since it is the parent of the next field "f"). + * The current state of the object would look like {"a":{"b":"c" -- to prepare for "f" we would + * add },"d":{"e":{ or in other words close one bracket, add "d" and open bracket, add "e" and + * open bracket. After adding field "f" the current root is "e", and the next root is the + * original object which is the parent of "a" so we add }}. + * + * This works by tracing the path between both cur_root and next_root to their nearest common + * ancestor. For every step cur_root takes towards this common ancestor we must close a bracket, + * and for every step on the path from next_root a key must be added and a bracket must be + * opened. The parameter `path_to_intersection` is used as a buffer to store the path from + * next_root to this intersection so that the keys can be added to m_json_serializer in the + * correct order. + * @param cur_root + * @param next_root + * @param path_to_intersection + */ void find_intersection_and_fix_brackets( int32_t cur_root, int32_t next_root, From da90ddb89c75cdc264d73a8937459728c90177ee Mon Sep 17 00:00:00 2001 From: gibber9809 <devinbook1@gmail.com> Date: Wed, 29 May 2024 18:37:17 +0000 Subject: [PATCH 06/11] Address review comment --- components/core/src/clp_s/SchemaReader.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/components/core/src/clp_s/SchemaReader.cpp b/components/core/src/clp_s/SchemaReader.cpp index e6f423b9f..ae3ad1937 100644 --- a/components/core/src/clp_s/SchemaReader.cpp +++ b/components/core/src/clp_s/SchemaReader.cpp @@ -587,13 +587,12 @@ void SchemaReader::generate_json_template(int32_t id) { case NodeType::StructuredArray: { m_json_serializer.add_op(JsonSerializer::Op::BeginArray); m_json_serializer.add_special_key(key); - int32_t global_child_id = m_local_id_to_global_id[child_id]; - auto structured_it = m_global_id_to_unordered_object.find(global_child_id); + auto structured_it = m_global_id_to_unordered_object.find(child_global_id); if (m_global_id_to_unordered_object.end() != structured_it) { size_t column_start = structured_it->second.first; std::span<int32_t> structured_schema = structured_it->second.second; generate_structured_array_template( - global_child_id, + child_global_id, column_start, structured_schema ); From 249f7f64f308c524e2de0ff5e666c6c24cb71a55 Mon Sep 17 00:00:00 2001 From: gibber9809 <devinbook1@gmail.com> Date: Wed, 29 May 2024 19:01:13 +0000 Subject: [PATCH 07/11] Update comment --- components/core/src/clp_s/SchemaReader.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/components/core/src/clp_s/SchemaReader.hpp b/components/core/src/clp_s/SchemaReader.hpp index 7c96ac57e..8597316a6 100644 --- a/components/core/src/clp_s/SchemaReader.hpp +++ b/components/core/src/clp_s/SchemaReader.hpp @@ -226,11 +226,11 @@ class SchemaReader { * previous field which is a child of cur_root, and the next field which is a child of * next_root. * - * For example for the object {"a": {"b":"c"}, "d": {"e":{"f":"g"}} after appending "b" cur_root - * would be "a", and next_root would be "e". (since it is the parent of the next field "f"). - * The current state of the object would look like {"a":{"b":"c" -- to prepare for "f" we would - * add },"d":{"e":{ or in other words close one bracket, add "d" and open bracket, add "e" and - * open bracket. After adding field "f" the current root is "e", and the next root is the + * For example for the object {"a": {"b": "c"}, "d": {"e": {"f": "g"}}} after appending "b" + * cur_root would be "a", and next_root would be "e". (since it is the parent of the next field + * "f"). The current state of the object would look like "a":{"b":"c" -- to prepare for "f" we + * would add },"d":{"e":{ or in other words close one bracket, add "d" and open bracket, add "e" + * and open bracket. After adding field "f" the current root is "e", and the next root is the * original object which is the parent of "a" so we add }}. * * This works by tracing the path between both cur_root and next_root to their nearest common From cf4227029b2e1fcc63bfa8b920fb0bcd777e6d00 Mon Sep 17 00:00:00 2001 From: Devin Gibson <gibber9809@users.noreply.github.com> Date: Mon, 3 Jun 2024 14:32:19 -0400 Subject: [PATCH 08/11] Update components/core/src/clp_s/SchemaReader.cpp Co-authored-by: wraymo <37269683+wraymo@users.noreply.github.com> --- components/core/src/clp_s/SchemaReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp_s/SchemaReader.cpp b/components/core/src/clp_s/SchemaReader.cpp index ae3ad1937..03edebf69 100644 --- a/components/core/src/clp_s/SchemaReader.cpp +++ b/components/core/src/clp_s/SchemaReader.cpp @@ -407,7 +407,7 @@ size_t SchemaReader::generate_structured_array_template( case NodeType::Object: { find_intersection_and_fix_brackets( array_root, - node.get_id(), + global_column_id, path_to_intersection ); for (int j = 0; j < (node.get_depth() - depth); ++j) { From e8235bbc59588ef1541c47b8de1265f164a9c1fb Mon Sep 17 00:00:00 2001 From: gibber9809 <devinbook1@gmail.com> Date: Thu, 6 Jun 2024 16:30:28 +0000 Subject: [PATCH 09/11] Address review comment --- components/core/src/clp_s/ArchiveReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp_s/ArchiveReader.cpp b/components/core/src/clp_s/ArchiveReader.cpp index cc3734a8a..6219f4b42 100644 --- a/components/core/src/clp_s/ArchiveReader.cpp +++ b/components/core/src/clp_s/ArchiveReader.cpp @@ -232,7 +232,7 @@ SchemaReader& ArchiveReader::create_schema_reader( append_unordered_reader_columns( m_schema_reader, column_id, - schema.get_view(i, 0), + std::span<int32_t>(), should_marshal_records ); continue; From 2e0b6dbbf4b6b1eed3195555f0b4668a1ff7955d Mon Sep 17 00:00:00 2001 From: Devin Gibson <gibber9809@users.noreply.github.com> Date: Thu, 6 Jun 2024 14:42:57 -0400 Subject: [PATCH 10/11] Update components/core/src/clp_s/ArchiveReader.cpp Co-authored-by: wraymo <37269683+wraymo@users.noreply.github.com> --- components/core/src/clp_s/ArchiveReader.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/components/core/src/clp_s/ArchiveReader.cpp b/components/core/src/clp_s/ArchiveReader.cpp index 6219f4b42..29e17080a 100644 --- a/components/core/src/clp_s/ArchiveReader.cpp +++ b/components/core/src/clp_s/ArchiveReader.cpp @@ -225,7 +225,8 @@ SchemaReader& ArchiveReader::create_schema_reader( ); i += length; continue; - } else if (i >= schema.get_num_ordered()) { + } + if (i >= schema.get_num_ordered()) { // Length one unordered object that doesn't have a tag. This is only allowed when the // column id is the root of the unordered object, so we can pass it directly to // append_unordered_reader_columns. From e8d82188268a5866cf488e025f5b45ab33b44dc7 Mon Sep 17 00:00:00 2001 From: gibber9809 <devinbook1@gmail.com> Date: Thu, 6 Jun 2024 21:12:38 +0000 Subject: [PATCH 11/11] Fix lint --- components/core/src/clp_s/ArchiveReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp_s/ArchiveReader.cpp b/components/core/src/clp_s/ArchiveReader.cpp index 29e17080a..93f905e3b 100644 --- a/components/core/src/clp_s/ArchiveReader.cpp +++ b/components/core/src/clp_s/ArchiveReader.cpp @@ -225,7 +225,7 @@ SchemaReader& ArchiveReader::create_schema_reader( ); i += length; continue; - } + } if (i >= schema.get_num_ordered()) { // Length one unordered object that doesn't have a tag. This is only allowed when the // column id is the root of the unordered object, so we can pass it directly to