Skip to content

Commit

Permalink
fix and add test
Browse files Browse the repository at this point in the history
  • Loading branch information
eldenmoon committed Dec 2, 2024
1 parent a267cd5 commit 9564428
Show file tree
Hide file tree
Showing 8 changed files with 89 additions and 14 deletions.
3 changes: 2 additions & 1 deletion be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1021,7 +1021,8 @@ Status VerticalSegmentWriter::_append_block_with_variant_subcolumns(RowsInBlock&
auto full_path = full_path_builder.append(parent_column->name_lower_case(), false)
.append(entry->path.get_parts(), false)
.build();
if (typed_columns.contains(entry->path.get_path())) {
// typed columns take no effect on nested columns
if (typed_columns.contains(entry->path.get_path()) && !entry->path.has_nested_part()) {
TabletColumn typed_column = *typed_columns[entry->path.get_path()];
typed_column.set_path_info(full_path);
typed_column.set_parent_unique_id(parent_column->unique_id());
Expand Down
36 changes: 29 additions & 7 deletions be/src/vec/common/schema_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ TabletColumn get_column_by_type(const vectorized::DataTypePtr& data_type, const
void update_least_schema_internal(const std::map<PathInData, DataTypes>& subcolumns_types,
TabletSchemaSPtr& common_schema, bool update_sparse_column,
int32_t variant_col_unique_id,
const std::map<std::string, TabletColumnPtr>& typed_columns,
std::set<PathInData>* path_set = nullptr) {
PathsInData tuple_paths;
DataTypes tuple_types;
Expand Down Expand Up @@ -291,11 +292,21 @@ void update_least_schema_internal(const std::map<PathInData, DataTypes>& subcolu
// Append all common type columns of this variant
for (int i = 0; i < tuple_paths.size(); ++i) {
TabletColumn common_column;
// const std::string& column_name = variant_col_name + "." + tuple_paths[i].get_path();
get_column_by_type(tuple_types[i], tuple_paths[i].get_path(), common_column,
ExtraInfo {.unique_id = -1,
.parent_unique_id = variant_col_unique_id,
.path_info = tuple_paths[i]});
// the typed path does not contain the root part
auto path_without_root = tuple_paths[i].copy_pop_front().get_path();
if (typed_columns.contains(path_without_root)) {
common_column = *typed_columns.at(path_without_root);
// parent unique id and path info may not be initialized in the write path
common_column.set_parent_unique_id(variant_col_unique_id);
common_column.set_path_info(tuple_paths[i]);
common_column.set_name(tuple_paths[i].get_path());
} else {
// const std::string& column_name = variant_col_name + "." + tuple_paths[i].get_path();
get_column_by_type(tuple_types[i], tuple_paths[i].get_path(), common_column,
ExtraInfo {.unique_id = -1,
.parent_unique_id = variant_col_unique_id,
.path_info = tuple_paths[i]});
}
if (update_sparse_column) {
common_schema->mutable_column_by_uid(variant_col_unique_id)
.append_sparse_column(common_column);
Expand All @@ -311,6 +322,11 @@ void update_least_schema_internal(const std::map<PathInData, DataTypes>& subcolu
void update_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,
TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id,
std::set<PathInData>* path_set) {
std::map<std::string, TabletColumnPtr> typed_columns;
for (const TabletColumnPtr& col :
common_schema->column_by_uid(variant_col_unique_id).get_sub_columns()) {
typed_columns[col->name()] = col;
}
// Types of subcolumns by path from all tuples.
std::map<PathInData, DataTypes> subcolumns_types;
for (const TabletSchemaSPtr& schema : schemas) {
Expand Down Expand Up @@ -341,12 +357,17 @@ void update_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,
}
}
update_least_schema_internal(subcolumns_types, common_schema, false, variant_col_unique_id,
path_set);
typed_columns, path_set);
}

void update_least_sparse_column(const std::vector<TabletSchemaSPtr>& schemas,
TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id,
const std::set<PathInData>& path_set) {
std::map<std::string, TabletColumnPtr> typed_columns;
for (const TabletColumnPtr& col :
common_schema->column_by_uid(variant_col_unique_id).get_sub_columns()) {
typed_columns[col->name()] = col;
}
// Types of subcolumns by path from all tuples.
std::map<PathInData, DataTypes> subcolumns_types;
for (const TabletSchemaSPtr& schema : schemas) {
Expand All @@ -365,7 +386,8 @@ void update_least_sparse_column(const std::vector<TabletSchemaSPtr>& schemas,
}
}
}
update_least_schema_internal(subcolumns_types, common_schema, true, variant_col_unique_id);
update_least_schema_internal(subcolumns_types, common_schema, true, variant_col_unique_id,
typed_columns);
}

void inherit_column_attributes(const TabletColumn& source, TabletColumn& target,
Expand Down
2 changes: 1 addition & 1 deletion be/src/vec/data_types/serde/data_type_object_serde.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ Status DataTypeObjectSerDe::write_one_cell_to_json(
continue;
}
rapidjson::Value key;
key.SetString(entry->path.get_path().data(), (uint32_t) entry->path.get_path().size());
key.SetString(entry->path.get_path().data(), (uint32_t)entry->path.get_path().size());
rapidjson::Value val;
RETURN_IF_ERROR(subtype_serde->write_one_cell_to_json(subcolumn, val, allocator, mem_pool,
row_num, subtype));
Expand Down
2 changes: 1 addition & 1 deletion be/src/vec/data_types/serde/data_type_serde.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ Status DataTypeSerDe::write_one_cell_to_json(const IColumn& column, rapidjson::V
// allocate memory to prevent from heap use after free
void* mem = allocator.Malloc(str_rep.size());
memcpy(mem, str_rep.data(), str_rep.size());
result.SetString((const char*)mem, (uint32_t) str_rep.size());
result.SetString((const char*)mem, (uint32_t)str_rep.size());
return Status::OK();
}

Expand Down
19 changes: 18 additions & 1 deletion regression-test/data/variant_p0/predefine/load.out
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,23 @@ v1.predefine_col2 double Yes false \N NONE
v1.predefine_col3 text Yes false \N NONE
v1.predefine_col4 text Yes false \N NONE
v2.dcm decimal(9,0) Yes false \N NONE
v3.dcm int Yes false \N NONE
v3.dcm decimal(9,0) Yes false \N NONE
v3.dt datetime Yes false \N NONE

-- !sql --
1 {"nested":[{"a":123,"b":"456"}]}
1 {"nested":[{"a":123,"b":"456"}]}

-- !sql --
[{"a":123,"b":"456"}]
[{"a":123,"b":"456"}]

-- !sql --
\N
\N

-- !sql --
id bigint No true \N
v variant<auto_type:int> Yes false \N NONE
v.auto_type int Yes false \N NONE

2 changes: 1 addition & 1 deletion regression-test/data/variant_p0/predefine/sql/q01.out
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,5 @@ v1.int_ int Yes false \N NONE
v1.ipv4_ ipv4 Yes false \N NONE
v1.ipv6_ ipv6 Yes false \N NONE
v1.string_ text Yes false \N NONE
v1.varchar_ text Yes false \N NONE
v1.varchar_ varchar(65533) Yes false \N NONE

7 changes: 7 additions & 0 deletions regression-test/suites/variant_github_events_p2/load.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -236,4 +236,11 @@ suite("regression_test_variant_github_events_p2", "nonConcurrent,p2"){
// query with inverted index
qt_sql """select cast(v["payload"]["pull_request"]["additions"] as int) from github_events where v["repo"]["name"] match 'xpressengine' order by 1;"""
qt_sql """select count() from github_events where v["repo"]["name"] match 'apache' order by 1;"""

// specify schema
// sql "alter table github_events2 modify column v variant<`payload.comment.id`:int,`payload.commits.url`:text,`payload.forkee.has_pages`:tinyint>"
// load_json_data.call("github_events2", """${getS3Url() + '/regression/gharchive.m/2022-11-07-23.json'}""")
// qt_sql "select * from github_events2 WHERE 1=1 ORDER BY k DESC LIMIT 10"
// qt_sql "select v['payload']['commits'] from github_events2 WHERE 1=1 ORDER BY k DESC LIMIT 10"
// qt_sql "select v['payload']['commits']['url'] from github_events2 WHERE 1=1 ORDER BY k DESC LIMIT 10"
}
32 changes: 30 additions & 2 deletions regression-test/suites/variant_p0/predefine/load.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ suite("regression_test_variant_predefine_schema", "p0"){
`v1` variant NULL,
INDEX idx_var_sub(`v1`) USING INVERTED PROPERTIES("parser" = "english", "sub_column_path" = "a.b.c") )
ENGINE=OLAP DUPLICATE KEY(`id`) DISTRIBUTED BY HASH(`id`) BUCKETS 2
PROPERTIES ( "replication_allocation" = "tag.location.default: 1");
PROPERTIES ( "replication_allocation" = "tag.location.default: 1", "variant_enable_flatten_nested" = "true");
"""
sql """insert into test_predefine1 values(1, '{"predefine_col1" : 1024}')"""
sql """insert into test_predefine1 values(2, '{"predefine_col2" : 1.11111}')"""
Expand Down Expand Up @@ -213,5 +213,33 @@ suite("regression_test_variant_predefine_schema", "p0"){
qt_sql "desc test_predefine1"
sql "alter table test_predefine1 drop column v3"

sql """insert into test_predefine1 select id, v1, v1 from test_predefine2"""
sql "DROP TABLE IF EXISTS test_predefine3"
sql """CREATE TABLE `test_predefine3` (
`id` bigint NOT NULL,
`v` variant NULL)
ENGINE=OLAP DUPLICATE KEY(`id`) DISTRIBUTED BY HASH(`id`) BUCKETS 1
PROPERTIES ( "replication_allocation" = "tag.location.default: 1");"""

// test that altering a nested path has no effect at present
sql "truncate table test_predefine3"
sql """insert into test_predefine3 values (1, '{"nested" : [{"a" : 123, "b" : "456"}]}')"""
sql "alter table test_predefine3 modify column v variant<`nested.a`: string>"
sql """insert into test_predefine3 values (1, '{"nested" : [{"a" : 123, "b" : "456"}]}')"""
qt_sql "select * from test_predefine3"
qt_sql "select v['nested'] from test_predefine3"
qt_sql "select v['nested']['a'] from test_predefine3"

// test using automatic type detection first, then altering the column to modify the type
sql "truncate table test_predefine3"
sql """insert into test_predefine3 values (1, '{"auto_type" : 1234.1111}')"""
sql "alter table test_predefine3 modify column v variant<`auto_type`: int>"
sql """insert into test_predefine3 values (1, '{"auto_type" : "124511111"}')"""
sql """insert into test_predefine3 values (1, '{"auto_type" : 1111122334}')"""
sql """insert into test_predefine3 values (1, '{"auto_type" : 111223341111}')"""
sql """insert into test_predefine3 values (1, '{"auto_type" : true}')"""
sql """insert into test_predefine3 values (1, '{"auto_type" : 1}')"""
sql """insert into test_predefine3 values (1, '{"auto_type" : 256}')"""
sql """insert into test_predefine3 values (1, '{"auto_type" : 12345}')"""
sql """insert into test_predefine3 values (1, '{"auto_type" : 1.0}')"""
qt_sql """desc test_predefine3"""
}

0 comments on commit 9564428

Please sign in to comment.