diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h index 3aeaf5282e00bf..181d8e61c9e7e9 100644 --- a/be/src/vec/columns/column.h +++ b/be/src/vec/columns/column.h @@ -220,8 +220,8 @@ class IColumn : public COW { // insert the data of target columns into self column according to positions // positions[i] means index of srcs whitch need to insert_from // the virtual function overhead of multiple calls to insert_from can be reduced to once - void insert_from_multi_column(const std::vector& srcs, - std::vector positions); + virtual void insert_from_multi_column(const std::vector& srcs, + const std::vector& positions) = 0; /// Appends a batch elements from other column with the same type /// Also here should make sure indices_end is bigger than indices_begin @@ -480,14 +480,6 @@ class IColumn : public COW { */ virtual Ptr replicate(const Offsets& offsets) const = 0; - /// Appends one field multiple times. Can be optimized in inherited classes. - // this function has not used ?? - // virtual void insert_many(const Field& field, size_t length) { - // for (size_t i = 0; i < length; ++i) { - // insert(field); - // } - // } - /** Split column to smaller columns. Each value goes to column index, selected by corresponding element of 'selector'. * Selector must contain values from 0 to num_columns - 1. * For default implementation, see column_impl.h @@ -603,23 +595,12 @@ class IColumn : public COW { * To avoid confusion between these cases, we don't have isContiguous method. */ - /// Values in column are represented as continuous memory segment of fixed size. Implies values_have_fixed_size. - virtual bool is_fixed_and_contiguous() const { return false; } - - /// If is_fixed_and_contiguous, returns the underlying data array, otherwise throws an exception. 
virtual StringRef get_raw_data() const { throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR, "Column {} is not a contiguous block of memory", get_name()); return StringRef {}; } - /// If values_have_fixed_size, returns size of value, otherwise throw an exception. - virtual size_t size_of_value_if_fixed() const { - throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR, - "Values of column {} are not fixed size.", get_name()); - return 0; - } - /// Returns ratio of values in column, that are equal to default value of column. /// Checks only @sample_ratio ratio of rows. virtual double get_ratio_of_default_rows(double sample_ratio = 1.0) const { return 0.0; } @@ -633,8 +614,6 @@ class IColumn : public COW { virtual bool is_column_string() const { return false; } - virtual bool is_predicate_column() const { return false; } - virtual bool is_column_string64() const { return false; } virtual bool is_column_decimal() const { return false; } @@ -677,7 +656,6 @@ class IColumn : public COW { String dump_structure() const; // only used in agg value replace for column which is not variable length, eg.BlockReader::_copy_value_data - // ColumnString should replace according to 0,1,2... ,size,0,1,2... 
// usage: self_column.replace_column_data(other_column, other_column's row index, self_column's row index) virtual void replace_column_data(const IColumn&, size_t row, size_t self_row = 0) = 0; // replace data to default value if null, used to avoid null data output decimal check failure @@ -710,6 +688,9 @@ class IColumn : public COW { template void append_data_by_selector_impl(MutablePtr& res, const Selector& selector, size_t begin, size_t end) const; + template + void insert_from_multi_column_impl(const std::vector& srcs, + const std::vector& positions); }; using ColumnPtr = IColumn::Ptr; diff --git a/be/src/vec/columns/column_object.h b/be/src/vec/columns/column_object.h index 306fcdabdaa15b..af8257fbae7394 100644 --- a/be/src/vec/columns/column_object.h +++ b/be/src/vec/columns/column_object.h @@ -525,11 +525,6 @@ class ColumnObject final : public COWHelper { throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR, "get_raw_data" + get_name()); } - size_t size_of_value_if_fixed() const override { - throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR, - "size_of_value_if_fixed" + get_name()); - } - StringRef get_data_at(size_t) const override { throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR, "get_data_at" + get_name()); } diff --git a/be/src/vec/common/schema_util.cpp b/be/src/vec/common/schema_util.cpp index 7e4af423978c33..f8e28a4a500928 100644 --- a/be/src/vec/common/schema_util.cpp +++ b/be/src/vec/common/schema_util.cpp @@ -480,7 +480,8 @@ Status _parse_variant_columns(Block& block, const std::vector& variant_pos, bool is_nullable = column_ref->is_nullable(); const auto& column = remove_nullable(column_ref); const auto& var = assert_cast(*column.get()); - ((ColumnObject&)var).finalize(); + // ColumnObject should be finalized before parsing; finalize may modify the original column structure + const_cast(var).finalize(); MutableColumnPtr variant_column; if (!var.is_scalar_variant()) { diff --git a/be/src/vec/functions/function_cast.h 
b/be/src/vec/functions/function_cast.h index ca3474e18acfe7..9c8d16fc4c8f8b 100644 --- a/be/src/vec/functions/function_cast.h +++ b/be/src/vec/functions/function_cast.h @@ -1858,7 +1858,8 @@ class FunctionCast final : public IFunctionBase { auto& variant = assert_cast(*col_from); ColumnPtr col_to = data_type_to->create_column(); if (!variant.is_finalized()) { - ((ColumnObject&)variant).finalize(); + // ColumnObject should be finalized before parsing; finalize may modify the original column structure + const_cast(variant).finalize(); } // It's important to convert as many elements as possible in this context. For instance, // if the root of this variant column is a number column, converting it to a number column diff --git a/be/src/vec/functions/function_variant_element.cpp b/be/src/vec/functions/function_variant_element.cpp index 187c65c8359136..a94a230cfe1239 100644 --- a/be/src/vec/functions/function_variant_element.cpp +++ b/be/src/vec/functions/function_variant_element.cpp @@ -127,7 +127,9 @@ class FunctionVariantElement : public IFunction { *result = ColumnObject::create(true); // src subcolumns empty but src row count may not be 0 (*result)->assume_mutable()->insert_many_defaults(src.size()); - ((ColumnObject&)(*result)).finalize(); + // ColumnObject should be finalized before parsing; finalize may modify the original column structure + auto& variant = assert_cast(*(*result)); + const_cast(variant).finalize(); return Status::OK(); } if (src.is_scalar_variant() && diff --git a/be/test/vec/columns/column_ip_test.cpp b/be/test/vec/columns/column_ip_test.cpp index 5185fdfedfc8ad..4049a63884ef26 100644 --- a/be/test/vec/columns/column_ip_test.cpp +++ b/be/test/vec/columns/column_ip_test.cpp @@ -135,7 +135,6 @@ TEST_F(ColumnIPTest, FieldTest) { TEST_F(ColumnIPTest, GetRawDataTest) { // insert from data csv and assert insert result MutableColumns ip_cols; - // ip_cols.push_back(column_ipv4->get_ptr()); ip_cols.push_back(column_ipv6->get_ptr()); check_data(ip_cols, 
{serde[1]}, ';', {2}, data_files[0], assert_get_raw_data_callback); } @@ -183,14 +182,6 @@ TEST_F(ColumnIPTest, SizeTest) { check_data(ip_cols, serde, ';', {1, 2}, data_files[0], assert_size_callback); } -TEST_F(ColumnIPTest, SizeOfValueIfFixedTest) { - // insert from data csv and assert insert result - MutableColumns ip_cols; - ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); - check_data(ip_cols, serde, ';', {1, 2}, data_files[0], assert_size_of_value_if_fixed_callback); -} - TEST_F(ColumnIPTest, ByteSizeTest) { // insert from data csv and assert insert result MutableColumns ip_cols; @@ -305,8 +296,8 @@ TEST_F(ColumnIPTest, PermutationAndSortTest) { ip_cols.push_back(column_ipv6->get_ptr()); load_data_from_csv(serde, ip_cols, data_files[0], ';', {1, 2}); // this function will make res not equal ?! - // assertColumnPermutations(column_ipv4->assume_mutable_ref(), dt_ipv4); - // assertColumnPermutations(column_ipv6->assume_mutable_ref(), dt_ipv6); + assertColumnPermutations(column_ipv4->assume_mutable_ref(), dt_ipv4); + assertColumnPermutations(column_ipv6->assume_mutable_ref(), dt_ipv6); // assertColumnPermutations(column_ipv4->assume_mutable_ref(), IndexInRangeUInt32Transform()); // assertColumnPermutations(column_ipv6->assume_mutable_ref(), IndexInRangeUInt64Transform()); } diff --git a/be/test/vec/columns/common_column_test.h b/be/test/vec/columns/common_column_test.h index 4a1c38d185099f..6002c35f07f3ae 100644 --- a/be/test/vec/columns/common_column_test.h +++ b/be/test/vec/columns/common_column_test.h @@ -1098,7 +1098,7 @@ class CommonColumnTest : public ::testing::Test { check_res_file("get_raw_data", res); } - //If is_fixed_and_contiguous, returns the underlying data array, otherwise throws an exception. + //Returns the underlying data array if available, otherwise throws an exception. 
//virtual Int64 //get_int (size_t) const static void assert_get_int_callback(MutableColumns& load_cols, DataTypeSerDeSPtrs serders) { @@ -1138,15 +1138,6 @@ class CommonColumnTest : public ::testing::Test { static void getNameAssert(IColumn& column, const string expect_name) { ASSERT_EQ(expect_name, column.get_name()); } - //virtual const char * - //get_family_name () const =0, same with get_name - static void getFamilyNameAssert(IColumn& column, const string expect_name) { - ASSERT_EQ(expect_name, column.get_family_name()); - } - - static void isFixedAndContiguousAssert(IColumn& column, const bool expect_value) { - ASSERT_EQ(expect_value, column.is_fixed_and_contiguous()); - } // size related we can check from checked file to make sure the size is right // @@ -1177,22 +1168,6 @@ class CommonColumnTest : public ::testing::Test { check_res_file("size", res); } - // assert size_of_value_if_fixed - // Define the custom assert callback function to verify size_of_value_if_fixed behavior - static void assert_size_of_value_if_fixed_callback(MutableColumns& load_cols, - DataTypeSerDeSPtrs serders) { - // just check cols size_of_value_if_fixed is the same as assert_res - std::vector> res; - for (size_t i = 0; i < load_cols.size(); ++i) { - std::vector data; - auto& source_column = load_cols[i]; - auto actual_str_value = std::to_string(source_column->size_of_value_if_fixed()); - data.push_back(actual_str_value); - res.push_back(data); - } - check_res_file("size_of_value_if_fixed", res); - } - // assert byte_size // Define the custom assert callback function to verify byte_size behavior static void assert_byte_size_callback(MutableColumns& load_cols, DataTypeSerDeSPtrs serders) { @@ -1525,8 +1500,10 @@ class CommonColumnTest : public ::testing::Test { } //virtual Ptr - //replicate (const Offsets &offsets) always used in ColumnConst.convert_to_full_column, so - // we should make a situation that the column is not full column, and then we can use replicate to make it full 
column + //replicate (const Offsets &offsets) + // 1. used in ColumnConst.convert_to_full_column, + // we should make a situation that the column is not full column, and then we can use replicate to make it full column + // 2. used in some agg calculate static void assert_replicate_callback(MutableColumns& load_cols, DataTypeSerDeSPtrs serders) { // Create an empty column to verify `replicate` functionality // check replicate with different offsets @@ -1534,44 +1511,47 @@ class CommonColumnTest : public ::testing::Test { std::vector> res; auto option = DataTypeSerDe::FormatOptions(); std::vector check_length = {0, 1, 10, 100, 1000, 10000, 100000}; - // IColumn::Offsets offsets; - // for (size_t i = 0; i < check_length.size(); i++) { - // offsets.push_back(check_length[i]); - // } - for (auto cl = check_length.begin(); cl < check_length.end(); ++cl) { - for (size_t i = 0; i < load_cols.size(); ++i) { - // auto origin_size = load_cols[0]->size(); - // here will heap_use_after_free - // ColumnConst* const_col = ColumnConst::create(load_cols[i]->clone_resized(1), *cl); - auto source_column = load_cols[i]->shrink(*cl); - std::cout << "now we are in replicate column : " << load_cols[i]->get_name() - << " with check length: " << *cl - << " for column size : " << source_column->size() << std::endl; - // auto ptr = const_col->convert_to_full_column(); - auto ptr = source_column->replicate(*cl); + // std::vector check_length = {10, 9}; + // size_t sum = std::reduce(check_length.begin(), check_length.end(), 0, std::plus()); + IColumn::Offsets offsets; + for (size_t i = 0; i < check_length.size(); i++) { + offsets.push_back(check_length[i]); + } + for (size_t i = 0; i < load_cols.size(); ++i) { + // auto origin_size = load_cols[0]->size(); + // here will heap_use_after_free + // ColumnConst* const_col = ColumnConst::create(load_cols[i]->clone_resized(1), *cl); + if (load_cols[i]->size() != check_length.size()) { + EXPECT_ANY_THROW(load_cols[i]->replicate(offsets)); + } + auto 
source_column = load_cols[i]->shrink(check_length.size()); + std::cout << "now we are in replicate column : " << load_cols[i]->get_name() + << " for column size : " << source_column->size() << std::endl; + // auto ptr = const_col->convert_to_full_column(); + // here will return different ptr + auto ptr = source_column->replicate(offsets); + // check ptr + EXPECT_NE(ptr.get(), source_column.get()); + // check after replicate with assert_res + auto ser_col = ColumnString::create(); + ser_col->reserve(ptr->size()); + VectorBufferWriter buffer_writer(*ser_col.get()); + std::vector data; + data.push_back("column: " + source_column->get_name() + + " with generate col size: " + std::to_string(ptr->size())); + for (size_t j = 0; j < ptr->size(); ++j) { // check size - EXPECT_EQ(ptr->size(), *cl); - // check after replicate with assert_res - auto ser_col = ColumnString::create(); - ser_col->reserve(ptr->size()); - VectorBufferWriter buffer_writer(*ser_col.get()); - std::vector data; - data.push_back("column: " + source_column->get_name() + " with check size: " + - std::to_string(*cl) + " with ptr: " + std::to_string(ptr->size())); - for (size_t j = 0; j < ptr->size(); ++j) { - if (auto st = serders[i]->serialize_one_cell_to_json(*ptr, j, buffer_writer, - option); - !st) { - std::cerr << "Failed to serialize column " << i << " at row " << j - << std::endl; - break; - } - buffer_writer.commit(); - std::string actual_str_value = ser_col->get_data_at(j).to_string(); - data.push_back(actual_str_value); + if (auto st = + serders[i]->serialize_one_cell_to_json(*ptr, j, buffer_writer, option); + !st) { + std::cerr << "Failed to serialize column " << i << " at row " << j << std::endl; + break; } - res.push_back(data); + buffer_writer.commit(); + std::string actual_str_value = ser_col->get_data_at(j).to_string(); + data.push_back(actual_str_value); } + res.push_back(data); } check_res_file("replicate", res); } @@ -1886,6 +1866,44 @@ class CommonColumnTest : public ::testing::Test { 
// column size changed calculation: // size, reserve, resize, empty, byte_size, allocated_bytes, clone_resized, get_shrinked_column // cut(LIMIT operation), shrink + + // shrink_padding_chars should only take effect on char-type columns or nested char-type columns; + // other columns are returned unchanged, so the check file content should be the same as the original column. + // It only strips the trailing zeros of char-type columns, which happens in segmentIterator. + // e.g. column_desc: char(6), insert a char(3) value: the value is padded with 3 zeros at the end when written to disk, + // but a select should print just the char(3) value without the padding zeros. + // limit and topN operations will trigger this function call + void shrink_padding_chars_callback(MutableColumns& load_cols, DataTypeSerDeSPtrs serders) { + auto option = DataTypeSerDe::FormatOptions(); + std::vector> res; + for (size_t i = 0; i < load_cols.size(); i++) { + auto& source_column = load_cols[i]; + std::cout << "now we are in shrink_padding_chars column : " << load_cols[i]->get_name() + << " for column size : " << source_column->size() << std::endl; + source_column->shrink_padding_chars(); + // serialize every row after shrink_padding_chars so the result can be compared with the check file + auto ser_col = ColumnString::create(); + ser_col->reserve(source_column->size()); + VectorBufferWriter buffer_writer(*ser_col.get()); + std::vector data; + data.push_back("column: " + source_column->get_name() + + " with shrinked column size: " + std::to_string(source_column->size())); + for (size_t j = 0; j < source_column->size(); ++j) { + if (auto st = serders[i]->serialize_one_cell_to_json(*source_column, j, + buffer_writer, option); + !st) { + std::cerr << "Failed to serialize column " << i << " at row " << j << std::endl; + break; + } + buffer_writer.commit(); + std::string actual_str_value = ser_col->get_data_at(j).to_string(); + data.push_back(actual_str_value); + } + 
res.push_back(data); + } + check_res_file("shrink_padding_chars", res); + } + void sizeAssert(MutableColumnPtr col, size_t expect_size) { EXPECT_EQ(col->size(), expect_size); } @@ -1951,19 +1969,6 @@ class CommonColumnTest : public ::testing::Test { EXPECT_EQ(new_col->size(), expect_size); } - // get_shrinked_column should only happened in char-type column or nested char-type column - // just shrink the end zeros for char-type column which happened in segmentIterator - // eg. column_desc: char(6), insert into char(3), the char(3) will padding the 3 zeros at the end for writing to disk. - // but we select should just print the char(3) without the padding zeros - // limit and topN operation will trigger this function call - void getShrinkedColumnAssert(MutableColumnPtr col, size_t spcific_size_defined) { - EXPECT_TRUE(col->could_shrinked_column()); - auto new_col = col->get_shrinked_column(); - for (size_t i = 0; i < new_col->size(); i++) { - EXPECT_EQ(col->get_data_at(i).size, spcific_size_defined); - } - } - //serialize and deserialize which usually used in AGG function: // serialize_value_into_arena, deserialize_and_insert_from_arena (called by AggregateFunctionDistinctMultipleGenericData, group_array_intersect, nested-types serder like: DataTypeArraySerDe::write_one_cell_to_jsonb) void ser_deserialize_with_arena_impl(MutableColumns& columns, const DataTypes& data_types) { @@ -2104,12 +2109,10 @@ class CommonColumnTest : public ::testing::Test { // now just used in filter column, according to the selector to // select size of row_ids for column by given column, which only used for predict_column and column_dictionary, column_nullable sometimes in Schema::get_predicate_column_ptr() also will return) void filterBySelectorAssert(vectorized::IColumn::MutablePtr col, std::vector selector, - const IDataType& dt, MutableColumnPtr should_sel_col, + DataTypeSerDeSPtr serder, MutableColumnPtr should_sel_col, size_t expect_size) { - // only used in column_nullable and 
predict_column, column_dictionary - EXPECT_TRUE(col->is_nullable() || col->is_column_dictionary() || - col->is_predicate_column()); // for every data type should assert behavior in own UT case + DataTypeSerDe::FormatOptions option; col->clear(); col->insert_many_defaults(should_sel_col->size()); std::cout << "col size:" << col->size() << std::endl; @@ -2117,7 +2120,27 @@ class CommonColumnTest : public ::testing::Test { Status st = col->filter_by_selector(selector.data(), expect_size, sel); EXPECT_EQ(st, Status::OK()); EXPECT_EQ(sel->size(), expect_size); - printColumn(*sel, dt); + std::vector> res; + std::vector data; + auto ser_col = ColumnString::create(); + ser_col->reserve(sel->size()); + VectorBufferWriter buffer_writer(*ser_col.get()); + data.push_back("column: " + col->get_name() + + " with selector: " + std::to_string(*selector.data()) + + " with ptr: " + std::to_string(sel->size())); + for (size_t j = 0; j < sel->size(); ++j) { + if (auto ret = serder->serialize_one_cell_to_json(*sel, j, buffer_writer, option); + !ret) { + std::cerr << "Failed to serialize column " + << " at row " << j << std::endl; + break; + } + buffer_writer.commit(); + std::string actual_str_value = ser_col->get_data_at(j).to_string(); + data.push_back(actual_str_value); + } + res.push_back(data); + check_res_file("filter_by_selector-" + col->get_name(), res); } void assertPermutationsWithLimit(const IColumn::Permutation& lhs,