From 272368a47a2e3cc5f067e0b1c26def6da86b03aa Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Thu, 22 Aug 2024 18:46:36 +0800 Subject: [PATCH 01/11] improve two funcs --- dbms/src/Functions/FunctionsString.cpp | 52 ++++++++++--------- .../Functions/tests/gtest_strings_ascii.cpp | 12 ++--- .../Functions/tests/gtest_strings_length.cpp | 8 +-- libs/libcommon/include/common/StringRef.h | 9 ++-- 4 files changed, 41 insertions(+), 40 deletions(-) diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index f5b35f802cd..0a91f381876 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -4474,6 +4474,8 @@ class FunctionASCII : public IFunction std::string getName() const override { return name; } size_t getNumberOfArguments() const override { return 1; } + bool useDefaultImplementationForConstants() const override { return true; } + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if (arguments.size() != 1) @@ -4490,24 +4492,23 @@ class FunctionASCII : public IFunction void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { const IColumn * c0_col = block.getByPosition(arguments[0]).column.get(); - const auto * c0_const = checkAndGetColumn(c0_col); const auto * c0_string = checkAndGetColumn(c0_col); - - Field res_field; - int val_num = c0_col->size(); - auto col_res = ColumnInt64::create(); - col_res->reserve(val_num); - if (c0_const == nullptr && c0_string == nullptr) + if (c0_string == nullptr) throw Exception( fmt::format("Illegal argument of function {}", getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - for (int i = 0; i < val_num; i++) + size_t val_num = c0_col->size(); + auto col_res = ColumnInt64::create(); + col_res->reserve(val_num); + + for (size_t i = 0; i < val_num; i++) { - c0_col->get(i, res_field); - String handled_str = res_field.get(); - Int64 res = handled_str.empty() ? 0 : static_cast(handled_str[0]); - col_res->insert(res); + const StringRef data_str = c0_string->getDataAt(i); + if likely (data_str.size != 0) + col_res->insert(data_str.data[0]); + else + col_res->insert(0); } block.getByPosition(result).column = std::move(col_res); @@ -4527,6 +4528,8 @@ class FunctionLength : public IFunction std::string getName() const override { return name; } size_t getNumberOfArguments() const override { return 1; } + bool useDefaultImplementationForConstants() const override { return true; } + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if (arguments.size() != 1) @@ -4543,24 +4546,23 @@ class FunctionLength : public IFunction void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { const IColumn * c0_col = block.getByPosition(arguments[0]).column.get(); - const auto * c0_const = checkAndGetColumn(c0_col); const auto * c0_string = checkAndGetColumn(c0_col); - - Field res_field; - int val_num = c0_col->size(); - auto col_res = ColumnInt64::create(); - col_res->reserve(val_num); - if (c0_const == nullptr && c0_string == nullptr) + if (c0_string == nullptr) throw Exception( fmt::format("Illegal argument of function {}", getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - for (int i = 0; i < val_num; i++) - { - c0_col->get(i, res_field); - String handled_str = res_field.get(); - col_res->insert(static_cast(handled_str.size())); - } + size_t val_num = c0_col->size(); + auto col_res = ColumnInt64::create(); + col_res->reserve(val_num); + + const auto & offsets = c0_string->getOffsets(); + + if (val_num > 0) + col_res->insert(offsets[0] - 1); + + for (size_t i = 1; i < val_num; i++) + col_res->insert(offsets[i] - offsets[i-1] - 1); block.getByPosition(result).column = std::move(col_res); } diff --git a/dbms/src/Functions/tests/gtest_strings_ascii.cpp b/dbms/src/Functions/tests/gtest_strings_ascii.cpp index f01b76a451f..6bbc4260266 100644 --- a/dbms/src/Functions/tests/gtest_strings_ascii.cpp +++ b/dbms/src/Functions/tests/gtest_strings_ascii.cpp @@ -73,15 +73,15 @@ TEST_F(StringASCII, strAndStrTest) test_block.insert({nullptr, func->getReturnType(), "res"}); func->execute(test_block, cns, 1); const IColumn * res = test_block.getByPosition(1).column.get(); - const ColumnInt64 * res_string = checkAndGetColumn(res); + const auto * actual_res = checkAndGetColumn(res); Field res_field; - std::vector results{104, 72, 50, 35, 0}; - for (size_t t = 0; t < results.size(); t++) + std::vector expect_results{104, 72, 50, 35, 0}; + for (size_t t = 0; t < expect_results.size(); t++) { - res_string->get(t, res_field); + actual_res->get(t, res_field); Int64 res_val = res_field.get(); - EXPECT_EQ(results[t], res_val); + EXPECT_EQ(expect_results[t], res_val); } } } @@ -133,7 +133,7 @@ TEST_F(StringASCII, nullTest) MutableColumnPtr mutable_result_null_map_column = (*std::move(result_null_map_column)).mutate(); NullMap & result_null_map = static_cast(*mutable_result_null_map_column).getData(); const IColumn * res = test_block.getByPosition(1).column.get(); - const ColumnNullable * res_nullable_string = checkAndGetColumn(res); + const auto * res_nullable_string = checkAndGetColumn(res); const IColumn & res_string = res_nullable_string->getNestedColumn(); Field res_field; diff --git a/dbms/src/Functions/tests/gtest_strings_length.cpp b/dbms/src/Functions/tests/gtest_strings_length.cpp index c8d41e335e0..ddf8e19c3d8 100644 --- a/dbms/src/Functions/tests/gtest_strings_length.cpp +++ b/dbms/src/Functions/tests/gtest_strings_length.cpp @@ -45,8 +45,8 @@ TEST_F(StringLength, strAndStrTest) auto & factory = FunctionFactory::instance(); - std::vector strs{"hi~", "23333", "pingcap", "你好", "233哈哈", ""}; - std::vector results{3, 5, 7, 6, 9, 0}; + std::vector strs{"hi~", "23333", "pingcap", "你好", "233哈哈", "", "asdの的", "ヽ( ̄▽ ̄)و"}; + std::vector results{3, 5, 7, 6, 9, 0, 9, 16}; for (int i = 0; i < 2; i++) { @@ -74,7 +74,7 @@ TEST_F(StringLength, strAndStrTest) test_block.insert({nullptr, func->getReturnType(), "res"}); func->execute(test_block, cns, 1); const IColumn * res = test_block.getByPosition(1).column.get(); - const ColumnInt64 * res_string = checkAndGetColumn(res); + const auto * res_string = checkAndGetColumn(res); Field res_field; @@ -134,7 +134,7 @@ TEST_F(StringLength, nullTest) MutableColumnPtr mutable_result_null_map_column = (*std::move(result_null_map_column)).mutate(); NullMap & result_null_map = static_cast(*mutable_result_null_map_column).getData(); const IColumn * res = test_block.getByPosition(1).column.get(); - const ColumnNullable * res_nullable_string = checkAndGetColumn(res); + const auto * res_nullable_string = checkAndGetColumn(res); const IColumn & res_string = res_nullable_string->getNestedColumn(); Field res_field; diff --git a/libs/libcommon/include/common/StringRef.h b/libs/libcommon/include/common/StringRef.h index 52eb8b32860..a87b54a7670 100644 --- a/libs/libcommon/include/common/StringRef.h +++ b/libs/libcommon/include/common/StringRef.h @@ -24,7 +24,6 @@ #include #include #include -#include // for std::logic_error #include #include @@ -171,8 +170,8 @@ inline size_t hashLessThan8(const char * data, size_t size) { if (size > 8) { - UInt64 a = unalignedLoad(data); - UInt64 b = unalignedLoad(data + size - 8); + auto a = unalignedLoad(data); + auto b = unalignedLoad(data + size - 8); return hashLen16(a, rotateByAtLeast1(b + size, size)) ^ b; } @@ -199,13 +198,13 @@ struct CRC32Hash do { - UInt64 word = unalignedLoad(pos); + auto word = unalignedLoad(pos); res = _mm_crc32_u64(res, word); pos += 8; } while (pos + 8 < end); - UInt64 word = unalignedLoad(end - 8); /// I'm not sure if this is normal. + auto word = unalignedLoad(end - 8); /// I'm not sure if this is normal. res = _mm_crc32_u64(res, word); return res; From 2d4172934f8500eacb676c76d414a29e8da490b3 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Thu, 22 Aug 2024 19:13:39 +0800 Subject: [PATCH 02/11] refine position --- dbms/src/Functions/FunctionsNull.h | 2 +- dbms/src/Functions/FunctionsString.cpp | 43 ++++++++----------- dbms/src/Functions/GatherUtils/Algorithms.h | 33 +++++++------- .../Functions/tests/gtest_strings_ascii.cpp | 2 +- .../tests/gtest_strings_position.cpp | 6 +-- 5 files changed, 41 insertions(+), 45 deletions(-) diff --git a/dbms/src/Functions/FunctionsNull.h b/dbms/src/Functions/FunctionsNull.h index 7e37305d300..046beb380fe 100644 --- a/dbms/src/Functions/FunctionsNull.h +++ b/dbms/src/Functions/FunctionsNull.h @@ -64,7 +64,7 @@ class FunctionCoalesce : public IFunction public: static constexpr auto name = "coalesce"; static FunctionPtr create(const Context & context); - FunctionCoalesce(const Context & context) + explicit FunctionCoalesce(const Context & context) : context(context) {} diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index 0a91f381876..01f0d9b43b7 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -4478,7 +4478,7 @@ class FunctionASCII : public IFunction DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - if (arguments.size() != 1) + if unlikely (arguments.size() != 1) throw Exception( fmt::format( "Number of arguments for function {} doesn't match: passed {}, should be 1.", @@ -4493,7 +4493,7 @@ class FunctionASCII : public IFunction { const IColumn * c0_col = block.getByPosition(arguments[0]).column.get(); const auto * c0_string = checkAndGetColumn(c0_col); - if (c0_string == nullptr) + if unlikely (c0_string == nullptr) throw Exception( fmt::format("Illegal argument of function {}", getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); @@ -4532,7 +4532,7 @@ class FunctionLength : public IFunction DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - if (arguments.size() != 1) + if unlikely (arguments.size() != 1) throw Exception( fmt::format( "Number of arguments for function {} doesn't match: passed {}, should be 1.", @@ -4547,7 +4547,7 @@ class FunctionLength : public IFunction { const IColumn * c0_col = block.getByPosition(arguments[0]).column.get(); const auto * c0_string = checkAndGetColumn(c0_col); - if (c0_string == nullptr) + if unlikely (c0_string == nullptr) throw Exception( fmt::format("Illegal argument of function {}", getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); @@ -4560,9 +4560,9 @@ class FunctionLength : public IFunction if (val_num > 0) col_res->insert(offsets[0] - 1); - + for (size_t i = 1; i < val_num; i++) - col_res->insert(offsets[i] - offsets[i-1] - 1); + col_res->insert(offsets[i] - offsets[i - 1] - 1); block.getByPosition(result).column = std::move(col_res); } @@ -4957,7 +4957,7 @@ class FunctionPosition : public IFunction DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - if (arguments.size() != 2) + if unlikely (arguments.size() != 2) throw Exception( fmt::format( "Number of arguments for function {} doesn't match: passed {}, should be 2.", @@ -4970,37 +4970,32 @@ class FunctionPosition : public IFunction void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { - const IColumn * c0_col = block.getByPosition(arguments[0]).column.get(); - const auto * c0_const = checkAndGetColumn(c0_col); - const auto * c0_string = checkAndGetColumn(c0_col); - Field c0_field; + const IColumn * col0 = block.getByPosition(arguments[0]).column.get(); + const auto * c0_const = checkAndGetColumn(col0); + const auto * c0_string = checkAndGetColumn(col0); - const IColumn * c1_col = block.getByPosition(arguments[1]).column.get(); - const auto * c1_const = checkAndGetColumn(c1_col); - const auto * c1_string = checkAndGetColumn(c1_col); - Field c1_field; + const IColumn * col1 = block.getByPosition(arguments[1]).column.get(); + const auto * c1_const = checkAndGetColumn(col1); + const auto * c1_string = checkAndGetColumn(col1); - if ((c0_const == nullptr && c0_string == nullptr) || (c1_const == nullptr && c1_string == nullptr)) + if unlikely ((c0_const == nullptr && c0_string == nullptr) || (c1_const == nullptr && c1_string == nullptr)) throw Exception( fmt::format("Illegal argument of function {}", getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - if (c0_col->size() != c1_col->size()) + if unlikely (col0->size() != col1->size()) throw Exception( - fmt::format("Function {} column number is inconformity", getName()), + fmt::format("Row number in of columns in function {} is inconformity", getName()), ErrorCodes::LOGICAL_ERROR); auto col_res = ColumnInt64::create(); - int val_num = c0_col->size(); + int val_num = col0->size(); col_res->reserve(val_num); for (int i = 0; i < val_num; i++) { - c0_col->get(i, c0_field); - c1_col->get(i, c1_field); - - String c0_str = c0_field.get(); - String c1_str = c1_field.get(); + const String c0_str = col0->getDataAt(i).toString(); + const String c1_str = col1->getDataAt(i).toString(); // return -1 when c1_str not contains the c0_str Int64 idx = c1_str.find(c0_str); diff --git a/dbms/src/Functions/GatherUtils/Algorithms.h b/dbms/src/Functions/GatherUtils/Algorithms.h index 576616c6485..f08dbf8fead 100644 --- a/dbms/src/Functions/GatherUtils/Algorithms.h +++ b/dbms/src/Functions/GatherUtils/Algorithms.h @@ -192,7 +192,7 @@ void concat(const std::vector> & array_sources, Si size_t sources_num = array_sources.size(); std::vector is_const(sources_num); - auto checkAndGetSizeToReserve = [](auto source, IArraySource * array_source) { + auto check_and_get_size_to_reserve = [](auto source, IArraySource * array_source) { if (source == nullptr) throw Exception( "Concat function expected " + demangle(typeid(Source).name()) + " or " @@ -205,17 +205,18 @@ void concat(const std::vector> & array_sources, Si size_t size_to_reserve = 0; for (auto i : ext::range(0, sources_num)) { - auto & source = array_sources[i]; + const auto & source = array_sources[i]; is_const[i] = source->isConst(); if (is_const[i]) - size_to_reserve += checkAndGetSizeToReserve(typeid_cast *>(source.get()), source.get()); + size_to_reserve + += check_and_get_size_to_reserve(typeid_cast *>(source.get()), source.get()); else - size_to_reserve += checkAndGetSizeToReserve(typeid_cast(source.get()), source.get()); + size_to_reserve += check_and_get_size_to_reserve(typeid_cast(source.get()), source.get()); } sink.reserve(size_to_reserve); - auto writeNext = [&sink](auto source) { + auto write_next = [&sink](auto source) { writeSlice(source->getWhole(), sink); source->next(); }; @@ -224,11 +225,11 @@ void concat(const std::vector> & array_sources, Si { for (auto i : ext::range(0, sources_num)) { - auto & source = array_sources[i]; + const auto & source = array_sources[i]; if (is_const[i]) - writeNext(static_cast *>(source.get())); + write_next(static_cast *>(source.get())); else - writeNext(static_cast(source.get())); + write_next(static_cast(source.get())); } sink.next(); } @@ -389,11 +390,11 @@ void NO_INLINE pad(SourceA && src, SourceB && padding, Sink && sink, ssize_t len size_t left = static_cast(length) - slice.size; if (is_left) { - StringSource::Slice padSlice = padding.getWhole(); - while (left > padSlice.size && padSlice.size != 0) + StringSource::Slice pad_slice = padding.getWhole(); + while (left > pad_slice.size && pad_slice.size != 0) { - writeSlice(padSlice, sink); - left -= padSlice.size; + writeSlice(pad_slice, sink); + left -= pad_slice.size; } writeSlice(padding.getSliceFromLeft(0, left), sink); @@ -402,11 +403,11 @@ void NO_INLINE pad(SourceA && src, SourceB && padding, Sink && sink, ssize_t len else { writeSlice(slice, sink); - StringSource::Slice padSlice = padding.getWhole(); - while (left > padSlice.size && padSlice.size != 0) + StringSource::Slice pad_slice = padding.getWhole(); + while (left > pad_slice.size && pad_slice.size != 0) { - writeSlice(padSlice, sink); - left -= padSlice.size; + writeSlice(pad_slice, sink); + left -= pad_slice.size; } writeSlice(padding.getSliceFromLeft(0, left), sink); diff --git a/dbms/src/Functions/tests/gtest_strings_ascii.cpp b/dbms/src/Functions/tests/gtest_strings_ascii.cpp index 6bbc4260266..c7888f85c07 100644 --- a/dbms/src/Functions/tests/gtest_strings_ascii.cpp +++ b/dbms/src/Functions/tests/gtest_strings_ascii.cpp @@ -151,4 +151,4 @@ TEST_F(StringASCII, nullTest) } } // namespace tests -} // namespace DB \ No newline at end of file +} // namespace DB diff --git a/dbms/src/Functions/tests/gtest_strings_position.cpp b/dbms/src/Functions/tests/gtest_strings_position.cpp index 5e5106f7231..65d83052aa1 100644 --- a/dbms/src/Functions/tests/gtest_strings_position.cpp +++ b/dbms/src/Functions/tests/gtest_strings_position.cpp @@ -93,7 +93,7 @@ TEST_F(StringPosition, strAndStrTest) bp->build(ctns)->execute(test_block, cns, 2); const IColumn * res = test_block.getByPosition(2).column.get(); - const ColumnInt64 * res_string = checkAndGetColumn(res); + const auto * res_string = checkAndGetColumn(res); Field res_field; @@ -162,7 +162,7 @@ TEST_F(StringPosition, utf8StrAndStrTest) bp->build(ctns)->execute(test_block, cns, 2); const IColumn * res = test_block.getByPosition(2).column.get(); - const ColumnInt64 * res_string = checkAndGetColumn(res); + const auto * res_string = checkAndGetColumn(res); Field res_field; @@ -236,7 +236,7 @@ TEST_F(StringPosition, nullTest) MutableColumnPtr mutable_result_null_map_column = (*std::move(result_null_map_column)).mutate(); NullMap & result_null_map = static_cast(*mutable_result_null_map_column).getData(); const IColumn * res = test_block.getByPosition(2).column.get(); - const ColumnNullable * res_nullable_string = checkAndGetColumn(res); + const auto * res_nullable_string = checkAndGetColumn(res); const IColumn & res_string = res_nullable_string->getNestedColumn(); Field res_field; From 2f2160042bbf23129bc0d7bd6a1511c0f854d730 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 27 Aug 2024 16:04:25 +0800 Subject: [PATCH 03/11] add tests --- dbms/src/Functions/FunctionsString.cpp | 69 +++++-- .../tests/gtest_strings_position.cpp | 194 ++++++++---------- 2 files changed, 142 insertions(+), 121 deletions(-) diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index 01f0d9b43b7..7f62332c0e6 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -35,6 +35,8 @@ #include #include +#include "Columns/IColumn.h" + namespace DB { namespace ErrorCodes @@ -4463,6 +4465,13 @@ class PadUTF8Impl : public IFunction } }; +static Int64 getResult(const ColumnString::Chars_t & chars, size_t size, size_t offset) +{ + if unlikely (size == 0) + return 0; + return chars[offset]; +} + class FunctionASCII : public IFunction { public: @@ -4502,13 +4511,19 @@ class FunctionASCII : public IFunction auto col_res = ColumnInt64::create(); col_res->reserve(val_num); - for (size_t i = 0; i < val_num; i++) + const auto & chars = c0_string->getChars(); + const auto & offsets = c0_string->getOffsets(); + + if (val_num > 0) { - const StringRef data_str = c0_string->getDataAt(i); - if likely (data_str.size != 0) - col_res->insert(data_str.data[0]); - else - col_res->insert(0); + size_t size = offsets[0] - 1; + col_res->insert(getResult(chars, size, 0)); + } + + for (size_t i = 1; i < val_num; i++) + { + auto size = offsets[i] - offsets[i - 1] - 1; + col_res->insert(getResult(chars, size, offsets[i - 1])); } block.getByPosition(result).column = std::move(col_res); @@ -4955,6 +4970,8 @@ class FunctionPosition : public IFunction std::string getName() const override { return name; } size_t getNumberOfArguments() const override { return 2; } + bool useDefaultImplementationForConstants() const override { return true; } + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if unlikely (arguments.size() != 2) @@ -4992,26 +5009,52 @@ class FunctionPosition : public IFunction int val_num = col0->size(); col_res->reserve(val_num); + if (val_num == 0) + { + block.getByPosition(result).column = std::move(col_res); + return; + } + + StringRef c0_str; + if (c0_const != nullptr) + c0_str = col0->getDataAt(0); + for (int i = 0; i < val_num; i++) { - const String c0_str = col0->getDataAt(i).toString(); - const String c1_str = col1->getDataAt(i).toString(); + if (c0_const == nullptr) + c0_str = col0->getDataAt(i); + + const StringRef c1_str = col1->getDataAt(i); + + if unlikely (c0_str.size == 0) + { + col_res->insert(1); + continue; + } + + if unlikely (c1_str.size == 0) + { + col_res->insert(0); + continue; + } - // return -1 when c1_str not contains the c0_str - Int64 idx = c1_str.find(c0_str); - col_res->insert(getPositionUTF8(c1_str, idx)); + VolnitskyCaseInsensitiveUTF8 searcher(c0_str.data, c0_str.size, c1_str.size); + const char * res = searcher.search(c1_str.data, c1_str.size); + Int64 idx = res - c1_str.data; + if (idx >= static_cast(c1_str.size)) + idx = -1; + col_res->insert(getPositionUTF8(reinterpret_cast(c1_str.data), idx)); } block.getByPosition(result).column = std::move(col_res); } private: - static Int64 getPositionUTF8(const String & c1_str, Int64 idx) + static Int64 getPositionUTF8(const UInt8 * data, Int64 idx) { if (idx == -1) return 0; - const auto * data = reinterpret_cast(c1_str.data()); return static_cast(UTF8::countCodePoints(data, idx) + 1); } }; diff --git a/dbms/src/Functions/tests/gtest_strings_position.cpp b/dbms/src/Functions/tests/gtest_strings_position.cpp index 65d83052aa1..a6f49a773de 100644 --- a/dbms/src/Functions/tests/gtest_strings_position.cpp +++ b/dbms/src/Functions/tests/gtest_strings_position.cpp @@ -42,136 +42,114 @@ class StringPosition : public DB::tests::FunctionTest TEST_F(StringPosition, strAndStrTest) { const auto context = TiFlashTestEnv::getContext(); - auto & factory = FunctionFactory::instance(); - // case insensitive std::vector c0_var_strs{"ell", "LL", "3", "ElL", "ye", "aaaa", "world", "", "", "biu"}; std::vector c1_var_strs{"hello", "HELLO", "23333", "HeLlO", "hey", "a", "WoRlD", "", "ping", ""}; - - // var-var - std::vector result0{2, 3, 2, 0, 0, 0, 0, 1, 1, 0}; + std::vector result0{2, 3, 2, 2, 0, 0, 1, 1, 1, 0}; std::vector c0_strs; std::vector c1_strs; - std::vector results; - for (int i = 0; i < 4; i++) - { - MutableColumnPtr csp0; - MutableColumnPtr csp1; + std::vector expect_results; - // var-var - c0_strs = c0_var_strs; - c1_strs = c1_var_strs; - results = result0; + MutableColumnPtr csp0; + MutableColumnPtr csp1; - csp0 = ColumnString::create(); - csp1 = ColumnString::create(); + c0_strs = c0_var_strs; + c1_strs = c1_var_strs; + expect_results = result0; - for (size_t j = 0; j < c0_strs.size(); j++) - { - csp0->insert(Field(c0_strs[j].c_str(), c0_strs[j].size())); - csp1->insert(Field(c1_strs[j].c_str(), c1_strs[j].size())); - } + csp0 = ColumnString::create(); + csp1 = ColumnString::create(); - Block test_block; - ColumnWithTypeAndName ctn0 - = ColumnWithTypeAndName(std::move(csp0), std::make_shared(), "test_position_0"); - ColumnWithTypeAndName ctn1 - = ColumnWithTypeAndName(std::move(csp1), std::make_shared(), "test_position_1"); - ColumnsWithTypeAndName ctns{ctn0, ctn1}; - test_block.insert(ctn0); - test_block.insert(ctn1); - // for result from position - test_block.insert({}); - ColumnNumbers cns{0, 1}; - - // test position - auto bp = factory.tryGet("position", *context); - ASSERT_TRUE(bp != nullptr); - ASSERT_FALSE(bp->isVariadic()); - - bp->build(ctns)->execute(test_block, cns, 2); - const IColumn * res = test_block.getByPosition(2).column.get(); - const auto * res_string = checkAndGetColumn(res); - - Field res_field; - - for (size_t t = 0; t < results.size(); t++) - { - res_string->get(t, res_field); - Int64 res_val = res_field.get(); - EXPECT_EQ(results[t], res_val); - } + for (size_t i = 0; i < c0_strs.size(); i++) + { + csp0->insert(Field(c0_strs[i].c_str(), c0_strs[i].size())); + csp1->insert(Field(c1_strs[i].c_str(), c1_strs[i].size())); } -} -// test string and string in utf8 -TEST_F(StringPosition, utf8StrAndStrTest) -{ - const auto context = TiFlashTestEnv::getContext(); + Block test_block; + ColumnWithTypeAndName ctn0 + = ColumnWithTypeAndName(std::move(csp0), std::make_shared(), "test_position_0"); + ColumnWithTypeAndName ctn1 + = ColumnWithTypeAndName(std::move(csp1), std::make_shared(), "test_position_1"); + ColumnsWithTypeAndName ctns{ctn0, ctn1}; + test_block.insert(ctn0); + test_block.insert(ctn1); + + // for result from position + test_block.insert({}); + ColumnNumbers cns{0, 1}; - auto & factory = FunctionFactory::instance(); + // test position + auto bp = factory.tryGet("position", *context); + ASSERT_TRUE(bp != nullptr); + ASSERT_FALSE(bp->isVariadic()); - // case insensitive - std::vector c0_var_strs{"好", "平凯", "aa哈", "?!", "呵呵呵", "233", "嗯??"}; - std::vector c1_var_strs{"ni好", "平凯星辰", "啊啊aaa哈哈", "??!!", "呵呵呵", "哈哈2333", "嗯?"}; + bp->build(ctns)->execute(test_block, cns, 2); + const IColumn * res = test_block.getByPosition(2).column.get(); + const auto * res_string = checkAndGetColumn(res); - // var-var - std::vector result0{3, 1, 4, 2, 1, 3, 0}; + Field res_field; - std::vector c0_strs; - std::vector c1_strs; - std::vector results; - for (int i = 0; i < 4; i++) + for (size_t i = 0; i < expect_results.size(); i++) { - MutableColumnPtr csp0; - MutableColumnPtr csp1; - - // var-var - c0_strs = c0_var_strs; - c1_strs = c1_var_strs; - results = result0; + res_string->get(i, res_field); + Int64 res_val = res_field.get(); + EXPECT_EQ(expect_results[i], res_val); + } +} - csp0 = ColumnString::create(); - csp1 = ColumnString::create(); +// test string and string in utf8 +TEST_F(StringPosition, utf8StrAndStrTest) +{ + { + // const const + ASSERT_COLUMN_EQ( + createConstColumn(0, 0), + executeFunction("position", createConstColumn(0, ""), createConstColumn(0, ""))); + + ASSERT_COLUMN_EQ( + createConstColumn(1, 3), + executeFunction("position", createConstColumn(1, "a啊A"), createConstColumn(1, "g他A啊a"))); + + ASSERT_COLUMN_EQ( + createConstColumn(10, 1), + executeFunction( + "position", + createConstColumn(10, "a啊A"), + createConstColumn(10, "A啊a我"))); + } + { + // const vector + ASSERT_COLUMN_EQ( + createColumn({1, 1, 3, 2, 0, 0}), + executeFunction( + "position", + createConstColumn(6, "我aA"), + createColumn({"我aa", "我AA123", "aa我aa", "肥我aA个", "vrfv干扰", ""}))); + } - for (size_t i = 0; i < c0_strs.size(); i++) - { - csp0->insert(Field(c0_strs[i].c_str(), c0_strs[i].size())); - csp1->insert(Field(c1_strs[i].c_str(), c1_strs[i].size())); - } + { + // vector vector + ASSERT_COLUMN_EQ( + createColumn({3, 1, 4, 2, 1, 3, 0, 2, 3}), + executeFunction( + "position", + createColumn({"好", "平凯", "aa哈", "?!", "呵呵呵", "233", "嗯??", "好", "aaa"}), + createColumn( + {"ni好", "平凯星辰", "啊啊aaa哈哈", "??!!", "呵呵呵", "哈哈2333", "嗯?", " 好", "vdAaAvr"}))); + } - Block test_block; - ColumnWithTypeAndName ctn0 - = ColumnWithTypeAndName(std::move(csp0), std::make_shared(), "test_position_0"); - ColumnWithTypeAndName ctn1 - = ColumnWithTypeAndName(std::move(csp1), std::make_shared(), "test_position_1"); - ColumnsWithTypeAndName ctns{ctn0, ctn1}; - test_block.insert(ctn0); - test_block.insert(ctn1); - // for result from position - test_block.insert({}); - ColumnNumbers cns{0, 1}; - - // test position - auto bp = factory.tryGet("position", *context); - ASSERT_TRUE(bp != nullptr); - ASSERT_FALSE(bp->isVariadic()); - - bp->build(ctns)->execute(test_block, cns, 2); - const IColumn * res = test_block.getByPosition(2).column.get(); - const auto * res_string = checkAndGetColumn(res); - - Field res_field; - - for (size_t t = 0; t < results.size(); t++) - { - res_string->get(t, res_field); - Int64 res_val = res_field.get(); - EXPECT_EQ(results[t], res_val); - } + { + // vector const + ASSERT_COLUMN_EQ( + createColumn({1, 1, 11, 0, 6, 6}), + executeFunction( + "position", + createColumn({"", "f", "z", "备份", "备g份", "备G份"}), + createConstColumn(6, "fevre备g份gfz"))); } } From beff21a421294045ee1387f03c980251726c4c71 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Tue, 27 Aug 2024 17:36:20 +0800 Subject: [PATCH 04/11] revoke --- dbms/src/Functions/FunctionsString.cpp | 67 ++++++------------- .../tests/gtest_strings_position.cpp | 23 +++---- 2 files changed, 32 insertions(+), 58 deletions(-) diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index 7f62332c0e6..be31bac8f90 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -4970,11 +4970,9 @@ class FunctionPosition : public IFunction std::string getName() const override { return name; } size_t getNumberOfArguments() const override { return 2; } - bool useDefaultImplementationForConstants() const override { return true; } - DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - if unlikely (arguments.size() != 2) + if (arguments.size() != 2) throw Exception( fmt::format( "Number of arguments for function {} doesn't match: passed {}, should be 2.", @@ -4987,74 +4985,53 @@ class FunctionPosition : public IFunction void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { - const IColumn * col0 = block.getByPosition(arguments[0]).column.get(); - const auto * c0_const = checkAndGetColumn(col0); - const auto * c0_string = checkAndGetColumn(col0); + const IColumn * c0_col = block.getByPosition(arguments[0]).column.get(); + const auto * c0_const = checkAndGetColumn(c0_col); + const auto * c0_string = checkAndGetColumn(c0_col); + Field c0_field; - const IColumn * col1 = block.getByPosition(arguments[1]).column.get(); - const auto * c1_const = checkAndGetColumn(col1); - const auto * c1_string = checkAndGetColumn(col1); + const IColumn * c1_col = block.getByPosition(arguments[1]).column.get(); + const auto * c1_const = checkAndGetColumn(c1_col); + const auto * c1_string = checkAndGetColumn(c1_col); + Field c1_field; - if unlikely ((c0_const == nullptr && c0_string == nullptr) || (c1_const == nullptr && c1_string == nullptr)) + if ((c0_const == nullptr && c0_string == nullptr) || (c1_const == nullptr && c1_string == nullptr)) throw Exception( fmt::format("Illegal argument of function {}", getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - if unlikely (col0->size() != col1->size()) + if (c0_col->size() != c1_col->size()) throw Exception( - fmt::format("Row number in of columns in function {} is inconformity", getName()), + fmt::format("Function {} column number is inconformity", getName()), ErrorCodes::LOGICAL_ERROR); auto col_res = ColumnInt64::create(); - int val_num = col0->size(); + int val_num = c0_col->size(); col_res->reserve(val_num); - if (val_num == 0) - { - block.getByPosition(result).column = std::move(col_res); - return; - } - - StringRef c0_str; - if (c0_const != nullptr) - c0_str = col0->getDataAt(0); - for (int i = 0; i < val_num; i++) { - if (c0_const == nullptr) - c0_str = col0->getDataAt(i); - - const StringRef c1_str = col1->getDataAt(i); + c0_col->get(i, c0_field); + c1_col->get(i, c1_field); - if unlikely (c0_str.size == 0) - { - col_res->insert(1); - continue; - } - - if unlikely (c1_str.size == 0) - { - col_res->insert(0); - continue; - } + String c0_str = c0_field.get(); + String c1_str = c1_field.get(); - VolnitskyCaseInsensitiveUTF8 searcher(c0_str.data, c0_str.size, c1_str.size); - const char * res = searcher.search(c1_str.data, c1_str.size); - Int64 idx = res - c1_str.data; - if (idx >= static_cast(c1_str.size)) - idx = -1; - col_res->insert(getPositionUTF8(reinterpret_cast(c1_str.data), idx)); + // return -1 when c1_str not contains the c0_str + Int64 idx = c1_str.find(c0_str); + col_res->insert(getPositionUTF8(c1_str, idx)); } block.getByPosition(result).column = std::move(col_res); } private: - static Int64 getPositionUTF8(const UInt8 * data, Int64 idx) + static Int64 getPositionUTF8(const String & c1_str, Int64 idx) { if (idx == -1) return 0; + const auto * data = reinterpret_cast(c1_str.data()); return static_cast(UTF8::countCodePoints(data, idx) + 1); } }; diff --git a/dbms/src/Functions/tests/gtest_strings_position.cpp b/dbms/src/Functions/tests/gtest_strings_position.cpp index a6f49a773de..14e9c2f7cd4 100644 --- a/dbms/src/Functions/tests/gtest_strings_position.cpp +++ b/dbms/src/Functions/tests/gtest_strings_position.cpp @@ -46,7 +46,7 @@ TEST_F(StringPosition, strAndStrTest) std::vector c0_var_strs{"ell", "LL", "3", "ElL", "ye", "aaaa", "world", "", "", "biu"}; std::vector c1_var_strs{"hello", "HELLO", "23333", "HeLlO", "hey", "a", "WoRlD", "", "ping", ""}; - std::vector result0{2, 3, 2, 2, 0, 0, 1, 1, 1, 0}; + std::vector result0{2, 3, 2, 0, 0, 0, 0, 1, 1, 0}; std::vector c0_strs; std::vector c1_strs; @@ -106,19 +106,16 @@ TEST_F(StringPosition, utf8StrAndStrTest) { // const const ASSERT_COLUMN_EQ( - createConstColumn(0, 0), + createColumn({}), executeFunction("position", createConstColumn(0, ""), createConstColumn(0, ""))); ASSERT_COLUMN_EQ( - createConstColumn(1, 3), - executeFunction("position", createConstColumn(1, "a啊A"), createConstColumn(1, "g他A啊a"))); + createColumn({3}), + executeFunction("position", createConstColumn(1, "a啊A"), createConstColumn(1, "g他a啊A"))); ASSERT_COLUMN_EQ( - createConstColumn(10, 1), - executeFunction( - "position", - createConstColumn(10, "a啊A"), - createConstColumn(10, "A啊a我"))); + createColumn({1, 1, 1}), + executeFunction("position", createConstColumn(3, "a啊A"), createConstColumn(3, "a啊A我"))); } { @@ -128,7 +125,7 @@ TEST_F(StringPosition, utf8StrAndStrTest) executeFunction( "position", createConstColumn(6, "我aA"), - createColumn({"我aa", "我AA123", "aa我aa", "肥我aA个", "vrfv干扰", ""}))); + createColumn({"我aA", "我aA123", "aa我aA", "肥我aA个", "vrfv干扰", ""}))); } { @@ -137,7 +134,7 @@ TEST_F(StringPosition, utf8StrAndStrTest) createColumn({3, 1, 4, 2, 1, 3, 0, 2, 3}), executeFunction( "position", - createColumn({"好", "平凯", "aa哈", "?!", "呵呵呵", "233", "嗯??", "好", "aaa"}), + createColumn({"好", "平凯", "aa哈", "?!", "呵呵呵", "233", "嗯??", "好", "AaA"}), createColumn( {"ni好", "平凯星辰", "啊啊aaa哈哈", "??!!", "呵呵呵", "哈哈2333", "嗯?", " 好", "vdAaAvr"}))); } @@ -145,10 +142,10 @@ TEST_F(StringPosition, utf8StrAndStrTest) { // vector const ASSERT_COLUMN_EQ( - createColumn({1, 1, 11, 0, 6, 6}), + createColumn({1, 1, 11, 0, 6}), executeFunction( "position", - createColumn({"", "f", "z", "备份", "备g份", "备G份"}), + createColumn({"", "f", "z", "备份", "备g份"}), createConstColumn(6, "fevre备g份gfz"))); } } From 48f6eec87cd2918c5910cd1dbb88a5d4dc893cfc Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Wed, 28 Aug 2024 14:34:57 +0800 Subject: [PATCH 05/11] revert tests --- .../tests/gtest_strings_position.cpp | 189 ++++++++++-------- 1 file changed, 107 insertions(+), 82 deletions(-) diff --git a/dbms/src/Functions/tests/gtest_strings_position.cpp b/dbms/src/Functions/tests/gtest_strings_position.cpp index 14e9c2f7cd4..65d83052aa1 100644 --- a/dbms/src/Functions/tests/gtest_strings_position.cpp +++ b/dbms/src/Functions/tests/gtest_strings_position.cpp @@ -42,111 +42,136 @@ class StringPosition : public DB::tests::FunctionTest TEST_F(StringPosition, strAndStrTest) { const auto context = TiFlashTestEnv::getContext(); + auto & factory = FunctionFactory::instance(); + // case insensitive std::vector c0_var_strs{"ell", "LL", "3", "ElL", "ye", "aaaa", "world", "", "", "biu"}; std::vector c1_var_strs{"hello", "HELLO", "23333", "HeLlO", "hey", "a", "WoRlD", "", "ping", ""}; + + // var-var std::vector result0{2, 3, 2, 0, 0, 0, 0, 1, 1, 0}; std::vector c0_strs; std::vector c1_strs; - std::vector expect_results; - - MutableColumnPtr csp0; - MutableColumnPtr csp1; - - c0_strs = c0_var_strs; - c1_strs = c1_var_strs; - expect_results = result0; - - csp0 = ColumnString::create(); - csp1 = ColumnString::create(); - - for (size_t i = 0; i < c0_strs.size(); i++) + std::vector results; + for (int i = 0; i < 4; i++) { - csp0->insert(Field(c0_strs[i].c_str(), c0_strs[i].size())); - csp1->insert(Field(c1_strs[i].c_str(), c1_strs[i].size())); - } - - Block test_block; - ColumnWithTypeAndName ctn0 - = ColumnWithTypeAndName(std::move(csp0), std::make_shared(), "test_position_0"); - ColumnWithTypeAndName ctn1 - = ColumnWithTypeAndName(std::move(csp1), std::make_shared(), "test_position_1"); - ColumnsWithTypeAndName ctns{ctn0, ctn1}; - test_block.insert(ctn0); - test_block.insert(ctn1); - - // for result from position - test_block.insert({}); - ColumnNumbers cns{0, 1}; + MutableColumnPtr csp0; + MutableColumnPtr csp1; - // test position - auto bp = factory.tryGet("position", *context); - ASSERT_TRUE(bp != nullptr); - ASSERT_FALSE(bp->isVariadic()); + // var-var + c0_strs = c0_var_strs; + c1_strs = c1_var_strs; + results = result0; - bp->build(ctns)->execute(test_block, cns, 2); - const IColumn * res = test_block.getByPosition(2).column.get(); - const auto * res_string = checkAndGetColumn(res); + csp0 = ColumnString::create(); + csp1 = ColumnString::create(); - Field res_field; + for (size_t j = 0; j < c0_strs.size(); j++) + { + csp0->insert(Field(c0_strs[j].c_str(), c0_strs[j].size())); + csp1->insert(Field(c1_strs[j].c_str(), c1_strs[j].size())); + } - for (size_t i = 0; i < expect_results.size(); i++) - { - res_string->get(i, res_field); - Int64 res_val = res_field.get(); - EXPECT_EQ(expect_results[i], res_val); + Block test_block; + ColumnWithTypeAndName ctn0 + = ColumnWithTypeAndName(std::move(csp0), std::make_shared(), "test_position_0"); + ColumnWithTypeAndName ctn1 + = ColumnWithTypeAndName(std::move(csp1), std::make_shared(), "test_position_1"); + ColumnsWithTypeAndName ctns{ctn0, ctn1}; + test_block.insert(ctn0); + test_block.insert(ctn1); + // for result from position + test_block.insert({}); + ColumnNumbers cns{0, 1}; + + // test position + auto bp = factory.tryGet("position", *context); + ASSERT_TRUE(bp != nullptr); + ASSERT_FALSE(bp->isVariadic()); + + bp->build(ctns)->execute(test_block, cns, 2); + const IColumn * res = test_block.getByPosition(2).column.get(); + const auto * res_string = checkAndGetColumn(res); + + Field res_field; + + for (size_t t = 0; t < results.size(); t++) + { + res_string->get(t, res_field); + Int64 res_val = res_field.get(); + EXPECT_EQ(results[t], res_val); + } } } // test string and string in utf8 TEST_F(StringPosition, utf8StrAndStrTest) { - { - // const const - ASSERT_COLUMN_EQ( - createColumn({}), - executeFunction("position", createConstColumn(0, ""), createConstColumn(0, ""))); - - ASSERT_COLUMN_EQ( - createColumn({3}), - executeFunction("position", createConstColumn(1, "a啊A"), createConstColumn(1, "g他a啊A"))); - - ASSERT_COLUMN_EQ( - createColumn({1, 1, 1}), - executeFunction("position", createConstColumn(3, "a啊A"), createConstColumn(3, "a啊A我"))); - } + const auto context = TiFlashTestEnv::getContext(); - { - // const vector - ASSERT_COLUMN_EQ( - createColumn({1, 1, 3, 2, 0, 0}), - executeFunction( - "position", - createConstColumn(6, "我aA"), - createColumn({"我aA", "我aA123", "aa我aA", "肥我aA个", "vrfv干扰", ""}))); - } + auto & factory = FunctionFactory::instance(); - { - // vector vector - ASSERT_COLUMN_EQ( - createColumn({3, 1, 4, 2, 1, 3, 0, 2, 3}), - executeFunction( - "position", - createColumn({"好", "平凯", "aa哈", "?!", "呵呵呵", "233", "嗯??", "好", "AaA"}), - createColumn( - {"ni好", "平凯星辰", "啊啊aaa哈哈", "??!!", "呵呵呵", "哈哈2333", "嗯?", " 好", "vdAaAvr"}))); - } + // case insensitive + std::vector c0_var_strs{"好", "平凯", "aa哈", "?!", "呵呵呵", "233", "嗯??"}; + std::vector c1_var_strs{"ni好", "平凯星辰", "啊啊aaa哈哈", "??!!", "呵呵呵", "哈哈2333", "嗯?"}; + + // var-var + std::vector result0{3, 1, 4, 2, 1, 3, 0}; + std::vector c0_strs; + std::vector c1_strs; + std::vector results; + for (int i = 0; i < 4; i++) { - // vector const - ASSERT_COLUMN_EQ( - createColumn({1, 1, 11, 0, 6}), - executeFunction( - "position", - createColumn({"", "f", "z", "备份", "备g份"}), - createConstColumn(6, "fevre备g份gfz"))); + MutableColumnPtr csp0; + MutableColumnPtr csp1; + + // var-var + c0_strs = c0_var_strs; + c1_strs = c1_var_strs; + results = result0; + + csp0 = ColumnString::create(); + csp1 = ColumnString::create(); + + + for (size_t i = 0; i < c0_strs.size(); i++) + { + csp0->insert(Field(c0_strs[i].c_str(), c0_strs[i].size())); + csp1->insert(Field(c1_strs[i].c_str(), c1_strs[i].size())); + } + + Block test_block; + ColumnWithTypeAndName ctn0 + = ColumnWithTypeAndName(std::move(csp0), std::make_shared(), "test_position_0"); + ColumnWithTypeAndName ctn1 + = ColumnWithTypeAndName(std::move(csp1), std::make_shared(), "test_position_1"); + ColumnsWithTypeAndName ctns{ctn0, ctn1}; + test_block.insert(ctn0); + test_block.insert(ctn1); + // for result from position + test_block.insert({}); + ColumnNumbers cns{0, 1}; + + // test position + auto bp = factory.tryGet("position", *context); + ASSERT_TRUE(bp != nullptr); + ASSERT_FALSE(bp->isVariadic()); + + bp->build(ctns)->execute(test_block, cns, 2); + const IColumn * res = test_block.getByPosition(2).column.get(); + const auto * res_string = checkAndGetColumn(res); + + Field res_field; + + for (size_t t = 0; t < results.size(); t++) + { + res_string->get(t, res_field); + Int64 res_val = res_field.get(); + EXPECT_EQ(results[t], res_val); + } } } From 98135572e3cdd430fcd314b02a296b8dd165e978 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Wed, 28 Aug 2024 16:08:47 +0800 Subject: [PATCH 06/11] add bench --- dbms/src/Functions/FunctionsString.cpp | 21 ++---- ...on_ilike.cpp => bench_function_string.cpp} | 64 ++++++++++++++++++- 2 files changed, 67 insertions(+), 18 deletions(-) rename dbms/src/Functions/tests/{bench_function_ilike.cpp => bench_function_string.cpp} (87%) diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index be31bac8f90..8f7b4c7973e 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include #include @@ -35,7 +36,6 @@ #include #include -#include "Columns/IColumn.h" namespace DB { @@ -4465,13 +4465,6 @@ class PadUTF8Impl : public IFunction } }; -static Int64 getResult(const ColumnString::Chars_t & chars, size_t size, size_t offset) -{ - if unlikely (size == 0) - return 0; - return chars[offset]; -} - class FunctionASCII : public IFunction { public: @@ -4515,21 +4508,17 @@ class FunctionASCII : public IFunction const auto & offsets = c0_string->getOffsets(); if (val_num > 0) - { - size_t size = offsets[0] - 1; - col_res->insert(getResult(chars, size, 0)); - } + col_res->insert(getResult(chars, 0)); + for (size_t i = 1; i < val_num; i++) - { - auto size = offsets[i] - offsets[i - 1] - 1; - col_res->insert(getResult(chars, size, offsets[i - 1])); - } + col_res->insert(getResult(chars, offsets[i - 1])); block.getByPosition(result).column = std::move(col_res); } private: +static Int64 getResult(const ColumnString::Chars_t & chars, size_t offset) { return chars[offset]; } }; class FunctionLength : public IFunction diff --git a/dbms/src/Functions/tests/bench_function_ilike.cpp b/dbms/src/Functions/tests/bench_function_string.cpp similarity index 87% rename from dbms/src/Functions/tests/bench_function_ilike.cpp rename to dbms/src/Functions/tests/bench_function_string.cpp index b8125146b5e..81faccd441e 100644 --- a/dbms/src/Functions/tests/bench_function_ilike.cpp +++ b/dbms/src/Functions/tests/bench_function_string.cpp @@ -17,8 +17,8 @@ #include #include -/// this is a hack, include the cpp file so we can test MatchImpl directly -#include +/// this is a hack, include the cpp file so we can test functions directly +#include // NOLINT #include // NOLINT namespace DB @@ -306,5 +306,65 @@ BENCH_LIKE_COLLATOR(ASCII_BIN); BENCH_LIKE_COLLATOR(BINARY); BENCH_LIKE_COLLATOR(LATIN1_BIN); +class LengthBench : public benchmark::Fixture +{ +public: + using ColStringType = typename TypeTraits::FieldType; + + ColumnsWithTypeAndName data1{toVec("col", std::vector(data_num, ""))}; + ColumnsWithTypeAndName data2{toVec("col", std::vector(data_num, "aaaaaaaaaa"))}; + ColumnsWithTypeAndName data3{toVec("col", std::vector(data_num, "啊aaaaaaaa"))}; + + void SetUp(const benchmark::State &) override {} +}; + +BENCHMARK_DEFINE_F(LengthBench, bench) +(benchmark::State & state) +try +{ + FunctionLength function_length; + std::vector blocks{Block(data1), Block(data2), Block(data3)}; + for (auto & block : blocks) + block.insert({nullptr, std::make_shared>(), "res"}); + ColumnNumbers arguments{0}; + for (auto _ : state) + { + for (auto & block : blocks) + function_length.executeImpl(block, arguments, 1); + } +} +CATCH +BENCHMARK_REGISTER_F(LengthBench, bench)->Iterations(10); + +class ASCIIBench : public benchmark::Fixture +{ +public: + using ColStringType = typename TypeTraits::FieldType; + + ColumnsWithTypeAndName data1{toVec("col", std::vector(data_num, ""))}; + ColumnsWithTypeAndName data2{toVec("col", std::vector(data_num, "aaaaaaaaaa"))}; + ColumnsWithTypeAndName data3{toVec("col", std::vector(data_num, "啊aaaaaaaa"))}; + + void SetUp(const benchmark::State &) override {} +}; + +BENCHMARK_DEFINE_F(ASCIIBench, bench) +(benchmark::State & state) +try +{ + FunctionASCII function_ascii; + std::vector blocks{Block(data1), Block(data2), Block(data3)}; + for (auto & block : blocks) + block.insert({nullptr, std::make_shared>(), "res"}); + ColumnNumbers arguments{0}; + for (auto _ : state) + { + for (auto & block : blocks) + function_ascii.executeImpl(block, arguments, 1); + } +} +CATCH +BENCHMARK_REGISTER_F(ASCIIBench, bench)->Iterations(10); + } // namespace tests } // namespace DB From adae41d05382962b9442527de8e1fe418ee0a1c0 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Wed, 28 Aug 2024 21:49:46 +0800 Subject: [PATCH 07/11] add tests --- dbms/src/Functions/FunctionsString.cpp | 3 +- .../Functions/tests/gtest_strings_ascii.cpp | 129 ++++------------- .../Functions/tests/gtest_strings_length.cpp | 135 +++++------------- 3 files changed, 66 insertions(+), 201 deletions(-) diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index 8f7b4c7973e..6c6c16942c7 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -4509,7 +4509,6 @@ class FunctionASCII : public IFunction if (val_num > 0) col_res->insert(getResult(chars, 0)); - for (size_t i = 1; i < val_num; i++) col_res->insert(getResult(chars, offsets[i - 1])); @@ -4518,7 +4517,7 @@ class FunctionASCII : public IFunction } private: -static Int64 getResult(const ColumnString::Chars_t & chars, size_t offset) { return chars[offset]; } + static Int64 getResult(const ColumnString::Chars_t & chars, size_t offset) { return chars[offset]; } }; class FunctionLength : public IFunction diff --git a/dbms/src/Functions/tests/gtest_strings_ascii.cpp b/dbms/src/Functions/tests/gtest_strings_ascii.cpp index c7888f85c07..0c7a58559b0 100644 --- a/dbms/src/Functions/tests/gtest_strings_ascii.cpp +++ b/dbms/src/Functions/tests/gtest_strings_ascii.cpp @@ -21,7 +21,6 @@ #include #include -#include #include #pragma GCC diagnostic push @@ -38,117 +37,49 @@ class StringASCII : public DB::tests::FunctionTest { }; -// test string and string TEST_F(StringASCII, strAndStrTest) { - const auto context = TiFlashTestEnv::getContext(); - - auto & factory = FunctionFactory::instance(); - - std::vector strs{"hello", "HELLO", "23333", "#%@#^", ""}; - - for (int i = 0; i < 2; i++) { - MutableColumnPtr csp; - csp = ColumnString::create(); - - for (const auto & str : strs) - { - csp->insert(Field(str.c_str(), str.size())); - } - - Block test_block; - ColumnWithTypeAndName ctn - = ColumnWithTypeAndName(std::move(csp), std::make_shared(), "test_ascii"); - ColumnsWithTypeAndName ctns{ctn}; - test_block.insert(ctn); - ColumnNumbers cns{0}; - - // test ascii - auto bp = factory.tryGet("ascii", *context); - ASSERT_TRUE(bp != nullptr); - ASSERT_FALSE(bp->isVariadic()); - - auto func = bp->build(ctns); - test_block.insert({nullptr, func->getReturnType(), "res"}); - func->execute(test_block, cns, 1); - const IColumn * res = test_block.getByPosition(1).column.get(); - const auto * actual_res = checkAndGetColumn(res); - - Field res_field; - std::vector expect_results{104, 72, 50, 35, 0}; - for (size_t t = 0; t < expect_results.size(); t++) - { - actual_res->get(t, res_field); - Int64 res_val = res_field.get(); - EXPECT_EQ(expect_results[t], res_val); - } + // test const + ASSERT_COLUMN_EQ(createConstColumn(0, 0), executeFunction("ascii", createConstColumn(0, ""))); + ASSERT_COLUMN_EQ( + createConstColumn(1, 38), + executeFunction("ascii", createConstColumn(1, "&ad"))); + ASSERT_COLUMN_EQ( + createConstColumn(5, 38), + executeFunction("ascii", createConstColumn(5, "&ad"))); } -} -// test NULL -TEST_F(StringASCII, nullTest) -{ - const auto context = TiFlashTestEnv::getContext(); - - auto & factory = FunctionFactory::instance(); - - std::vector strs{"a", "b", "c", "d", "e", "f"}; - std::vector results{0, 98, 0, 100, 101, 0}; - std::vector null_map{1, 0, 1, 0, 0, 1}; - auto input_str_col = ColumnString::create(); - for (const auto & str : strs) { - Field field(str.c_str(), str.size()); - input_str_col->insert(field); + // test vec + ASSERT_COLUMN_EQ(createColumn({}), executeFunction("ascii", createColumn({}))); + ASSERT_COLUMN_EQ( + createColumn({230, 104, 72, 50, 35, 0}), + executeFunction("ascii", createColumn({"我a", "hello", "HELLO", "23333", "#%@#^", ""}))); } - auto input_null_map = ColumnUInt8::create(strs.size(), 0); - ColumnUInt8::Container & input_vec_null_map = input_null_map->getData(); - for (size_t i = 0; i < strs.size(); i++) { - input_vec_null_map[i] = null_map[i]; + // test nullable const + ASSERT_COLUMN_EQ( + createConstColumn(0, {}), + executeFunction("ascii", createConstColumn>(0, "aaa"))); + ASSERT_COLUMN_EQ( + createConstColumn(1, {97}), + executeFunction("ascii", createConstColumn>(1, "aaa"))); + ASSERT_COLUMN_EQ( + createConstColumn(3, {97}), + executeFunction("ascii", createConstColumn>(3, "aaa"))); } - auto input_null_col = ColumnNullable::create(std::move(input_str_col), std::move(input_null_map)); - DataTypePtr string_type = std::make_shared(); - DataTypePtr nullable_string_type = makeNullable(string_type); - - auto col1 = ColumnWithTypeAndName(std::move(input_null_col), nullable_string_type, "ascii"); - ColumnsWithTypeAndName ctns{col1}; - - Block test_block; - test_block.insert(col1); - ColumnNumbers cns{0}; - - auto bp = factory.tryGet("ascii", *context); - ASSERT_TRUE(bp != nullptr); - ASSERT_FALSE(bp->isVariadic()); - auto func = bp->build(ctns); - test_block.insert({nullptr, func->getReturnType(), "res"}); - func->execute(test_block, cns, 1); - auto res_col = test_block.getByPosition(1).column; - - ColumnPtr result_null_map_column = static_cast(*res_col).getNullMapColumnPtr(); - MutableColumnPtr mutable_result_null_map_column = (*std::move(result_null_map_column)).mutate(); - NullMap & result_null_map = static_cast(*mutable_result_null_map_column).getData(); - const IColumn * res = test_block.getByPosition(1).column.get(); - const auto * res_nullable_string = checkAndGetColumn(res); - const IColumn & res_string = res_nullable_string->getNestedColumn(); - - Field res_field; - - for (size_t i = 0; i < null_map.size(); i++) { - EXPECT_EQ(result_null_map[i], null_map[i]); - if (result_null_map[i] == 0) - { - res_string.get(i, res_field); - Int64 res_val = res_field.get(); - EXPECT_EQ(results[i], res_val); - } + // test nullable vec + std::vector null_map{0, 1, 0, 1, 0, 0, 1}; + ASSERT_COLUMN_EQ( + createNullableColumn({0, 0, 97, 0, 233, 233, 0}, null_map), + executeFunction( + "ascii", + createNullableColumn({"", "a", "abcd", "嗯", "饼干", "馒头", "???"}, null_map))); } } - } // namespace tests } // namespace DB diff --git a/dbms/src/Functions/tests/gtest_strings_length.cpp b/dbms/src/Functions/tests/gtest_strings_length.cpp index ddf8e19c3d8..638cc783177 100644 --- a/dbms/src/Functions/tests/gtest_strings_length.cpp +++ b/dbms/src/Functions/tests/gtest_strings_length.cpp @@ -38,118 +38,53 @@ class StringLength : public DB::tests::FunctionTest { }; -// test string and string -TEST_F(StringLength, strAndStrTest) +TEST_F(StringLength, length) { - const auto context = TiFlashTestEnv::getContext(); - - auto & factory = FunctionFactory::instance(); - - std::vector strs{"hi~", "23333", "pingcap", "你好", "233哈哈", "", "asdの的", "ヽ( ̄▽ ̄)و"}; - std::vector results{3, 5, 7, 6, 9, 0, 9, 16}; - - for (int i = 0; i < 2; i++) { - MutableColumnPtr csp; - csp = ColumnString::create(); - - for (const auto & str : strs) - { - csp->insert(Field(str.c_str(), str.size())); - } - - Block test_block; - ColumnWithTypeAndName ctn - = ColumnWithTypeAndName(std::move(csp), std::make_shared(), "test_ascii"); - ColumnsWithTypeAndName ctns{ctn}; - test_block.insert(ctn); - ColumnNumbers cns{0}; - - // test length - auto bp = factory.tryGet("length", *context); - ASSERT_TRUE(bp != nullptr); - ASSERT_FALSE(bp->isVariadic()); - - auto func = bp->build(ctns); - test_block.insert({nullptr, func->getReturnType(), "res"}); - func->execute(test_block, cns, 1); - const IColumn * res = test_block.getByPosition(1).column.get(); - const auto * res_string = checkAndGetColumn(res); - - Field res_field; - - for (size_t t = 0; t < results.size(); t++) - { - res_string->get(t, res_field); - Int64 res_val = res_field.get(); - EXPECT_EQ(results[t], res_val); - } + // test const + ASSERT_COLUMN_EQ(createConstColumn(0, 0), executeFunction("length", createConstColumn(0, ""))); + ASSERT_COLUMN_EQ( + createConstColumn(1, 3), + executeFunction("length", createConstColumn(1, "aaa"))); + ASSERT_COLUMN_EQ( + createConstColumn(3, 3), + executeFunction("length", createConstColumn(3, "aaa"))); } -} -// test NULL -TEST_F(StringLength, nullTest) -{ - const auto context = TiFlashTestEnv::getContext(); - - auto & factory = FunctionFactory::instance(); - - std::vector strs{"a", "abcd", "嗯", "饼干", "馒头", "???"}; - std::vector results{0, 4, 0, 6, 6, 0}; - std::vector null_map{1, 0, 1, 0, 0, 1}; - auto input_str_col = ColumnString::create(); - for (const auto & str : strs) { - Field field(str.c_str(), str.size()); - input_str_col->insert(field); + // test vec + ASSERT_COLUMN_EQ(createColumn({}), executeFunction("length", createColumn({}))); + + ASSERT_COLUMN_EQ( + createColumn({0, 3, 5, 7, 6, 9, 0, 9, 16, 0}), + executeFunction( + "length", + createColumn( + {"", "hi~", "23333", "pingcap", "你好", "233哈哈", "", "asdの的", "ヽ( ̄▽ ̄)و", ""}))); } - auto input_null_map = ColumnUInt8::create(strs.size(), 0); - ColumnUInt8::Container & input_vec_null_map = input_null_map->getData(); - for (size_t i = 0; i < strs.size(); i++) { - input_vec_null_map[i] = null_map[i]; + // test nullable const + ASSERT_COLUMN_EQ( + createConstColumn(0, {}), + executeFunction("length", createConstColumn>(0, "aaa"))); + ASSERT_COLUMN_EQ( + createConstColumn(1, {3}), + executeFunction("length", createConstColumn>(1, "aaa"))); + ASSERT_COLUMN_EQ( + createConstColumn(3, {3}), + executeFunction("length", createConstColumn>(3, "aaa"))); } - auto input_null_col = ColumnNullable::create(std::move(input_str_col), std::move(input_null_map)); - DataTypePtr string_type = std::make_shared(); - DataTypePtr nullable_string_type = makeNullable(string_type); - - auto col1 = ColumnWithTypeAndName(std::move(input_null_col), nullable_string_type, "length"); - ColumnsWithTypeAndName ctns{col1}; - - Block test_block; - test_block.insert(col1); - ColumnNumbers cns{0}; - - auto bp = factory.tryGet("length", *context); - ASSERT_TRUE(bp != nullptr); - ASSERT_FALSE(bp->isVariadic()); - auto func = bp->build(ctns); - test_block.insert({nullptr, func->getReturnType(), "res"}); - func->execute(test_block, cns, 1); - auto res_col = test_block.getByPosition(1).column; - - ColumnPtr result_null_map_column = static_cast(*res_col).getNullMapColumnPtr(); - MutableColumnPtr mutable_result_null_map_column = (*std::move(result_null_map_column)).mutate(); - NullMap & result_null_map = static_cast(*mutable_result_null_map_column).getData(); - const IColumn * res = test_block.getByPosition(1).column.get(); - const auto * res_nullable_string = checkAndGetColumn(res); - const IColumn & res_string = res_nullable_string->getNestedColumn(); - - Field res_field; - - for (size_t i = 0; i < null_map.size(); i++) { - EXPECT_EQ(result_null_map[i], null_map[i]); - if (result_null_map[i] == 0) - { - res_string.get(i, res_field); - Int64 res_val = res_field.get(); - EXPECT_EQ(results[i], res_val); - } + // test nullable vec + std::vector null_map{1, 0, 1, 0, 0, 1}; + ASSERT_COLUMN_EQ( + createNullableColumn({0, 4, 0, 6, 6, 0}, null_map), + executeFunction( + "length", + createNullableColumn({"a", "abcd", "嗯", "饼干", "馒头", "???"}, null_map))); } } - } // namespace tests } // namespace DB From ef018022d53b64126679838fa870255d19f6de95 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Thu, 29 Aug 2024 10:39:41 +0800 Subject: [PATCH 08/11] address comment --- dbms/src/Functions/FunctionsString.cpp | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index 6c6c16942c7..886d1e5bedb 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -36,6 +36,8 @@ #include #include +#include "Columns/ColumnsNumber.h" + namespace DB { @@ -4502,22 +4504,17 @@ class FunctionASCII : public IFunction size_t val_num = c0_col->size(); auto col_res = ColumnInt64::create(); - col_res->reserve(val_num); + ColumnInt64::Container & data = col_res->getData(); + data.resize(val_num); const auto & chars = c0_string->getChars(); const auto & offsets = c0_string->getOffsets(); - if (val_num > 0) - col_res->insert(getResult(chars, 0)); - - for (size_t i = 1; i < val_num; i++) - col_res->insert(getResult(chars, offsets[i - 1])); + for (size_t i = 0; i < val_num; i++) + data[i] = chars[offsets[i - 1]]; block.getByPosition(result).column = std::move(col_res); } - -private: - static Int64 getResult(const ColumnString::Chars_t & chars, size_t offset) { return chars[offset]; } }; class FunctionLength : public IFunction @@ -4557,15 +4554,16 @@ class FunctionLength : public IFunction size_t val_num = c0_col->size(); auto col_res = ColumnInt64::create(); - col_res->reserve(val_num); + ColumnInt64::Container & data = col_res->getData(); + data.resize(val_num); const auto & offsets = c0_string->getOffsets(); if (val_num > 0) - col_res->insert(offsets[0] - 1); + data[0] = offsets[0] - 1; for (size_t i = 1; i < val_num; i++) - col_res->insert(offsets[i] - offsets[i - 1] - 1); + data[i] = offsets[i] - offsets[i - 1] - 1; block.getByPosition(result).column = std::move(col_res); } From 1c9a8126ac9d2a9dfe7300a26d097b0fe9ebda4d Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Thu, 29 Aug 2024 11:22:06 +0800 Subject: [PATCH 09/11] fix --- dbms/src/Functions/FunctionsString.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index 886d1e5bedb..c7666934d0d 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -4502,7 +4502,7 @@ class FunctionASCII : public IFunction fmt::format("Illegal argument of function {}", getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - size_t val_num = c0_col->size(); + auto val_num = static_cast(c0_col->size()); auto col_res = ColumnInt64::create(); ColumnInt64::Container & data = col_res->getData(); data.resize(val_num); @@ -4510,7 +4510,7 @@ class FunctionASCII : public IFunction const auto & chars = c0_string->getChars(); const auto & offsets = c0_string->getOffsets(); - for (size_t i = 0; i < val_num; i++) + for (Int64 i = 0; i < val_num; i++) data[i] = chars[offsets[i - 1]]; block.getByPosition(result).column = std::move(col_res); @@ -4552,7 +4552,7 @@ class FunctionLength : public IFunction fmt::format("Illegal argument of function {}", getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - size_t val_num = c0_col->size(); + auto val_num = static_cast(c0_col->size()); auto col_res = ColumnInt64::create(); ColumnInt64::Container & data = col_res->getData(); data.resize(val_num); @@ -4562,7 +4562,7 @@ class FunctionLength : public IFunction if (val_num > 0) data[0] = offsets[0] - 1; - for (size_t i = 1; i < val_num; i++) + for (Int64 i = 1; i < val_num; i++) data[i] = offsets[i] - offsets[i - 1] - 1; block.getByPosition(result).column = std::move(col_res); From d2e122df25e2b0a5c6370937f0fd3e9b6c1f3409 Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Thu, 29 Aug 2024 11:27:31 +0800 Subject: [PATCH 10/11] fix --- dbms/src/Functions/FunctionsString.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index c7666934d0d..7b7feae4c6d 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -4502,7 +4502,7 @@ class FunctionASCII : public IFunction fmt::format("Illegal argument of function {}", getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - auto val_num = static_cast(c0_col->size()); + auto val_num = static_cast(c0_col->size()); auto col_res = ColumnInt64::create(); ColumnInt64::Container & data = col_res->getData(); data.resize(val_num); @@ -4510,7 +4510,7 @@ class FunctionASCII : public IFunction const auto & chars = c0_string->getChars(); const auto & offsets = c0_string->getOffsets(); - for (Int64 i = 0; i < val_num; i++) + for (ssize_t i = 0; i < val_num; i++) data[i] = chars[offsets[i - 1]]; block.getByPosition(result).column = std::move(col_res); @@ -4552,17 +4552,14 @@ class FunctionLength : public IFunction fmt::format("Illegal argument of function {}", getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - auto val_num = static_cast(c0_col->size()); + auto val_num = static_cast(c0_col->size()); auto col_res = ColumnInt64::create(); ColumnInt64::Container & data = col_res->getData(); data.resize(val_num); const auto & offsets = c0_string->getOffsets(); - if (val_num > 0) - data[0] = offsets[0] - 1; - - for (Int64 i = 1; i < val_num; i++) + for (ssize_t i = 0; i < val_num; i++) data[i] = offsets[i] - offsets[i - 1] - 1; block.getByPosition(result).column = std::move(col_res); From d97e9105c2677e34ab03843074f273a0a7bf57bf Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Mon, 2 Sep 2024 18:24:45 +0800 Subject: [PATCH 11/11] format --- dbms/src/Functions/FunctionsString.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index 7b7feae4c6d..5a3cdfd3be5 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include #include @@ -36,9 +37,6 @@ #include #include -#include "Columns/ColumnsNumber.h" - - namespace DB { namespace ErrorCodes