diff --git a/be/src/util/simd/vstring_function.h b/be/src/util/simd/vstring_function.h index 306a18a6d66ec7..dac964b1b94224 100644 --- a/be/src/util/simd/vstring_function.h +++ b/be/src/util/simd/vstring_function.h @@ -44,6 +44,54 @@ inline uint8_t get_utf8_byte_length(uint8_t character) { return UTF8_BYTE_LENGTH[character]; } +// copy from https://github.com/lemire/fastvalidate-utf-8/blob/master/include/simdasciicheck.h +// The function returns true (1) if all chars passed in src are +// 7-bit values (0x00..0x7F). Otherwise, it returns false (0). +inline bool validate_ascii_fast(const char* src, size_t len) { + size_t i = 0; + __m128i has_error = _mm_setzero_si128(); + if (len >= 16) { + for (; i <= len - 16; i += 16) { + __m128i current_bytes = _mm_loadu_si128((const __m128i*)(src + i)); + has_error = _mm_or_si128(has_error, current_bytes); + } + } + int error_mask = _mm_movemask_epi8(has_error); + + char tail_has_error = 0; + for (; i < len; i++) { + tail_has_error |= src[i]; + } + error_mask |= (tail_has_error & 0x80); + + return !error_mask; +} + +#ifdef __AVX2__ +#include +// The function returns true (1) if all chars passed in src are +// 7-bit values (0x00..0x7F). Otherwise, it returns false (0). +inline bool validate_ascii_fast_avx(const char* src, size_t len) { + size_t i = 0; + __m256i has_error = _mm256_setzero_si256(); + if (len >= 32) { + for (; i <= len - 32; i += 32) { + __m256i current_bytes = _mm256_loadu_si256((const __m256i*)(src + i)); + has_error = _mm256_or_si256(has_error, current_bytes); + } + } + int error_mask = _mm256_movemask_epi8(has_error); + + char tail_has_error = 0; + for (; i < len; i++) { + tail_has_error |= src[i]; + } + error_mask |= (tail_has_error & 0x80); + + return !error_mask; +} +#endif + namespace simd { class VStringFunctions { @@ -219,11 +267,10 @@ class VStringFunctions { // Gcc will do auto simd in this function static bool is_ascii(const StringRef& str) { - char or_code = 0; - for (size_t i = 0; i < str.size; i++) { - or_code |= str.data[i]; - } - return !(or_code & 0x80); +#ifdef __AVX2__ + return validate_ascii_fast_avx(str.data, str.size); +#endif + return validate_ascii_fast(str.data, str.size); } static void reverse(const StringRef& str, StringRef dst) { diff --git a/be/src/vec/common/pod_array.h b/be/src/vec/common/pod_array.h index 91642aebd470b0..0d7cde6503de10 100644 --- a/be/src/vec/common/pod_array.h +++ b/be/src/vec/common/pod_array.h @@ -480,6 +480,14 @@ class PODArray : public PODArrayBasec_end += bytes_to_copy; } + template + void insert_assume_reserved_and_allow_overflow(It1 from_begin, It2 from_end) { + size_t bytes_to_copy = this->byte_size(from_end - from_begin); + memcpy_small_allow_read_write_overflow15( + this->c_end, reinterpret_cast(&*from_begin), bytes_to_copy); + this->c_end += bytes_to_copy; + } + void swap(PODArray& rhs) { #ifndef NDEBUG this->unprotect(); diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index 2b1d5891a663fd..4363d040cd54d0 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -17,7 +17,6 @@ #pragma once -#include #include #include #include @@ -52,6 +51,7 @@ #include "vec/common/memcpy_small.h" #include "vec/common/pod_array.h" #include "vec/common/pod_array_fwd.h" +#include "vec/common/string_utils/string_utils.h" #include "vec/common/typeid_cast.h" #include "vec/core/block.h" #include "vec/core/column_numbers.h" @@ -120,6 +120,14 @@ struct StringOP { chars.insert(string_value.data(), string_value.data() + string_value.size()); offsets[index] = chars.size(); } + + static void push_value_string_reserved_and_allow_overflow(const std::string_view& string_value, + int index, ColumnString::Chars& chars, + ColumnString::Offsets& offsets) { + chars.insert_assume_reserved_and_allow_overflow(string_value.data(), + string_value.data() + string_value.size()); + offsets[index] = chars.size(); + } }; struct SubstringUtil { @@ -147,30 +155,37 @@ struct SubstringUtil { check_set_nullable(argument_columns[i], null_map, col_const[i]); } - auto specific_str_column = assert_cast(argument_columns[0].get()); - auto specific_start_column = + const auto* specific_str_column = + assert_cast(argument_columns[0].get()); + const auto* specific_start_column = assert_cast*>(argument_columns[1].get()); - auto specific_len_column = + const auto* specific_len_column = assert_cast*>(argument_columns[2].get()); - if (col_const[1] && col_const[2]) { - vectors(specific_str_column->get_chars(), specific_str_column->get_offsets(), - specific_start_column->get_data(), specific_len_column->get_data(), - null_map->get_data(), res->get_chars(), res->get_offsets()); - } else { - vectors(specific_str_column->get_chars(), specific_str_column->get_offsets(), - specific_start_column->get_data(), specific_len_column->get_data(), - null_map->get_data(), res->get_chars(), res->get_offsets()); + + auto vectors = vectors_utf8; + bool is_ascii = simd::VStringFunctions::is_ascii( + {specific_str_column->get_chars().data(), specific_str_column->get_chars().size()}); + if (col_const[1] && col_const[2] && is_ascii) { + vectors = vectors_ascii; + } else if (col_const[1] && col_const[2]) { + vectors = vectors_utf8; + } else if (is_ascii) { + vectors = vectors_ascii; } + vectors(specific_str_column->get_chars(), specific_str_column->get_offsets(), + specific_start_column->get_data(), specific_len_column->get_data(), + null_map->get_data(), res->get_chars(), res->get_offsets()); + block.get_by_position(result).column = ColumnNullable::create(std::move(res), std::move(null_map)); } private: - template - static void vectors(const ColumnString::Chars& chars, const ColumnString::Offsets& offsets, - const PaddedPODArray& start, const PaddedPODArray& len, - NullMap& null_map, ColumnString::Chars& res_chars, - ColumnString::Offsets& res_offsets) { + template + static void vectors_utf8(const ColumnString::Chars& chars, const ColumnString::Offsets& offsets, + const PaddedPODArray& start, const PaddedPODArray& len, + NullMap& null_map, ColumnString::Chars& res_chars, + ColumnString::Offsets& res_offsets) { size_t size = offsets.size(); res_offsets.resize(size); res_chars.reserve(chars.size()); @@ -179,119 +194,104 @@ struct SubstringUtil { PMR::monotonic_buffer_resource pool {buf.data(), buf.size()}; PMR::vector index {&pool}; - auto* __restrict data_ptr = chars.data(); - auto* __restrict offset_ptr = offsets.data(); - - if constexpr (Const) { - const auto start_value = start[0]; - const auto len_value = len[0]; - if (start_value == 0 || len_value <= 0) { + if constexpr (is_const) { + if (start[0] == 0 || len[0] <= 0) { for (size_t i = 0; i < size; ++i) { StringOP::push_empty_string(i, res_chars, res_offsets); } - } else { - for (size_t i = 0; i < size; ++i) { - const int str_size = offset_ptr[i] - offset_ptr[i - 1]; - const uint8_t* raw_str = data_ptr + offset_ptr[i - 1]; - // return empty string if start > src.length - if (start_value > str_size || start_value < -str_size || str_size == 0) { - StringOP::push_empty_string(i, res_chars, res_offsets); - continue; - } - // reference to string_function.cpp: substring - size_t byte_pos = 0; - index.clear(); - for (size_t j = 0, char_size = 0; - j < str_size && - (start_value <= 0 || index.size() <= start_value + len_value); - j += char_size) { - char_size = UTF8_BYTE_LENGTH[(unsigned char)(raw_str)[j]]; - index.push_back(j); - } + return; + } + } - int fixed_pos = start_value; - if (fixed_pos < 0) { - fixed_pos = str_size + fixed_pos + 1; - } else if (fixed_pos > index.size()) { - StringOP::push_null_string(i, res_chars, res_offsets, null_map); - continue; - } + for (size_t i = 0; i < size; ++i) { + int str_size = offsets[i] - offsets[i - 1]; + const char* str_data = (char*)chars.data() + offsets[i - 1]; + int start_value = is_const ? start[0] : start[i]; + int len_value = is_const ? len[0] : len[i]; - byte_pos = index[fixed_pos - 1]; - int fixed_len = str_size - byte_pos; - if (fixed_pos + len_value <= index.size()) { - fixed_len = index[fixed_pos + len_value - 1] - byte_pos; - } + // return empty string if start > src.length + if (start_value > str_size || str_size == 0 || start_value == 0 || len_value <= 0) { + StringOP::push_empty_string(i, res_chars, res_offsets); + continue; + } - if (byte_pos <= str_size && fixed_len > 0) { - // return StringRef(str.data + byte_pos, fixed_len); - StringOP::push_value_string( - std::string_view {reinterpret_cast(raw_str + byte_pos), - (size_t)fixed_len}, - i, res_chars, res_offsets); - } else { - StringOP::push_empty_string(i, res_chars, res_offsets); - } + size_t byte_pos = 0; + index.clear(); + for (size_t j = 0, char_size = 0; j < str_size; j += char_size) { + char_size = get_utf8_byte_length(str_data[j]); + index.push_back(j); + if (start_value > 0 && index.size() > start_value + len_value) { + break; } } - } else { - PMR::vector> strs(&pool); - strs.resize(size); - for (int i = 0; i < size; ++i) { - strs[i].first = data_ptr + offset_ptr[i - 1]; - strs[i].second = offset_ptr[i] - offset_ptr[i - 1]; + + int fixed_pos = start_value; + if (fixed_pos < -(int)index.size()) { + StringOP::push_empty_string(i, res_chars, res_offsets); + continue; + } + if (fixed_pos < 0) { + fixed_pos = index.size() + fixed_pos + 1; + } + if (fixed_pos > index.size()) { + StringOP::push_null_string(i, res_chars, res_offsets, null_map); + continue; } - for (size_t i = 0; i < size; ++i) { - auto [raw_str, str_size] = strs[i]; - const auto& start_value = start[i]; - const auto& len_value = len[i]; + byte_pos = index[fixed_pos - 1]; + size_t fixed_len = str_size - byte_pos; + if (fixed_pos + len_value <= index.size()) { + fixed_len = index[fixed_pos + len_value - 1] - byte_pos; + } - // return empty string if start > src.length - if (start_value > str_size || str_size == 0 || start_value == 0 || len_value <= 0) { - StringOP::push_empty_string(i, res_chars, res_offsets); - continue; - } - // reference to string_function.cpp: substring - size_t byte_pos = 0; - index.clear(); - for (size_t j = 0, char_size = 0; j < str_size; j += char_size) { - char_size = UTF8_BYTE_LENGTH[(unsigned char)(raw_str)[j]]; - index.push_back(j); - if (start_value > 0 && index.size() > start_value + len_value) { - break; - } - } + if (byte_pos <= str_size && fixed_len > 0) { + StringOP::push_value_string_reserved_and_allow_overflow( + {str_data + byte_pos, fixed_len}, i, res_chars, res_offsets); + } else { + StringOP::push_empty_string(i, res_chars, res_offsets); + } + } + } - int fixed_pos = start_value; - if (fixed_pos < -(int)index.size()) { + template + static void vectors_ascii(const ColumnString::Chars& chars, + const ColumnString::Offsets& offsets, + const PaddedPODArray& start, const PaddedPODArray& len, + NullMap& null_map, ColumnString::Chars& res_chars, + ColumnString::Offsets& res_offsets) { + size_t size = offsets.size(); + res_offsets.resize(size); + + if constexpr (is_const) { + if (start[0] == 0 || len[0] <= 0) { + for (size_t i = 0; i < size; ++i) { StringOP::push_empty_string(i, res_chars, res_offsets); - continue; - } - if (fixed_pos < 0) { - fixed_pos = index.size() + fixed_pos + 1; - } - if (fixed_pos > index.size()) { - StringOP::push_null_string(i, res_chars, res_offsets, null_map); - continue; } + return; + } + res_chars.reserve(std::min(chars.size(), len[0] * size)); + } else { + res_chars.reserve(chars.size()); + } - byte_pos = index[fixed_pos - 1]; - int fixed_len = str_size - byte_pos; - if (fixed_pos + len_value <= index.size()) { - fixed_len = index[fixed_pos + len_value - 1] - byte_pos; - } + for (size_t i = 0; i < size; ++i) { + int str_size = offsets[i] - offsets[i - 1]; + const char* str_data = (char*)chars.data() + offsets[i - 1]; - if (byte_pos <= str_size && fixed_len > 0) { - // return StringRef(str.data + byte_pos, fixed_len); - StringOP::push_value_string( - std::string_view {reinterpret_cast(raw_str + byte_pos), - (size_t)fixed_len}, - i, res_chars, res_offsets); - } else { - StringOP::push_empty_string(i, res_chars, res_offsets); - } + int start_value = is_const ? start[0] : start[i]; + int len_value = is_const ? len[0] : len[i]; + + if (start_value > str_size || start_value < -str_size || str_size == 0) { + StringOP::push_empty_string(i, res_chars, res_offsets); + continue; + } + int fixed_pos = start_value - 1; + if (fixed_pos < 0) { + fixed_pos = str_size + fixed_pos + 1; } + size_t fixed_len = std::min(str_size - fixed_pos, len_value); + StringOP::push_value_string_reserved_and_allow_overflow( + {str_data + fixed_pos, fixed_len}, i, res_chars, res_offsets); } } };