From e22eee80558162bea9a0c7cc5f439f45549dc073 Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Fri, 24 Mar 2023 15:45:12 +0100 Subject: [PATCH 01/12] Updated implementation of find_first_symbols for run-time needle Implemented compile-time and run-time dispatching between SSE4.2 and SSE2 implementations Added find_first_symbols_sse2 Added tests --- base/base/find_symbols.h | 112 ++++++++--- src/Common/tests/gtest_find_symbols.cpp | 83 +++++++- .../InlineEscapingKeyStateHandler.cpp | 183 ------------------ .../escaping/InlineEscapingKeyStateHandler.h | 34 ---- ...ler.cpp => InlineEscapingStateHandler.cpp} | 7 + ...Handler.h => InlineEscapingStateHandler.h} | 24 +++ 6 files changed, 203 insertions(+), 240 deletions(-) delete mode 100644 src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingKeyStateHandler.cpp delete mode 100644 src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingKeyStateHandler.h rename src/Functions/keyvaluepair/src/impl/state/strategies/escaping/{InlineEscapingValueStateHandler.cpp => InlineEscapingStateHandler.cpp} (98%) rename src/Functions/keyvaluepair/src/impl/state/strategies/escaping/{InlineEscapingValueStateHandler.h => InlineEscapingStateHandler.h} (54%) diff --git a/base/base/find_symbols.h b/base/base/find_symbols.h index 73810925aefe..c82782f5822b 100644 --- a/base/base/find_symbols.h +++ b/base/base/find_symbols.h @@ -40,7 +40,7 @@ template constexpr bool is_in(char x) { return ((x == chars) || static bool is_in(char c, const char * symbols, size_t num_chars) { - for (auto i = 0u; i < num_chars; i++) + for (size_t i = 0u; i < num_chars; ++i) { if (c == symbols[i]) { @@ -66,6 +66,43 @@ inline __m128i mm_is_in(__m128i bytes) __m128i eq = mm_is_in(bytes); return _mm_or_si128(eq0, eq); } + +inline __m128i mm_is_in(__m128i bytes, const char * symbols, size_t num_chars) +{ + __m128i accumulator = _mm_setzero_si128(); + for (size_t i = 0; i < num_chars; ++i) + { + __m128i eq = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(symbols[i])); + accumulator = _mm_or_si128(accumulator, eq); + } + + return accumulator; +} + +inline std::vector<__m128i> mm_is_in_prepare(const char * symbols, size_t num_chars) +{ + std::vector<__m128i> result; + result.reserve(num_chars); + + for (size_t i = 0; i < num_chars; ++i) + { + result.emplace_back(_mm_set1_epi8(symbols[i])); + } + + return result; +} + +inline __m128i mm_is_in_execute(__m128i bytes, const std::vector<__m128i> & needles) +{ + __m128i accumulator = _mm_setzero_si128(); + for (const auto & needle : needles) + { + __m128i eq = _mm_cmpeq_epi8(bytes, needle); + accumulator = _mm_or_si128(accumulator, eq); + } + + return accumulator; +} #endif template @@ -112,6 +149,32 @@ inline const char * find_first_symbols_sse2(const char * const begin, const char return return_mode == ReturnMode::End ? end : nullptr; } +template +inline const char * find_first_symbols_sse2(const char * const begin, const char * const end, const char * symbols, size_t num_chars) +{ + const char * pos = begin; + const auto needles = mm_is_in_prepare(symbols, num_chars); + +#if defined(__SSE2__) + for (; pos + 15 < end; pos += 16) + { + __m128i bytes = _mm_loadu_si128(reinterpret_cast(pos)); + + __m128i eq = mm_is_in_execute(bytes, needles); + + uint16_t bit_mask = maybe_negate(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) + return pos + __builtin_ctz(bit_mask); + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate(is_in(*pos, symbols, num_chars))) + return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + template inline const char * find_last_symbols_sse2(const char * const begin, const char * const end) @@ -192,21 +255,6 @@ inline const char * find_first_symbols_sse42(const char * const begin, const cha return return_mode == ReturnMode::End ? end : nullptr; } - -/// NOTE No SSE 4.2 implementation for find_last_symbols_or_null. Not worth to do. - -template -inline const char * find_first_symbols_dispatch(const char * begin, const char * end) - requires(0 <= sizeof...(symbols) && sizeof...(symbols) <= 16) -{ -#if defined(__SSE4_2__) - if (sizeof...(symbols) >= 5) - return find_first_symbols_sse42(begin, end); - else -#endif - return find_first_symbols_sse2(begin, end); -} - template inline const char * find_first_symbols_sse42(const char * const begin, const char * const end, const char * symbols, size_t num_chars) { @@ -241,10 +289,30 @@ inline const char * find_first_symbols_sse42(const char * const begin, const cha return return_mode == ReturnMode::End ? end : nullptr; } +/// NOTE No SSE 4.2 implementation for find_last_symbols_or_null. Not worth to do. + +template +inline const char * find_first_symbols_dispatch(const char * begin, const char * end) + requires(0 <= sizeof...(symbols) && sizeof...(symbols) <= 16) +{ +#if defined(__SSE4_2__) + if (sizeof...(symbols) >= 5) + return find_first_symbols_sse42(begin, end); + else +#endif + return find_first_symbols_sse2(begin, end); +} + template -auto find_first_symbols_sse42(std::string_view haystack, std::string_view symbols) +inline const char * find_first_symbols_dispatch(const std::string_view haystack, const std::string_view symbols) { - return find_first_symbols_sse42(haystack.begin(), haystack.end(), symbols.begin(), symbols.size()); + const size_t num_chars = std::max(symbols.size(), 16); +#if defined(__SSE4_2__) + if (num_chars >= 5) + return find_first_symbols_sse42(haystack.begin(), haystack.end(), symbols.begin(), num_chars); + else +#endif + return find_first_symbols_sse2(haystack.begin(), haystack.end(), symbols.begin(), num_chars); } } @@ -266,7 +334,7 @@ inline char * find_first_symbols(char * begin, char * end) inline const char * find_first_symbols(std::string_view haystack, std::string_view symbols) { - return detail::find_first_symbols_sse42(haystack, symbols); + return detail::find_first_symbols_dispatch(haystack, symbols); } template @@ -283,7 +351,7 @@ inline char * find_first_not_symbols(char * begin, char * end) inline const char * find_first_not_symbols(std::string_view haystack, std::string_view symbols) { - return detail::find_first_symbols_sse42(haystack, symbols); + return detail::find_first_symbols_dispatch(haystack, symbols); } template @@ -300,7 +368,7 @@ inline char * find_first_symbols_or_null(char * begin, char * end) inline const char * find_first_symbols_or_null(std::string_view haystack, std::string_view symbols) { - return detail::find_first_symbols_sse42(haystack, symbols); + return detail::find_first_symbols_dispatch(haystack, symbols); } template @@ -317,7 +385,7 @@ inline char * find_first_not_symbols_or_null(char * begin, char * end) inline const char * find_first_not_symbols_or_null(std::string_view haystack, std::string_view symbols) { - return detail::find_first_symbols_sse42(haystack, symbols); + return detail::find_first_symbols_dispatch(haystack, symbols); } template diff --git a/src/Common/tests/gtest_find_symbols.cpp b/src/Common/tests/gtest_find_symbols.cpp index 3c167c71fd6e..b6ed39b1591d 100644 --- a/src/Common/tests/gtest_find_symbols.cpp +++ b/src/Common/tests/gtest_find_symbols.cpp @@ -23,7 +23,7 @@ void test_find_first_not(const std::string & haystack, const std::string & symbo TEST(FindSymbols, SimpleTest) { - std::string s = "Hello, world! Goodbye..."; + const std::string s = "Hello, world! Goodbye..."; const char * begin = s.data(); const char * end = s.data() + s.size(); @@ -34,6 +34,14 @@ TEST(FindSymbols, SimpleTest) ASSERT_EQ(find_first_symbols<'H'>(begin, end), begin); ASSERT_EQ((find_first_symbols<'a', 'e'>(begin, end)), begin + 1); + // Check that nothing matches on big haystack, + ASSERT_EQ(find_first_symbols(s, "ABCDEFIJKLMNOPQRSTUVWXYZacfghijkmnpqstuvxz"), end); + // only 16 bytes of haystack are checked, so nothing is found + ASSERT_EQ(find_first_symbols(s, "ABCDEFIJKLMNOPQR0helloworld"), end); + + // 16-byte needle + ASSERT_EQ(find_first_symbols(s, "XYZ!,.GHbdelorwy"), begin + 12); + ASSERT_EQ(find_last_symbols_or_null<'a'>(begin, end), nullptr); ASSERT_EQ(find_last_symbols_or_null<'e'>(begin, end), end - 4); ASSERT_EQ(find_last_symbols_or_null<'.'>(begin, end), end - 1); @@ -54,6 +62,79 @@ TEST(FindSymbols, SimpleTest) } } +TEST(FindSymbols, RunTimeNeedle) +{ + auto test_haystack = [](const auto & haystack, const auto & unfindable_needle) { +#define TEST_HAYSTACK_AND_NEEDLE(haystack_, needle_) \ + do { \ + const auto & h = haystack_; \ + const auto & n = needle_; \ + EXPECT_EQ( \ + std::find_first_of(h.data(), h.data() + h.size(), n.data(), n.data() + n.size()), \ + find_first_symbols(h, n) \ + ) << "haystack: \"" << h << "\"" \ + << ", needle: \"" << n << "\""; \ + } \ + while (false) + + // can't find + TEST_HAYSTACK_AND_NEEDLE(haystack, unfindable_needle); + +#define test_with_modified_needle(haystack, in_needle, needle_update, with) \ + do \ + { \ + std::string needle = in_needle; \ + needle_update = with; \ + TEST_HAYSTACK_AND_NEEDLE(haystack, needle); \ + } \ + while (false) + + // findable symbol is at beginnig of the needle + // Can find at first pos of haystack + test_with_modified_needle(haystack, unfindable_needle, needle.front(), haystack.front()); + // Can find at first pos of haystack + test_with_modified_needle(haystack, unfindable_needle, needle.front(), haystack.back()); + // Can find in the middle of haystack + test_with_modified_needle(haystack, unfindable_needle, needle.front(), haystack[haystack.size() / 2]); + + // findable symbol is at end of the needle + // Can find at first pos of haystack + test_with_modified_needle(haystack, unfindable_needle, needle.back(), haystack.front()); + // Can find at first pos of haystack + test_with_modified_needle(haystack, unfindable_needle, needle.back(), haystack.back()); + // Can find in the middle of haystack + test_with_modified_needle(haystack, unfindable_needle, needle.back(), haystack[haystack.size() / 2]); + + // findable symbol is in the middle of the needle + // Can find at first pos of haystack + test_with_modified_needle(haystack, unfindable_needle, needle[needle.size() / 2], haystack.front()); + // Can find at first pos of haystack + test_with_modified_needle(haystack, unfindable_needle, needle[needle.size() / 2], haystack.back()); + // Can find in the middle of haystack + test_with_modified_needle(haystack, unfindable_needle, needle[needle.size() / 2], haystack[haystack.size() / 2]); + + }; + + // there are 4 major groups of cases: + // haystack < 16 bytes, haystack > 16 bytes + // needle < 5 bytes, needle >= 5 bytes + + // First and last symbols of haystack should be unique + const std::string long_haystack = "Hello, world! Goodbye...?"; + const std::string short_haystack = "Hello, world!"; + + // In sync with find_first_symbols_dispatch code: long needles receve special treatment. + // as of now "long" means >= 5 + const std::string unfindable_long_needle = "0123456789ABCDEF"; + const std::string unfindable_short_needle = "0123"; + + test_haystack(long_haystack, unfindable_long_needle); + test_haystack(long_haystack, unfindable_short_needle); + + test_haystack(short_haystack, unfindable_long_needle); + test_haystack(short_haystack, unfindable_short_needle); +} + TEST(FindNotSymbols, AllSymbolsPresent) { std::string str_with_17_bytes = "hello world hello"; diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingKeyStateHandler.cpp b/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingKeyStateHandler.cpp deleted file mode 100644 index 2972b3b127d8..000000000000 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingKeyStateHandler.cpp +++ /dev/null @@ -1,183 +0,0 @@ -#include "InlineEscapingKeyStateHandler.h" -#include -#include -#include - -namespace DB -{ - -InlineEscapingKeyStateHandler::InlineEscapingKeyStateHandler(Configuration configuration_) - : extractor_configuration(std::move(configuration_)) -{ - wait_needles = EscapingNeedleFactory::getWaitNeedles(extractor_configuration); - read_needles = EscapingNeedleFactory::getReadNeedles(extractor_configuration); - read_quoted_needles = EscapingNeedleFactory::getReadQuotedNeedles(extractor_configuration); -} - -NextState InlineEscapingKeyStateHandler::wait(std::string_view file, size_t pos) const -{ - BoundsSafeCharacterFinder finder; - - const auto quoting_character = extractor_configuration.quoting_character; - - while (auto character_position_opt = finder.findFirstNot(file, pos, wait_needles)) - { - auto character_position = *character_position_opt; - auto character = file[character_position]; - - if (quoting_character == character) - { - return {character_position + 1u, State::READING_QUOTED_KEY}; - } - else - { - return {character_position, State::READING_KEY}; - } - } - - return {file.size(), State::END}; -} - -/* - * I only need to iteratively copy stuff if there are escape sequences. If not, views are sufficient. - * TSKV has a nice catch for that, implementers kept an auxiliary string to hold copied characters. - * If I find a key value delimiter and that is empty, I do not need to copy? hm,m hm hm - * */ - -NextState InlineEscapingKeyStateHandler::read(std::string_view file, size_t pos, ElementType & key) const -{ - BoundsSafeCharacterFinder finder; - - const auto & [key_value_delimiter, quoting_character, pair_delimiters] - = extractor_configuration; - - key.clear(); - - /* - * Maybe modify finder return type to be the actual pos. In case of failures, it shall return pointer to the end. - * It might help updating current pos? - * */ - - while (auto character_position_opt = finder.findFirst(file, pos, read_needles)) - { - auto character_position = *character_position_opt; - auto character = file[character_position]; - auto next_pos = character_position + 1u; - - if (EscapedCharacterReader::ESCAPE_CHARACTER == character) - { - for (auto i = pos; i < character_position; i++) - { - key.push_back(file[i]); - } - - auto [next_byte_ptr, escaped_characters] = EscapedCharacterReader::read(file, character_position); - next_pos = next_byte_ptr - file.begin(); - - if (escaped_characters.empty()) - { - return {next_pos, State::WAITING_KEY}; - } - else - { - for (auto escaped_character : escaped_characters) - { - key.push_back(escaped_character); - } - } - } - else if (character == key_value_delimiter) - { - // todo try to optimize with resize and memcpy - for (auto i = pos; i < character_position; i++) - { - key.push_back(file[i]); - } - - return {next_pos, State::WAITING_VALUE}; - } - else if (std::find(pair_delimiters.begin(), pair_delimiters.end(), character) != pair_delimiters.end()) - { - return {next_pos, State::WAITING_KEY}; - } - - pos = next_pos; - } - - // might be problematic in case string reaches the end and I haven't copied anything over to key - - return {file.size(), State::END}; -} - -NextState InlineEscapingKeyStateHandler::readQuoted(std::string_view file, size_t pos, ElementType & key) const -{ - BoundsSafeCharacterFinder finder; - - const auto quoting_character = extractor_configuration.quoting_character; - - key.clear(); - - while (auto character_position_opt = finder.findFirst(file, pos, read_quoted_needles)) - { - auto character_position = *character_position_opt; - auto character = file[character_position]; - auto next_pos = character_position + 1u; - - if (character == EscapedCharacterReader::ESCAPE_CHARACTER) - { - for (auto i = pos; i < character_position; i++) - { - key.push_back(file[i]); - } - - auto [next_byte_ptr, escaped_characters] = EscapedCharacterReader::read(file, character_position); - next_pos = next_byte_ptr - file.begin(); - - if (escaped_characters.empty()) - { - return {next_pos, State::WAITING_KEY}; - } - else - { - for (auto escaped_character : escaped_characters) - { - key.push_back(escaped_character); - } - } - } - else if (quoting_character == character) - { - // todo try to optimize with resize and memcpy - for (auto i = pos; i < character_position; i++) - { - key.push_back(file[i]); - } - - if (key.empty()) - { - return {next_pos, State::WAITING_KEY}; - } - - return {next_pos, State::READING_KV_DELIMITER}; - } - - pos = next_pos; - } - - return {file.size(), State::END}; -} - -NextState InlineEscapingKeyStateHandler::readKeyValueDelimiter(std::string_view file, size_t pos) const -{ - if (pos == file.size()) - { - return {pos, State::END}; - } - else - { - const auto current_character = file[pos++]; - return {pos, extractor_configuration.key_value_delimiter == current_character ? State::WAITING_VALUE : State::WAITING_KEY}; - } -} - -} diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingKeyStateHandler.h b/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingKeyStateHandler.h deleted file mode 100644 index 2f49b9419aea..000000000000 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingKeyStateHandler.h +++ /dev/null @@ -1,34 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -namespace DB -{ - -class InlineEscapingKeyStateHandler : public StateHandler -{ -public: - using ElementType = std::string; - - explicit InlineEscapingKeyStateHandler(Configuration configuration_); - - [[nodiscard]] NextState wait(std::string_view file, size_t pos) const; - - [[nodiscard]] NextState read(std::string_view file, size_t pos, ElementType & key) const; - - [[nodiscard]] NextState readQuoted(std::string_view file, size_t pos, ElementType & key) const; - - [[nodiscard]] NextState readKeyValueDelimiter(std::string_view file, size_t pos) const; - -private: - Configuration extractor_configuration; - - std::vector wait_needles; - std::vector read_needles; - std::vector read_quoted_needles; -}; - -} diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingValueStateHandler.cpp b/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.cpp similarity index 98% rename from src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingValueStateHandler.cpp rename to src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.cpp index e6f99c4631c5..f605a1823dd6 100644 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingValueStateHandler.cpp +++ b/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.cpp @@ -3,6 +3,13 @@ #include #include +#include + +namespace +{ + +} + namespace DB { diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingValueStateHandler.h b/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.h similarity index 54% rename from src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingValueStateHandler.h rename to src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.h index 5412ef82f8ba..de8b34cff8ba 100644 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingValueStateHandler.h +++ b/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.h @@ -9,6 +9,30 @@ namespace DB { +class InlineEscapingKeyStateHandler : public StateHandler +{ +public: + using ElementType = std::string; + + explicit InlineEscapingKeyStateHandler(Configuration configuration_); + + [[nodiscard]] NextState wait(std::string_view file, size_t pos) const; + + [[nodiscard]] NextState read(std::string_view file, size_t pos, ElementType & key) const; + + [[nodiscard]] NextState readQuoted(std::string_view file, size_t pos, ElementType & key) const; + + [[nodiscard]] NextState readKeyValueDelimiter(std::string_view file, size_t pos) const; + +private: + Configuration extractor_configuration; + + std::vector wait_needles; + std::vector read_needles; + std::vector read_quoted_needles; +}; + + class InlineEscapingValueStateHandler : public StateHandler { public: From ce608b135f12128a8e750bc895e43907d0cefa7f Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Sat, 25 Mar 2023 23:18:51 +0100 Subject: [PATCH 02/12] Fixed implementation of find_first_symbols_sse42 run-time needle Added some tests --- base/base/find_symbols.h | 7 +- src/Common/tests/gtest_find_symbols.cpp | 125 ++++++++++++++++++------ 2 files changed, 100 insertions(+), 32 deletions(-) diff --git a/base/base/find_symbols.h b/base/base/find_symbols.h index c82782f5822b..ed6764f68482 100644 --- a/base/base/find_symbols.h +++ b/base/base/find_symbols.h @@ -263,7 +263,10 @@ inline const char * find_first_symbols_sse42(const char * const begin, const cha #if defined(__SSE4_2__) constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT; - const __m128i set = _mm_loadu_si128(reinterpret_cast(symbols)); + // This is to avoid read past end of `symbols` if `num_chars < 16`. + char buffer[16] = {'\0'}; + memcpy(buffer, symbols, num_chars); + const __m128i set = _mm_loadu_si128(reinterpret_cast(buffer)); for (; pos + 15 < end; pos += 16) { @@ -306,7 +309,7 @@ inline const char * find_first_symbols_dispatch(const char * begin, const char * template inline const char * find_first_symbols_dispatch(const std::string_view haystack, const std::string_view symbols) { - const size_t num_chars = std::max(symbols.size(), 16); + const size_t num_chars = std::min(symbols.size(), 16); #if defined(__SSE4_2__) if (num_chars >= 5) return find_first_symbols_sse42(haystack.begin(), haystack.end(), symbols.begin(), num_chars); diff --git a/src/Common/tests/gtest_find_symbols.cpp b/src/Common/tests/gtest_find_symbols.cpp index b6ed39b1591d..0f719382f81e 100644 --- a/src/Common/tests/gtest_find_symbols.cpp +++ b/src/Common/tests/gtest_find_symbols.cpp @@ -34,13 +34,8 @@ TEST(FindSymbols, SimpleTest) ASSERT_EQ(find_first_symbols<'H'>(begin, end), begin); ASSERT_EQ((find_first_symbols<'a', 'e'>(begin, end)), begin + 1); - // Check that nothing matches on big haystack, - ASSERT_EQ(find_first_symbols(s, "ABCDEFIJKLMNOPQRSTUVWXYZacfghijkmnpqstuvxz"), end); - // only 16 bytes of haystack are checked, so nothing is found - ASSERT_EQ(find_first_symbols(s, "ABCDEFIJKLMNOPQR0helloworld"), end); - - // 16-byte needle - ASSERT_EQ(find_first_symbols(s, "XYZ!,.GHbdelorwy"), begin + 12); + ASSERT_EQ((find_first_symbols<'a', 'e', 'w', 'x', 'z'>(begin, end)), begin + 1); + ASSERT_EQ((find_first_symbols<'p', 'q', 's', 'x', 'z'>(begin, end)), end); ASSERT_EQ(find_last_symbols_or_null<'a'>(begin, end), nullptr); ASSERT_EQ(find_last_symbols_or_null<'e'>(begin, end), end - 4); @@ -62,6 +57,54 @@ TEST(FindSymbols, SimpleTest) } } +template +inline const char * find_first_symbols_sse42_MY(const char * const begin, const char * const end, const char * symbols, size_t num_chars) +{ + using namespace detail; + const char * pos = begin; + +#if defined(__SSE4_2__) + constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT; + +#if defined(__AVX512F__) || defined(__AVX512BW__) || defined(__AVX__) || defined(__AVX2__) + +#else + // This is to avoid read past end of allocated string while loading `set` from `symbols` if `num_chars < 16`. + char buffer[16] = {'\0'}; + memcpy(buffer, symbols, num_chars); + const __m128i set = _mm_loadu_si128(reinterpret_cast(buffer)); +#endif + + for (; pos + 15 < end; pos += 16) + { + __m128i bytes = _mm_loadu_si128(reinterpret_cast(pos)); + + if constexpr (positive) + { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode)) + return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode); + } + else + { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY)) + return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY); + } + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate(is_in(*pos, symbols, num_chars))) + return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template +inline const char * find_first_symbols_MY(const char * begin, const char * end) +{ + return detail::find_first_symbols_dispatch(begin, end); +} + TEST(FindSymbols, RunTimeNeedle) { auto test_haystack = [](const auto & haystack, const auto & unfindable_needle) { @@ -72,47 +115,47 @@ TEST(FindSymbols, RunTimeNeedle) EXPECT_EQ( \ std::find_first_of(h.data(), h.data() + h.size(), n.data(), n.data() + n.size()), \ find_first_symbols(h, n) \ - ) << "haystack: \"" << h << "\"" \ + ) << "haystack: \"" << h << "\" (" << static_cast(h.data()) << ")" \ << ", needle: \"" << n << "\""; \ } \ while (false) - // can't find + // can't find needle TEST_HAYSTACK_AND_NEEDLE(haystack, unfindable_needle); -#define test_with_modified_needle(haystack, in_needle, needle_update, with) \ +#define test_with_modified_needle(haystack, in_needle, needle_update_statement) \ do \ { \ - std::string needle = in_needle; \ - needle_update = with; \ + std::string needle = (in_needle); \ + (needle_update_statement); \ + std::cerr << "!!!\tHaystack: \"" << haystack << "\" needele: \"" << needle << "\"" << std::endl; \ TEST_HAYSTACK_AND_NEEDLE(haystack, needle); \ } \ while (false) // findable symbol is at beginnig of the needle // Can find at first pos of haystack - test_with_modified_needle(haystack, unfindable_needle, needle.front(), haystack.front()); + test_with_modified_needle(haystack, unfindable_needle, needle.front() = haystack.front()); // Can find at first pos of haystack - test_with_modified_needle(haystack, unfindable_needle, needle.front(), haystack.back()); + test_with_modified_needle(haystack, unfindable_needle, needle.front() = haystack.back()); // Can find in the middle of haystack - test_with_modified_needle(haystack, unfindable_needle, needle.front(), haystack[haystack.size() / 2]); + test_with_modified_needle(haystack, unfindable_needle, needle.front() = haystack[haystack.size() / 2]); // findable symbol is at end of the needle // Can find at first pos of haystack - test_with_modified_needle(haystack, unfindable_needle, needle.back(), haystack.front()); + test_with_modified_needle(haystack, unfindable_needle, needle.back() = haystack.front()); // Can find at first pos of haystack - test_with_modified_needle(haystack, unfindable_needle, needle.back(), haystack.back()); + test_with_modified_needle(haystack, unfindable_needle, needle.back() = haystack.back()); // Can find in the middle of haystack - test_with_modified_needle(haystack, unfindable_needle, needle.back(), haystack[haystack.size() / 2]); + test_with_modified_needle(haystack, unfindable_needle, needle.back() = haystack[haystack.size() / 2]); // findable symbol is in the middle of the needle // Can find at first pos of haystack - test_with_modified_needle(haystack, unfindable_needle, needle[needle.size() / 2], haystack.front()); + test_with_modified_needle(haystack, unfindable_needle, needle[needle.size() / 2] = haystack.front()); // Can find at first pos of haystack - test_with_modified_needle(haystack, unfindable_needle, needle[needle.size() / 2], haystack.back()); + test_with_modified_needle(haystack, unfindable_needle, needle[needle.size() / 2] = haystack.back()); // Can find in the middle of haystack - test_with_modified_needle(haystack, unfindable_needle, needle[needle.size() / 2], haystack[haystack.size() / 2]); - + test_with_modified_needle(haystack, unfindable_needle, needle[needle.size() / 2] = haystack[haystack.size() / 2]); }; // there are 4 major groups of cases: @@ -120,19 +163,41 @@ TEST(FindSymbols, RunTimeNeedle) // needle < 5 bytes, needle >= 5 bytes // First and last symbols of haystack should be unique - const std::string long_haystack = "Hello, world! Goodbye...?"; - const std::string short_haystack = "Hello, world!"; + std::string long_haystack = "Hello, world! Goodbye...?"; + std::string short_haystack = "Hello, world!"; // In sync with find_first_symbols_dispatch code: long needles receve special treatment. // as of now "long" means >= 5 - const std::string unfindable_long_needle = "0123456789ABCDEF"; - const std::string unfindable_short_needle = "0123"; + std::string unfindable_long_needle = "0123456789ABCDEF"; + std::string unfindable_short_needle = "0123"; - test_haystack(long_haystack, unfindable_long_needle); - test_haystack(long_haystack, unfindable_short_needle); - test_haystack(short_haystack, unfindable_long_needle); - test_haystack(short_haystack, unfindable_short_needle); + for (size_t i = 0; i < 100; ++i) + { + { + SCOPED_TRACE("Long haystack"); + test_haystack(long_haystack, unfindable_long_needle); + test_haystack(long_haystack, unfindable_short_needle); + } + + { + SCOPED_TRACE("Short haystack"); + test_haystack(short_haystack, unfindable_long_needle); + test_haystack(short_haystack, unfindable_short_needle); + } + + // re-allocate buffers to make sure that we run tests against different addresses each time. + long_haystack.reserve(long_haystack.capacity() + 1); + short_haystack.reserve(long_haystack.capacity() + 1); + unfindable_long_needle.reserve(unfindable_long_needle.capacity() + 1); + unfindable_short_needle.reserve(unfindable_short_needle.capacity() + 1); + } + + // Check that nothing matches on big haystack, + EXPECT_EQ(find_first_symbols(long_haystack, "ABCDEFIJKLMNOPQRSTUVWXYZacfghijkmnpqstuvxz"), long_haystack.data() + long_haystack.size()); + + // only 16 bytes of haystack are checked, so nothing is found + EXPECT_EQ(find_first_symbols(long_haystack, "ABCDEFIJKLMNOPQR0helloworld"), long_haystack.data() + long_haystack.size()); } TEST(FindNotSymbols, AllSymbolsPresent) From fa27ed9d41020cc5dfafbd0c59fd98ee2f40196e Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Sat, 25 Mar 2023 23:31:17 +0100 Subject: [PATCH 03/12] Less noisy tests --- src/Common/tests/gtest_find_symbols.cpp | 58 +++++++++++-------------- 1 file changed, 25 insertions(+), 33 deletions(-) diff --git a/src/Common/tests/gtest_find_symbols.cpp b/src/Common/tests/gtest_find_symbols.cpp index 0f719382f81e..298b0670d2df 100644 --- a/src/Common/tests/gtest_find_symbols.cpp +++ b/src/Common/tests/gtest_find_symbols.cpp @@ -123,39 +123,41 @@ TEST(FindSymbols, RunTimeNeedle) // can't find needle TEST_HAYSTACK_AND_NEEDLE(haystack, unfindable_needle); -#define test_with_modified_needle(haystack, in_needle, needle_update_statement) \ +#define TEST_WITH_MODIFIED_NEEDLE(haystack, in_needle, needle_update_statement) \ do \ { \ std::string needle = (in_needle); \ (needle_update_statement); \ - std::cerr << "!!!\tHaystack: \"" << haystack << "\" needele: \"" << needle << "\"" << std::endl; \ TEST_HAYSTACK_AND_NEEDLE(haystack, needle); \ } \ while (false) // findable symbol is at beginnig of the needle // Can find at first pos of haystack - test_with_modified_needle(haystack, unfindable_needle, needle.front() = haystack.front()); + TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.front() = haystack.front()); // Can find at first pos of haystack - test_with_modified_needle(haystack, unfindable_needle, needle.front() = haystack.back()); + TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.front() = haystack.back()); // Can find in the middle of haystack - test_with_modified_needle(haystack, unfindable_needle, needle.front() = haystack[haystack.size() / 2]); + TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.front() = haystack[haystack.size() / 2]); // findable symbol is at end of the needle // Can find at first pos of haystack - test_with_modified_needle(haystack, unfindable_needle, needle.back() = haystack.front()); + TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.back() = haystack.front()); // Can find at first pos of haystack - test_with_modified_needle(haystack, unfindable_needle, needle.back() = haystack.back()); + TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.back() = haystack.back()); // Can find in the middle of haystack - test_with_modified_needle(haystack, unfindable_needle, needle.back() = haystack[haystack.size() / 2]); + TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.back() = haystack[haystack.size() / 2]); // findable symbol is in the middle of the needle // Can find at first pos of haystack - test_with_modified_needle(haystack, unfindable_needle, needle[needle.size() / 2] = haystack.front()); + TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle[needle.size() / 2] = haystack.front()); // Can find at first pos of haystack - test_with_modified_needle(haystack, unfindable_needle, needle[needle.size() / 2] = haystack.back()); + TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle[needle.size() / 2] = haystack.back()); // Can find in the middle of haystack - test_with_modified_needle(haystack, unfindable_needle, needle[needle.size() / 2] = haystack[haystack.size() / 2]); + TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle[needle.size() / 2] = haystack[haystack.size() / 2]); + +#undef TEST_WITH_MODIFIED_NEEDLE +#undef TEST_HAYSTACK_AND_NEEDLE }; // there are 4 major groups of cases: @@ -163,34 +165,24 @@ TEST(FindSymbols, RunTimeNeedle) // needle < 5 bytes, needle >= 5 bytes // First and last symbols of haystack should be unique - std::string long_haystack = "Hello, world! Goodbye...?"; - std::string short_haystack = "Hello, world!"; + const std::string long_haystack = "Hello, world! Goodbye...?"; + const std::string short_haystack = "Hello, world!"; // In sync with find_first_symbols_dispatch code: long needles receve special treatment. // as of now "long" means >= 5 - std::string unfindable_long_needle = "0123456789ABCDEF"; - std::string unfindable_short_needle = "0123"; - + const std::string unfindable_long_needle = "0123456789ABCDEF"; + const std::string unfindable_short_needle = "0123"; - for (size_t i = 0; i < 100; ++i) { - { - SCOPED_TRACE("Long haystack"); - test_haystack(long_haystack, unfindable_long_needle); - test_haystack(long_haystack, unfindable_short_needle); - } - - { - SCOPED_TRACE("Short haystack"); - test_haystack(short_haystack, unfindable_long_needle); - test_haystack(short_haystack, unfindable_short_needle); - } + SCOPED_TRACE("Long haystack"); + test_haystack(long_haystack, unfindable_long_needle); + test_haystack(long_haystack, unfindable_short_needle); + } - // re-allocate buffers to make sure that we run tests against different addresses each time. - long_haystack.reserve(long_haystack.capacity() + 1); - short_haystack.reserve(long_haystack.capacity() + 1); - unfindable_long_needle.reserve(unfindable_long_needle.capacity() + 1); - unfindable_short_needle.reserve(unfindable_short_needle.capacity() + 1); + { + SCOPED_TRACE("Short haystack"); + test_haystack(short_haystack, unfindable_long_needle); + test_haystack(short_haystack, unfindable_short_needle); } // Check that nothing matches on big haystack, From 7854800517047e1b94e138de7e65c36715fef28e Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Mon, 27 Mar 2023 19:35:22 +0200 Subject: [PATCH 04/12] Almost works --- .../keyvaluepair/src/KeyValuePairExtractor.h | 4 +- .../src/KeyValuePairExtractorBuilder.cpp | 4 +- .../src/KeyValuePairExtractorBuilder.h | 6 +- .../src/impl/CHKeyValuePairExtractor.h | 101 +++++-- .../src/impl/state/Configuration.cpp | 11 +- .../src/impl/state/StateHandler.h | 67 ++++- .../escaping/InlineEscapingStateHandler.cpp | 253 ++++++++++++------ .../escaping/InlineEscapingStateHandler.h | 14 +- .../noescaping/NoEscapingKeyStateHandler.cpp | 25 +- .../noescaping/NoEscapingKeyStateHandler.h | 8 +- .../NoEscapingValueStateHandler.cpp | 10 +- .../noescaping/NoEscapingValueStateHandler.h | 6 +- .../state/strategies/util/CharacterFinder.cpp | 8 +- .../util/EscapedCharacterReader.cpp | 36 --- .../strategies/util/EscapedCharacterReader.h | 33 --- .../state/strategies/util/NeedleFactory.cpp | 7 +- .../tests/gtest_character_finder.cpp | 4 +- .../tests/gtest_escaped_sequence_reader.cpp | 114 ++++---- ...test_escaping_key_value_pair_extractor.cpp | 2 +- ...test_inline_escaping_key_state_handler.cpp | 14 +- ...st_inline_escaping_value_state_handler.cpp | 6 +- .../tests/gtest_key_value_pair_extractor.cpp | 12 +- .../gtest_no_escaping_key_state_handler.cpp | 10 +- 23 files changed, 448 insertions(+), 307 deletions(-) delete mode 100644 src/Functions/keyvaluepair/src/impl/state/strategies/util/EscapedCharacterReader.cpp delete mode 100644 src/Functions/keyvaluepair/src/impl/state/strategies/util/EscapedCharacterReader.h diff --git a/src/Functions/keyvaluepair/src/KeyValuePairExtractor.h b/src/Functions/keyvaluepair/src/KeyValuePairExtractor.h index 311c14a80e14..72f2bc0c955e 100644 --- a/src/Functions/keyvaluepair/src/KeyValuePairExtractor.h +++ b/src/Functions/keyvaluepair/src/KeyValuePairExtractor.h @@ -1,8 +1,10 @@ #pragma once -#include #include +#include +#include + namespace DB { /* diff --git a/src/Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.cpp b/src/Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.cpp index 015f98aa6eea..8add46530968 100644 --- a/src/Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.cpp +++ b/src/Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.cpp @@ -1,6 +1,6 @@ #include "KeyValuePairExtractorBuilder.h" -#include "Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingKeyStateHandler.h" -#include "Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingValueStateHandler.h" + +#include "Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.h" #include "Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.h" #include "Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingValueStateHandler.h" #include "impl/CHKeyValuePairExtractor.h" diff --git a/src/Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.h b/src/Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.h index da04c1c93d5b..781e75cf73b8 100644 --- a/src/Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.h +++ b/src/Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.h @@ -3,11 +3,15 @@ #include "Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.h" #include "Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingValueStateHandler.h" #include "KeyValuePairExtractor.h" -#include "impl/CHKeyValuePairExtractor.h" +//#include "impl/CHKeyValuePairExtractor.h" + +#include namespace DB { +struct KeyValuePairExtractor; + class KeyValuePairExtractorBuilder { public: diff --git a/src/Functions/keyvaluepair/src/impl/CHKeyValuePairExtractor.h b/src/Functions/keyvaluepair/src/impl/CHKeyValuePairExtractor.h index 5f0ff4614de1..2c1c78a62330 100644 --- a/src/Functions/keyvaluepair/src/impl/CHKeyValuePairExtractor.h +++ b/src/Functions/keyvaluepair/src/impl/CHKeyValuePairExtractor.h @@ -3,9 +3,12 @@ #include #include #include +#include "Functions/keyvaluepair/src/impl/state/State.h" +#include "fmt/core.h" #include "state/StateHandler.h" #include "../KeyValuePairExtractor.h" +#include namespace DB { @@ -28,68 +31,124 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor uint64_t extract(std::string_view data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values) override { - auto state = State::WAITING_KEY; - - std::size_t pos = 0; +// std::cerr << "CHKeyValuePairExtractor::extract: \"" << data << "\"" << std::endl; + auto state = State::WAITING_KEY; Key key; Value value; uint64_t row_offset = 0; - + const auto & config = key_state_handler.extractor_configuration; + std::cerr << "CHKeyValuePairExtractor::extract with " + << typeid(key_state_handler).name() << " \\ " + << typeid(value_state_handler).name() + << "\nConfiguration" + << "\n\tKV delimiter: '" << config.key_value_delimiter << "'" + << "\n\tquote char : '" << config.quoting_character << "'" + << "\n\tpair delims : " << fmt::format("['{}']", fmt::join(config.pair_delimiters, "', '")) + << std::endl; + + NextState next_state = {0, state}; while (state != State::END) { - auto next_state = processState(data, pos, state, key, value, keys, values, row_offset); - - pos = next_state.position_in_string; - state = next_state.state; + std::cerr << "CHKeyValuePairExtractor::extract 1, state: " + << magic_enum::enum_name(state) + << " (" << data.size() << ") " + << fancyQuote(data) + << std::endl; + + next_state = processState(data, state, key, value, keys, values, row_offset); + + std::cerr << "CHKeyValuePairExtractor::extract 2, new_state: " + << magic_enum::enum_name(next_state.state) + << " consumed chars: (" << next_state.position_in_string << ") " + << fancyQuote(data.substr(0, std::min(data.size(), next_state.position_in_string))) + << std::endl; + + if (next_state.position_in_string > data.size() && next_state.state != State::END) { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Attempt to move read poiter past end of available data, from state {} to new state: {}, new position: {}, available data: {}", + magic_enum::enum_name(state), magic_enum::enum_name(next_state.state), + next_state.position_in_string, data.size()); +// next_result = {data.size(), State::END}; + } + + data.remove_prefix(next_state.position_in_string); + state = next_state.state; + + if (data.size() == 0) + state = State::END; } + // if break occured earlier, consume previously generated pair + if (next_state.state == State::FLUSH_PAIR || !(key.empty() && value.empty())) + flushPair(data, key, value, keys, values, row_offset); + return row_offset; } private: - NextState processState(std::string_view file, std::size_t pos, State state, Key & key, + NextState processState(std::string_view file, State state, Key & key, Value & value, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values, uint64_t & row_offset) { switch (state) { case State::WAITING_KEY: - return key_state_handler.wait(file, pos); + return key_state_handler.wait(file); case State::READING_KEY: - return key_state_handler.read(file, pos, key); + { + auto result = key_state_handler.read(file, key); + std::cerr << "CHKeyValuePairExtractor::processState key: " << fancyQuote(key) << std::endl; + return result; + } case State::READING_QUOTED_KEY: - return key_state_handler.readQuoted(file, pos, key); + { + auto result = key_state_handler.readQuoted(file, key); + std::cerr << "CHKeyValuePairExtractor::processState key: " << fancyQuote(key) << std::endl; + return result; + } case State::READING_KV_DELIMITER: - return key_state_handler.readKeyValueDelimiter(file, pos); + return key_state_handler.readKeyValueDelimiter(file); case State::WAITING_VALUE: - return value_state_handler.wait(file, pos); + { + return value_state_handler.wait(file); + } case State::READING_VALUE: - return value_state_handler.read(file, pos, value); + { + auto result = value_state_handler.read(file, value); + std::cerr << "CHKeyValuePairExtractor::processState value: " << fancyQuote(value) << std::endl; + return result; + } case State::READING_QUOTED_VALUE: - return value_state_handler.readQuoted(file, pos, value); + { + auto result = value_state_handler.readQuoted(file, value); + std::cerr << "CHKeyValuePairExtractor::processState value: " << fancyQuote(value) << std::endl; + return result; + } case State::FLUSH_PAIR: - return flushPair(file, pos, key, value, keys, values, row_offset); + return flushPair(file, key, value, keys, values, row_offset); case END: - return {pos, state}; + return {0, state}; } } - NextState flushPair(const std::string_view & file, std::size_t pos, Key & key, + NextState flushPair(const std::string_view & file, Key & key, Value & value, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values, uint64_t & row_offset) { + std::cerr << "CHKeyValuePairExtractor::flushPair key: " << fancyQuote(key) << ", value: " << fancyQuote(value) << std::endl; keys->insertData(key.data(), key.size()); values->insertData(value.data(), value.size()); key = {}; value = {}; - row_offset++; + ++row_offset; + std::cerr << "CHKeyValuePairExtractor::flushPair total pairs: " << row_offset << std::endl; - return {pos, pos == file.size() ? State::END : State::WAITING_KEY}; + return {0, file.size() == 0 ? State::END : State::WAITING_KEY}; } KeyStateHandler key_state_handler; diff --git a/src/Functions/keyvaluepair/src/impl/state/Configuration.cpp b/src/Functions/keyvaluepair/src/impl/state/Configuration.cpp index 9f69652b8937..547b924e1df5 100644 --- a/src/Functions/keyvaluepair/src/impl/state/Configuration.cpp +++ b/src/Functions/keyvaluepair/src/impl/state/Configuration.cpp @@ -1,8 +1,6 @@ #include "Configuration.h" #include -#include - namespace DB { @@ -25,14 +23,15 @@ Configuration ConfigurationFactory::createWithoutEscaping(char key_value_delimit Configuration ConfigurationFactory::createWithEscaping(char key_value_delimiter, char quoting_character, std::vector pair_delimiters) { - if (key_value_delimiter == EscapedCharacterReader::ESCAPE_CHARACTER - || quoting_character == EscapedCharacterReader::ESCAPE_CHARACTER - || std::find(pair_delimiters.begin(), pair_delimiters.end(), EscapedCharacterReader::ESCAPE_CHARACTER) != pair_delimiters.end()) + constexpr char ESCAPE_CHARACTER = '\\'; + if (key_value_delimiter == ESCAPE_CHARACTER + || quoting_character == ESCAPE_CHARACTER + || std::find(pair_delimiters.begin(), pair_delimiters.end(), ESCAPE_CHARACTER) != pair_delimiters.end()) { throw Exception( ErrorCodes::BAD_ARGUMENTS, "Invalid arguments, {} is reserved for the escaping character", - EscapedCharacterReader::ESCAPE_CHARACTER); + ESCAPE_CHARACTER); } return createWithoutEscaping(key_value_delimiter, quoting_character, pair_delimiters); diff --git a/src/Functions/keyvaluepair/src/impl/state/StateHandler.h b/src/Functions/keyvaluepair/src/impl/state/StateHandler.h index 21f5ac3640cf..ae2bbfc61241 100644 --- a/src/Functions/keyvaluepair/src/impl/state/StateHandler.h +++ b/src/Functions/keyvaluepair/src/impl/state/StateHandler.h @@ -5,25 +5,27 @@ #include #include "State.h" +#include + namespace DB { template concept CInlineEscapingKeyStateHandler = requires(KeyStateHandler handler) { - { handler.wait(std::string_view {}, std::size_t {}) } -> std::same_as; - { handler.read(std::string_view {}, std::size_t {}, std::declval()) } -> std::same_as; - { handler.readQuoted(std::string_view {}, std::size_t {}, std::declval()) } -> std::same_as; - { handler.readKeyValueDelimiter(std::string_view {}, std::size_t {}) } -> std::same_as; + { handler.wait(std::string_view {}) } -> std::same_as; + { handler.read(std::string_view {}, std::declval()) } -> std::same_as; + { handler.readQuoted(std::string_view {}, std::declval()) } -> std::same_as; + { handler.readKeyValueDelimiter(std::string_view {}) } -> std::same_as; }; template concept CNoEscapingKeyStateHandler = requires(KeyStateHandler handler) { - { handler.wait(std::string_view {}, std::size_t {}) } -> std::same_as; - { handler.read(std::string_view {}, std::size_t {}, std::declval()) } -> std::same_as; - { handler.readQuoted(std::string_view {}, std::size_t {}, std::declval()) } -> std::same_as; - { handler.readKeyValueDelimiter(std::string_view {}, std::size_t {}) } -> std::same_as; + { handler.wait(std::string_view {}) } -> std::same_as; + { handler.read(std::string_view {}, std::declval()) } -> std::same_as; + { handler.readQuoted(std::string_view {}, std::declval()) } -> std::same_as; + { handler.readKeyValueDelimiter(std::string_view {}) } -> std::same_as; }; template @@ -32,17 +34,17 @@ concept CKeyStateHandler = CInlineEscapingKeyStateHandler || CNoEscapingKeySt template concept CInlineEscapingValueStateHandler = requires(ValueStateHandler handler) { - { handler.wait(std::string_view {}, std::size_t {}) } -> std::same_as; - { handler.read(std::string_view {}, std::size_t {}, std::declval()) } -> std::same_as; - { handler.readQuoted(std::string_view {}, std::size_t {}, std::declval()) } -> std::same_as; + { handler.wait(std::string_view {}) } -> std::same_as; + { handler.read(std::string_view {}, std::declval()) } -> std::same_as; + { handler.readQuoted(std::string_view {}, std::declval()) } -> std::same_as; }; template concept CNoEscapingValueStateHandler = requires(ValueStateHandler handler) { - { handler.wait(std::string_view {}, std::size_t {}) } -> std::same_as; - { handler.read(std::string_view {}, std::size_t {}, std::declval()) } -> std::same_as; - { handler.readQuoted(std::string_view {}, std::size_t {}, std::declval()) } -> std::same_as; + { handler.wait(std::string_view {}) } -> std::same_as; + { handler.read(std::string_view {}, std::declval()) } -> std::same_as; + { handler.readQuoted(std::string_view {}, std::declval()) } -> std::same_as; }; template @@ -59,4 +61,41 @@ struct StateHandler [[nodiscard]] static std::string_view createElement(std::string_view file, std::size_t begin, std::size_t end); }; +template +struct CustomQuoted +{ + const char * start_quote = "\""; + const char * end_quote = "\""; + + const T & value; +}; + +template +CustomQuoted customQuote(const char * start_quote, const T & value, const char * end_quote = nullptr) +{ + assert(start_quote != nullptr); + + return CustomQuoted{ + .start_quote = start_quote, + .end_quote = end_quote ? end_quote : start_quote, + .value = value + }; +} + +template +CustomQuoted fancyQuote(const T & value) +{ + return CustomQuoted{ + .start_quote = "«", + .end_quote = "»", + .value = value + }; +} + +template +std::ostream & operator<<(std::ostream & ostr, const CustomQuoted & val) +{ + return ostr << val.start_quote << val.value << val.end_quote; +} + } diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.cpp b/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.cpp index f605a1823dd6..a68e3f06e615 100644 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.cpp +++ b/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.cpp @@ -1,18 +1,160 @@ -#include "InlineEscapingValueStateHandler.h" +#include "InlineEscapingStateHandler.h" #include -#include #include #include +#include +#include namespace { +size_t consumeWithEscapeSequence(std::string_view file, size_t start_pos, size_t character_pos, std::string & output) +{ + output.insert(output.end(), file.begin() + start_pos, file.begin() + character_pos); + + DB::ReadBufferFromMemory buf(file.begin() + character_pos, file.size() - character_pos); + DB::parseComplexEscapeSequence(output, buf); + return buf.getPosition(); +} } namespace DB { +InlineEscapingKeyStateHandler::InlineEscapingKeyStateHandler(Configuration configuration_) + : extractor_configuration(std::move(configuration_)) +{ + wait_needles = EscapingNeedleFactory::getWaitNeedles(extractor_configuration); + read_needles = EscapingNeedleFactory::getReadNeedles(extractor_configuration); + read_quoted_needles = EscapingNeedleFactory::getReadQuotedNeedles(extractor_configuration); +} + +NextState InlineEscapingKeyStateHandler::wait(std::string_view file) const +{ + const auto quoting_character = extractor_configuration.quoting_character; + size_t pos = 0; + while (const auto * p = find_first_not_symbols_or_null({file.begin() + pos, file.end()}, wait_needles)) + { + const size_t character_position = p - file.begin(); + + if (*p == quoting_character) + { + return {character_position + 1u, State::READING_QUOTED_KEY}; + } + else + { + return {character_position, State::READING_KEY}; + } + } + + return {file.size(), State::END}; +} + +/* + * I only need to iteratively copy stuff if there are escape sequences. If not, views are sufficient. + * TSKV has a nice catch for that, implementers kept an auxiliary string to hold copied characters. + * If I find a key value delimiter and that is empty, I do not need to copy? hm,m hm hm + * */ + +NextState InlineEscapingKeyStateHandler::read(std::string_view file, ElementType & key) const +{ + const auto & [key_value_delimiter, quoting_character, pair_delimiters] = extractor_configuration; + + key.clear(); + + size_t pos = 0; + while (const auto * p = find_first_symbols_or_null({file.begin() + pos, file.end()}, read_needles)) + { + auto character_position = p - file.begin(); + size_t next_pos = character_position + 1u; + + if (*p == '\\') + { + const size_t escape_seq_len = consumeWithEscapeSequence(file, pos, character_position, key); + next_pos = character_position + escape_seq_len; + if (escape_seq_len == 0) + { + return {next_pos, State::WAITING_KEY}; + } + } + else if (*p == key_value_delimiter) + { + key.insert(key.end(), file.begin() + pos, file.begin() + character_position); + + return {next_pos, State::WAITING_VALUE}; + } + else if (std::find(pair_delimiters.begin(), pair_delimiters.end(), *p) != pair_delimiters.end()) + { + return {next_pos, State::WAITING_KEY}; + } + + pos = next_pos; + } + + // might be problematic in case string reaches the end and I haven't copied anything over to key + + return {file.size(), State::END}; +} + +NextState InlineEscapingKeyStateHandler::readQuoted(std::string_view file, ElementType & key) const +{ + const auto quoting_character = extractor_configuration.quoting_character; + + key.clear(); + + size_t pos = 0; + while (const auto * p = find_first_symbols_or_null({file.begin() + pos, file.end()}, read_quoted_needles)) + { + size_t character_position = p - file.begin(); + size_t next_pos = character_position + 1u; + + if (*p == '\\') + { + const size_t escape_seq_len = consumeWithEscapeSequence(file, pos, character_position, key); + next_pos = character_position + escape_seq_len; + + if (escape_seq_len == 0) + { + return {next_pos, State::WAITING_KEY}; + } + } + else if (*p == quoting_character) + { + // todo try to optimize with resize and memcpy + for (size_t i = pos; i < character_position; ++i) + { + key.push_back(file[i]); + } + + if (key.empty()) + { + return {next_pos, State::WAITING_KEY}; + } + + return {next_pos, State::READING_KV_DELIMITER}; + } + + pos = next_pos; + } + + return {file.size(), State::END}; +} + +NextState InlineEscapingKeyStateHandler::readKeyValueDelimiter(std::string_view file) const +{ + if (file.size() == 0) + { + return {0, State::END}; + } + else + { + const auto current_character = file[0]; + return {0, extractor_configuration.key_value_delimiter == current_character ? State::WAITING_VALUE : State::WAITING_KEY}; + } +} + + InlineEscapingValueStateHandler::InlineEscapingValueStateHandler(Configuration extractor_configuration_) : extractor_configuration(std::move(extractor_configuration_)) { @@ -20,11 +162,11 @@ InlineEscapingValueStateHandler::InlineEscapingValueStateHandler(Configuration e read_quoted_needles = EscapingNeedleFactory::getReadQuotedNeedles(extractor_configuration); } -NextState InlineEscapingValueStateHandler::wait(std::string_view file, size_t pos) const +NextState InlineEscapingValueStateHandler::wait(std::string_view file) const { - const auto & [key_value_delimiter, quoting_character, pair_delimiters] - = extractor_configuration; + const auto & [key_value_delimiter, quoting_character, pair_delimiters] = extractor_configuration; + size_t pos = 0; if (pos < file.size()) { const auto current_character = file[pos]; @@ -46,59 +188,36 @@ NextState InlineEscapingValueStateHandler::wait(std::string_view file, size_t po return {pos, State::READING_VALUE}; } -NextState InlineEscapingValueStateHandler::read(std::string_view file, size_t pos, ElementType & value) const +NextState InlineEscapingValueStateHandler::read(std::string_view file, ElementType & value) const { - BoundsSafeCharacterFinder finder; - - const auto & [key_value_delimiter, quoting_character, pair_delimiters] - = extractor_configuration; + const auto & [key_value_delimiter, quoting_character, pair_delimiters] = extractor_configuration; value.clear(); - /* - * Maybe modify finder return type to be the actual pos. In case of failures, it shall return pointer to the end. - * It might help updating current pos? - * */ - - while (auto character_position_opt = finder.findFirst(file, pos, read_needles)) + size_t pos = 0; + while (const auto * p = find_first_symbols_or_null({file.begin() + pos, file.end()}, read_needles)) { - auto character_position = *character_position_opt; - auto character = file[character_position]; - auto next_pos = character_position + 1u; + auto character_position = p - file.begin(); + size_t next_pos = character_position + 1u; - if (EscapedCharacterReader::ESCAPE_CHARACTER == character) + if (*p == '\\') { - for (auto i = pos; i < character_position; i++) - { - value.push_back(file[i]); - } - - auto [next_byte_ptr, escaped_characters] = EscapedCharacterReader::read(file, character_position); - next_pos = next_byte_ptr - file.begin(); - - if (escaped_characters.empty()) + const size_t escape_seq_len = consumeWithEscapeSequence(file, pos, character_position, value); + next_pos = character_position + escape_seq_len; + if (escape_seq_len == 0) { return {next_pos, State::WAITING_KEY}; } - else - { - for (auto escaped_character : escaped_characters) - { - value.push_back(escaped_character); - } - } } - else if (key_value_delimiter == character) + else if (*p == key_value_delimiter) { + // reached new key return {next_pos, State::WAITING_KEY}; } - else if (std::find(pair_delimiters.begin(), pair_delimiters.end(), character) != pair_delimiters.end()) + else if (std::find(pair_delimiters.begin(), pair_delimiters.end(), *p) != pair_delimiters.end()) { - // todo try to optimize with resize and memcpy - for (auto i = pos; i < character_position; i++) - { - value.push_back(file[i]); - } + // reached next pair + value.insert(value.end(), file.begin() + pos, file.begin() + character_position); return {next_pos, State::FLUSH_PAIR}; } @@ -106,57 +225,37 @@ NextState InlineEscapingValueStateHandler::read(std::string_view file, size_t po pos = next_pos; } - for (; pos < file.size(); pos++) - { - value.push_back(file[pos]); - } + // Reached end of input, consume rest of the file as value and make sure KV pair is produced. + value.insert(value.end(), file.begin() + pos, file.end()); return {pos, State::FLUSH_PAIR}; } -NextState InlineEscapingValueStateHandler::readQuoted(std::string_view file, size_t pos, ElementType & value) const +NextState InlineEscapingValueStateHandler::readQuoted(std::string_view file, ElementType & value) const { - BoundsSafeCharacterFinder finder; - const auto quoting_character = extractor_configuration.quoting_character; + const std::string_view needles{read_quoted_needles.begin(), read_quoted_needles.end()}; value.clear(); - while (auto character_position_opt = finder.findFirst(file, pos, read_quoted_needles)) + size_t pos = 0; + while (const auto * p = find_first_symbols_or_null({file.begin() + pos, file.end()}, needles)) { - auto character_position = *character_position_opt; - auto character = file[character_position]; - auto next_pos = character_position + 1u; + auto character_position = p - file.begin(); + size_t next_pos = character_position + 1u; - if (character == EscapedCharacterReader::ESCAPE_CHARACTER) + if (*p == '\\') { - for (auto i = pos; i < character_position; i++) - { - value.push_back(file[i]); - } - - auto [next_byte_ptr, escaped_characters] = EscapedCharacterReader::read(file, character_position); - next_pos = next_byte_ptr - file.begin(); - - if (escaped_characters.empty()) + const size_t escape_seq_len = consumeWithEscapeSequence(file, pos, character_position, value); + next_pos = character_position + escape_seq_len; + if (escape_seq_len == 0) { return {next_pos, State::WAITING_KEY}; } - else - { - for (auto escaped_character : escaped_characters) - { - value.push_back(escaped_character); - } - } } - else if (quoting_character == character) + else if (*p == quoting_character) { - // todo try to optimize with resize and memcpy - for (auto i = pos; i < character_position; i++) - { - value.push_back(file[i]); - } + value.insert(value.end(), file.begin() + pos, file.begin() + character_position); return {next_pos, State::FLUSH_PAIR}; } diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.h b/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.h index de8b34cff8ba..23bbd062284a 100644 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.h +++ b/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.h @@ -16,13 +16,13 @@ class InlineEscapingKeyStateHandler : public StateHandler explicit InlineEscapingKeyStateHandler(Configuration configuration_); - [[nodiscard]] NextState wait(std::string_view file, size_t pos) const; + [[nodiscard]] NextState wait(std::string_view file) const; - [[nodiscard]] NextState read(std::string_view file, size_t pos, ElementType & key) const; + [[nodiscard]] NextState read(std::string_view file, ElementType & key) const; - [[nodiscard]] NextState readQuoted(std::string_view file, size_t pos, ElementType & key) const; + [[nodiscard]] NextState readQuoted(std::string_view file, ElementType & key) const; - [[nodiscard]] NextState readKeyValueDelimiter(std::string_view file, size_t pos) const; + [[nodiscard]] NextState readKeyValueDelimiter(std::string_view file) const; private: Configuration extractor_configuration; @@ -40,11 +40,11 @@ class InlineEscapingValueStateHandler : public StateHandler explicit InlineEscapingValueStateHandler(Configuration extractor_configuration_); - [[nodiscard]] NextState wait(std::string_view file, size_t pos) const; + [[nodiscard]] NextState wait(std::string_view file) const; - [[nodiscard]] NextState read(std::string_view file, size_t pos, ElementType & value) const; + [[nodiscard]] NextState read(std::string_view file, ElementType & value) const; - [[nodiscard]] NextState readQuoted(std::string_view file, size_t pos, ElementType & value) const; + [[nodiscard]] NextState readQuoted(std::string_view file, ElementType & value) const; private: Configuration extractor_configuration; diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.cpp b/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.cpp index 80c91e7279a7..eeffef881e6e 100644 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.cpp +++ b/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.cpp @@ -13,12 +13,12 @@ NoEscapingKeyStateHandler::NoEscapingKeyStateHandler(Configuration extractor_con read_quoted_needles = NeedleFactory::getReadQuotedNeedles(extractor_configuration); } -NextState NoEscapingKeyStateHandler::wait(std::string_view file, size_t pos) const +NextState NoEscapingKeyStateHandler::wait(std::string_view file) const { BoundsSafeCharacterFinder finder; const auto quoting_character = extractor_configuration.quoting_character; - + size_t pos = 0; while (auto character_position_opt = finder.findFirstNot(file, pos, wait_needles)) { auto character_position = *character_position_opt; @@ -37,7 +37,7 @@ NextState NoEscapingKeyStateHandler::wait(std::string_view file, size_t pos) con return {file.size(), State::END}; } -NextState NoEscapingKeyStateHandler::read(std::string_view file, size_t pos, ElementType & key) const +NextState NoEscapingKeyStateHandler::read(std::string_view file, ElementType & key) const { BoundsSafeCharacterFinder finder; @@ -46,6 +46,7 @@ NextState NoEscapingKeyStateHandler::read(std::string_view file, size_t pos, Ele key = {}; + size_t pos = 0; auto start_index = pos; while (auto character_position_opt = finder.findFirst(file, pos, read_needles)) @@ -76,7 +77,7 @@ NextState NoEscapingKeyStateHandler::read(std::string_view file, size_t pos, Ele return {file.size(), State::END}; } -NextState NoEscapingKeyStateHandler::readQuoted(std::string_view file, size_t pos, ElementType & key) const +NextState NoEscapingKeyStateHandler::readQuoted(std::string_view file, ElementType & key) const { BoundsSafeCharacterFinder finder; @@ -84,6 +85,7 @@ NextState NoEscapingKeyStateHandler::readQuoted(std::string_view file, size_t po key = {}; + size_t pos = 0; auto start_index = pos; while (auto character_position_opt = finder.findFirst(file, pos, read_quoted_needles)) @@ -110,17 +112,20 @@ NextState NoEscapingKeyStateHandler::readQuoted(std::string_view file, size_t po return {file.size(), State::END}; } -NextState NoEscapingKeyStateHandler::readKeyValueDelimiter(std::string_view file, size_t pos) const +NextState NoEscapingKeyStateHandler::readKeyValueDelimiter(std::string_view file) const { - if (pos == file.size()) + if (file.size() == 0) { - return {pos, State::END}; + return {0, State::END}; } - else + + const auto current_character = file[0]; + if (current_character == extractor_configuration.key_value_delimiter) { - const auto current_character = file[pos++]; - return {pos, extractor_configuration.key_value_delimiter == current_character ? State::WAITING_VALUE : State::WAITING_KEY}; + return {1, State::WAITING_VALUE}; } + + return {0, State::WAITING_KEY}; } } diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.h b/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.h index ad53272e110c..e47eb482aecd 100644 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.h +++ b/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.h @@ -13,13 +13,13 @@ class NoEscapingKeyStateHandler : public StateHandler explicit NoEscapingKeyStateHandler(Configuration extractor_configuration_); - [[nodiscard]] NextState wait(std::string_view file, size_t pos) const; + [[nodiscard]] NextState wait(std::string_view file) const; - [[nodiscard]] NextState read(std::string_view file, size_t pos, ElementType & key) const; + [[nodiscard]] NextState read(std::string_view file, ElementType & key) const; - [[nodiscard]] NextState readQuoted(std::string_view file, size_t pos, ElementType & key) const; + [[nodiscard]] NextState readQuoted(std::string_view file, ElementType & key) const; - [[nodiscard]] NextState readKeyValueDelimiter(std::string_view file, size_t pos) const; + [[nodiscard]] NextState readKeyValueDelimiter(std::string_view file) const; private: Configuration extractor_configuration; diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingValueStateHandler.cpp b/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingValueStateHandler.cpp index 76b12dab19b1..7bc90e9c7e0b 100644 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingValueStateHandler.cpp +++ b/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingValueStateHandler.cpp @@ -12,11 +12,12 @@ NoEscapingValueStateHandler::NoEscapingValueStateHandler(Configuration extractor read_quoted_needles = NeedleFactory::getReadQuotedNeedles(extractor_configuration); } -NextState NoEscapingValueStateHandler::wait(std::string_view file, size_t pos) const +NextState NoEscapingValueStateHandler::wait(std::string_view file) const { const auto & [key_value_delimiter, quoting_character, pair_delimiters] = extractor_configuration; + size_t pos = 0; if (pos < file.size()) { const auto current_character = file[pos]; @@ -38,8 +39,9 @@ NextState NoEscapingValueStateHandler::wait(std::string_view file, size_t pos) c return {file.size(), State::READING_VALUE}; } -NextState NoEscapingValueStateHandler::read(std::string_view file, size_t pos, ElementType & value) const +NextState NoEscapingValueStateHandler::read(std::string_view file, ElementType & value) const { + size_t pos = 0; auto start_index = pos; value = {}; @@ -74,8 +76,9 @@ NextState NoEscapingValueStateHandler::read(std::string_view file, size_t pos, E return {file.size(), State::FLUSH_PAIR}; } -NextState NoEscapingValueStateHandler::readQuoted(std::string_view file, size_t pos, ElementType & value) const +NextState NoEscapingValueStateHandler::readQuoted(std::string_view file, ElementType & value) const { + size_t pos = 0; auto start_index = pos; value = {}; @@ -94,6 +97,7 @@ NextState NoEscapingValueStateHandler::readQuoted(std::string_view file, size_t { value = createElement(file, start_index, character_position); + std::cerr << "NoEscapingValueStateHandler::readQuoted Going to consume up to: «" << fancyQuote(file.substr(0, next_pos)) << " to " << fancyQuote(file.substr(next_pos)) << std::endl; return {next_pos, State::FLUSH_PAIR}; } diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingValueStateHandler.h b/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingValueStateHandler.h index 1a58da9c1f91..25901156ca67 100644 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingValueStateHandler.h +++ b/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingValueStateHandler.h @@ -14,11 +14,11 @@ class NoEscapingValueStateHandler : public StateHandler explicit NoEscapingValueStateHandler(Configuration extractor_configuration_); - [[nodiscard]] NextState wait(std::string_view file, size_t pos) const; + [[nodiscard]] NextState wait(std::string_view file) const; - [[nodiscard]] NextState read(std::string_view file, size_t pos, ElementType & value) const; + [[nodiscard]] NextState read(std::string_view file, ElementType & value) const; - [[nodiscard]] NextState readQuoted(std::string_view file, size_t pos, ElementType & value) const; + [[nodiscard]] NextState readQuoted(std::string_view file, ElementType & value) const; private: Configuration extractor_configuration; diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/util/CharacterFinder.cpp b/src/Functions/keyvaluepair/src/impl/state/strategies/util/CharacterFinder.cpp index 14dbb804d347..a4c738cc3aa5 100644 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/util/CharacterFinder.cpp +++ b/src/Functions/keyvaluepair/src/impl/state/strategies/util/CharacterFinder.cpp @@ -30,10 +30,10 @@ std::optional CharacterFinder::findFirst(std::string_ std::optional CharacterFinder::findFirstNot(std::string_view haystack, const std::vector & needles) { - if (needles.empty()) - { - return 0u; - } +// if (needles.empty()) +// { +// return 0u; +// } if (const auto * ptr = find_first_not_symbols_or_null(haystack, {needles.begin(), needles.end()})) { diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/util/EscapedCharacterReader.cpp b/src/Functions/keyvaluepair/src/impl/state/strategies/util/EscapedCharacterReader.cpp deleted file mode 100644 index 2d65a223ab55..000000000000 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/util/EscapedCharacterReader.cpp +++ /dev/null @@ -1,36 +0,0 @@ -#include "EscapedCharacterReader.h" - -#include - -#include -#include - -namespace DB -{ - -EscapedCharacterReader::ReadResult EscapedCharacterReader::read(std::string_view element, std::size_t offset) -{ - return read({element.begin() + offset, element.end()}); -} - -EscapedCharacterReader::ReadResult EscapedCharacterReader::read(std::string_view str) -{ - std::string escaped_sequence; - - auto rb = ReadBufferFromString(str); - - if (parseComplexEscapeSequence(escaped_sequence, rb)) - { - return { - rb.position(), - {escaped_sequence.begin(), escaped_sequence.end()} - }; - } - - return { - rb.position(), - {} - }; -} - -} diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/util/EscapedCharacterReader.h b/src/Functions/keyvaluepair/src/impl/state/strategies/util/EscapedCharacterReader.h deleted file mode 100644 index 2e048f04095c..000000000000 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/util/EscapedCharacterReader.h +++ /dev/null @@ -1,33 +0,0 @@ -#pragma once - -#include -#include - -namespace DB -{ - -/* - * Takes in an escape sequence like: \n \t \xHH \0NNN and return its parsed character - * Example: - * Input: "\n", Output: {0xA} - * Input: "\xFF", Output: {0xFF} - * Input: "\0377", Output: {0xFF} - * It can also take non-standard escape sequences, in which case it'll simply return the input. - * Examples: - * Input: "\h", Output {'\', 'h'} - * */ -class EscapedCharacterReader -{ -public: - static constexpr char ESCAPE_CHARACTER = '\\'; - struct ReadResult - { - const char * ptr; - std::vector characters; - }; - - static ReadResult read(std::string_view element); - static ReadResult read(std::string_view element, std::size_t offset); -}; - -} diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/util/NeedleFactory.cpp b/src/Functions/keyvaluepair/src/impl/state/strategies/util/NeedleFactory.cpp index a9279c12cf7a..6863e28984c4 100644 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/util/NeedleFactory.cpp +++ b/src/Functions/keyvaluepair/src/impl/state/strategies/util/NeedleFactory.cpp @@ -1,5 +1,4 @@ #include "NeedleFactory.h" -#include "EscapedCharacterReader.h" namespace DB { @@ -54,7 +53,7 @@ std::vector EscapingNeedleFactory::getWaitNeedles(const DB::Configuration { auto needles = NeedleFactory::getWaitNeedles(extractor_configuration); - needles.push_back(EscapedCharacterReader::ESCAPE_CHARACTER); + needles.push_back('\\'); return needles; } @@ -63,7 +62,7 @@ std::vector EscapingNeedleFactory::getReadNeedles(const Configuration & ex { auto needles = NeedleFactory::getReadNeedles(extractor_configuration); - needles.push_back(EscapedCharacterReader::ESCAPE_CHARACTER); + needles.push_back('\\'); return needles; } @@ -72,7 +71,7 @@ std::vector EscapingNeedleFactory::getReadQuotedNeedles(const Configuratio { auto needles = NeedleFactory::getReadQuotedNeedles(extractor_configuration); - needles.push_back(EscapedCharacterReader::ESCAPE_CHARACTER); + needles.push_back('\\'); return needles; } diff --git a/src/Functions/keyvaluepair/tests/gtest_character_finder.cpp b/src/Functions/keyvaluepair/tests/gtest_character_finder.cpp index 2bc80c93999a..aba637079a94 100644 --- a/src/Functions/keyvaluepair/tests/gtest_character_finder.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_character_finder.cpp @@ -36,7 +36,7 @@ void test_find_first_not(const CharacterFinder& finder, const std::string_view& } } -TEST(CharacterFinderTest, FindFirst) +TEST(extractKVPair_CharacterFinderTest, FindFirst) { CharacterFinder finder; @@ -66,7 +66,7 @@ TEST(CharacterFinderTest, FindFirst) test_find_first(finder, special_needle, special_needles, std::make_optional>('=', 4)); } -TEST(CharacterFinderTest, FindFirstNot) +TEST(extractKVPair_CharacterFinderTest, FindFirstNot) { CharacterFinder finder; diff --git a/src/Functions/keyvaluepair/tests/gtest_escaped_sequence_reader.cpp b/src/Functions/keyvaluepair/tests/gtest_escaped_sequence_reader.cpp index c36b729e00cc..8e9595aa2735 100644 --- a/src/Functions/keyvaluepair/tests/gtest_escaped_sequence_reader.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_escaped_sequence_reader.cpp @@ -1,71 +1,71 @@ -#include -#include +//#include +//#include -namespace DB -{ +//namespace DB +//{ -void test(std::string_view input, const std::vector & expected_characters, std::size_t expected_bytes_read) -{ - auto read_result = EscapedCharacterReader::read(input); +//void test(std::string_view input, const std::vector & expected_characters, std::size_t expected_bytes_read) +//{ +// auto read_result = extractKVPair_EscapedCharacterReader::read(input); - ASSERT_EQ(expected_characters, read_result.characters); +// ASSERT_EQ(expected_characters, read_result.characters); - auto bytes_read = read_result.ptr - input.begin(); +// auto bytes_read = read_result.ptr - input.begin(); - ASSERT_EQ(bytes_read, expected_bytes_read); -} +// ASSERT_EQ(bytes_read, expected_bytes_read); +//} -void test(std::string_view input, const std::vector & expected_characters) -{ - return test(input, expected_characters, input.size()); -} +//void test(std::string_view input, const std::vector & expected_characters) +//{ +// return test(input, expected_characters, input.size()); +//} -TEST(EscapedCharacterReader, HexDigits) -{ -// test("\\xA", {0xA}); - test("\\xAA", {0xAA}); - test("\\xff", {0xFF}); -// test("\\x0", {0x0}); - test("\\x00", {0x0}); +//TEST(extractKVPair_EscapedCharacterReader, HexDigits) +//{ +//// test("\\xA", {0xA}); +// test("\\xAA", {0xAA}); +// test("\\xff", {0xFF}); +//// test("\\x0", {0x0}); +// test("\\x00", {0x0}); -// test("\\xA$", {0xA}, 3); - test("\\xAA$", {0xAA}, 4); - test("\\xff$", {0xFF}, 4); -// test("\\x0$", {0x0}, 3); - test("\\x00$", {0x0}, 4); +//// test("\\xA$", {0xA}, 3); +// test("\\xAA$", {0xAA}, 4); +// test("\\xff$", {0xFF}, 4); +//// test("\\x0$", {0x0}, 3); +// test("\\x00$", {0x0}, 4); - test("\\x", {}); -} +// test("\\x", {}); +//} -TEST(EscapedCharacterReader, OctalDigits) -{ - test("\\0377", {0xFF}); - test("\\0137", {0x5F}); - test("\\013", {0xB}); - test("\\0000", {0x0}); - test("\\000", {0x0}); - test("\\00", {0x0}); +//TEST(extractKVPair_EscapedCharacterReader, OctalDigits) +//{ +// test("\\0377", {0xFF}); +// test("\\0137", {0x5F}); +// test("\\013", {0xB}); +// test("\\0000", {0x0}); +// test("\\000", {0x0}); +// test("\\00", {0x0}); - test("\\0377$", {0xFF}, 5); - test("\\0137$", {0x5F}, 5); - test("\\013$", {0xB}, 4); - test("\\0000$", {0x0}, 5); - test("\\000$", {0x0}, 4); - test("\\00$", {0x0}, 3); -} +// test("\\0377$", {0xFF}, 5); +// test("\\0137$", {0x5F}, 5); +// test("\\013$", {0xB}, 4); +// test("\\0000$", {0x0}, 5); +// test("\\000$", {0x0}, 4); +// test("\\00$", {0x0}, 3); +//} -TEST(EscapedCharacterReader, RegularEscapeSequences) -{ - test("\\n", {0xA}, 2); - test("\\r", {0xD}, 2); - test("\\r", {0xD}, 2); -} +//TEST(extractKVPair_EscapedCharacterReader, RegularEscapeSequences) +//{ +// test("\\n", {0xA}, 2); +// test("\\r", {0xD}, 2); +// test("\\r", {0xD}, 2); +//} -TEST(EscapedCharacterReader, RandomEscapedCharacters) -{ - test("\\h", {'\\', 'h'}); - test("\\g", {'\\', 'g'}); - test("\\", {}); -} +//TEST(extractKVPair_EscapedCharacterReader, RandomEscapedCharacters) +//{ +// test("\\h", {'\\', 'h'}); +// test("\\g", {'\\', 'g'}); +// test("\\", {}); +//} -} +//} diff --git a/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp b/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp index 1607054be4f3..1be9254519ef 100644 --- a/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp @@ -11,7 +11,7 @@ void assert_byte_equality(StringRef lhs, const std::vector & rhs) ASSERT_EQ(lhs_vector, rhs); } -TEST(EscapingKeyValuePairExtractor, EscapeSequences) +TEST(extractKVPair_EscapingKeyValuePairExtractor, EscapeSequences) { using namespace std::literals; diff --git a/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp b/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp index 3fd2d141689b..a8f4aaaa35cc 100644 --- a/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp @@ -1,4 +1,4 @@ -#include +#include #include namespace DB @@ -6,7 +6,7 @@ namespace DB void test_wait(const InlineEscapingKeyStateHandler & handler, std::string_view input, std::size_t expected_pos, State expected_state) { - auto next_state = handler.wait(input, 0u); + auto next_state = handler.wait(input); ASSERT_EQ(next_state.position_in_string, expected_pos); ASSERT_EQ(next_state.state, expected_state); @@ -21,11 +21,11 @@ void test_read(const InlineEscapingKeyStateHandler & handler, std::string_view i if constexpr (quoted) { - next_state = handler.readQuoted(input, 0u, element); + next_state = handler.readQuoted(input, element); } else { - next_state = handler.read(input, 0u, element); + next_state = handler.read(input, element); } ASSERT_EQ(next_state.position_in_string, expected_pos); @@ -45,7 +45,7 @@ void test_read_quoted(const InlineEscapingKeyStateHandler & handler, std::string test_read(handler, input, expected_element, expected_pos, expected_state); } -TEST(InlineEscapingKeyStateHandler, Wait) +TEST(extractKVPair_InlineEscapingKeyStateHandler, Wait) { auto pair_delimiters = std::vector{',', ' '}; @@ -61,7 +61,7 @@ TEST(InlineEscapingKeyStateHandler, Wait) test_wait(handler, "\\\\", 2u, END); } -TEST(InlineEscapingKeyStateHandler, Read) +TEST(extractKVPair_InlineEscapingKeyStateHandler, Read) { auto pair_delimiters = std::vector{',', ' '}; @@ -86,7 +86,7 @@ TEST(InlineEscapingKeyStateHandler, Read) test_read(handler, "", "", 0u, END); } -TEST(InlineEscapingKeyStateHandler, ReadEnclosed) +TEST(extractKVPair_InlineEscapingKeyStateHandler, ReadEnclosed) { auto pair_delimiters = std::vector{',', ' '}; diff --git a/src/Functions/keyvaluepair/tests/gtest_inline_escaping_value_state_handler.cpp b/src/Functions/keyvaluepair/tests/gtest_inline_escaping_value_state_handler.cpp index c91ba8f1b09d..6eeed4dc7b8e 100644 --- a/src/Functions/keyvaluepair/tests/gtest_inline_escaping_value_state_handler.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_inline_escaping_value_state_handler.cpp @@ -1,4 +1,4 @@ -#include +#include #include namespace DB @@ -6,13 +6,13 @@ namespace DB void test_wait(const auto & handler, std::string_view input, std::size_t expected_pos, State expected_state) { - auto next_state = handler.wait(input, 0u); + auto next_state = handler.wait(input); ASSERT_EQ(next_state.position_in_string, expected_pos); ASSERT_EQ(next_state.state, expected_state); } -TEST(InlineEscapingValueStateHandler, Wait) +TEST(extractKVPair_InlineEscapingValueStateHandler, Wait) { auto pair_delimiters = std::vector {','}; diff --git a/src/Functions/keyvaluepair/tests/gtest_key_value_pair_extractor.cpp b/src/Functions/keyvaluepair/tests/gtest_key_value_pair_extractor.cpp index 70fe25d580a6..c7245c8af27e 100644 --- a/src/Functions/keyvaluepair/tests/gtest_key_value_pair_extractor.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_key_value_pair_extractor.cpp @@ -17,11 +17,11 @@ // return ostr << test_case.input; //} // -//struct KeyValuePairExtractorTest : public ::testing::TestWithParam +//struct extractKVPair_KeyValuePairExtractorTest : public ::testing::TestWithParam //{ //}; // -//TEST_P(KeyValuePairExtractorTest, KeyValuePairExtractorTests) +//TEST(extractKVPair_KeyValuePairExtractorTest, KeyValuePairExtractorTests) //{ // const auto & [input, expected_output, extractor] = GetParam(); // @@ -32,7 +32,7 @@ // //INSTANTIATE_TEST_SUITE_P( // ValuesCanBeEmptyString, -// KeyValuePairExtractorTest, +// extractKVPair_KeyValuePairExtractorTest, // ::testing::ValuesIn(std::initializer_list{ // { // "age:", @@ -56,7 +56,7 @@ // //INSTANTIATE_TEST_SUITE_P( // MixString, -// KeyValuePairExtractorTest, +// extractKVPair_KeyValuePairExtractorTest, // ::testing::ValuesIn(std::initializer_list{ // { // R"(9 ads =nm, no\:me: neymar, age: 30, daojmskdpoa and a height: 1.75, school: lupe\ picasso, team: psg,)", @@ -95,7 +95,7 @@ // //INSTANTIATE_TEST_SUITE_P( // Escaping, -// KeyValuePairExtractorTest, +// extractKVPair_KeyValuePairExtractorTest, // ::testing::ValuesIn(std::initializer_list{ // { // "na,me,: neymar, age:30", @@ -135,7 +135,7 @@ // //INSTANTIATE_TEST_SUITE_P( // EnclosedElements, -// KeyValuePairExtractorTest, +// extractKVPair_KeyValuePairExtractorTest, // ::testing::ValuesIn(std::initializer_list{ // { // R"("name": "Neymar", "age": 30, team: "psg", "favorite_movie": "", height: 1.75)", diff --git a/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp b/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp index 999f9624c298..1bf2d71c1282 100644 --- a/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp @@ -6,7 +6,7 @@ namespace DB void test_wait(const auto & handler, std::string_view input, std::size_t expected_pos, State expected_state) { - auto next_state = handler.wait(input, 0u); + auto next_state = handler.wait(input); ASSERT_EQ(next_state.position_in_string, expected_pos); ASSERT_EQ(next_state.state, expected_state); @@ -21,11 +21,11 @@ void test_read(const auto & handler, std::string_view input, std::string_view ex if constexpr (quoted) { - next_state = handler.readQuoted(input, 0u, element); + next_state = handler.readQuoted(input, element); } else { - next_state = handler.read(input, 0u, element); + next_state = handler.read(input, element); } ASSERT_EQ(next_state.position_in_string, expected_pos); @@ -45,7 +45,7 @@ void test_read_quoted(const auto & handler, std::string_view input, std::string_ test_read(handler, input, expected_element, expected_pos, expected_state); } -TEST(NoEscapingKeyStateHandler, Wait) +TEST(extractKVPair_NoEscapingKeyStateHandler, Wait) { auto pair_delimiters = std::vector{',', ' ', '$'}; @@ -64,7 +64,7 @@ TEST(NoEscapingKeyStateHandler, Wait) test_wait(handler, "", 0u, END); } -TEST(NoEscapingKeyStateHandler, Read) +TEST(extractKVPair_NoEscapingKeyStateHandler, Read) { auto pair_delimiters = std::vector{',', ' '}; From 6bc33670c7efbf6a988baea72fa73dd80af64e06 Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Tue, 28 Mar 2023 14:34:36 +0200 Subject: [PATCH 05/12] It works --- .../src/KeyValuePairExtractorBuilder.cpp | 6 +++--- .../keyvaluepair/src/KeyValuePairExtractorBuilder.h | 6 +++--- .../keyvaluepair/src/impl/CHKeyValuePairExtractor.h | 5 +++-- .../escaping/InlineEscapingStateHandler.cpp | 12 +++++------- .../noescaping/NoEscapingKeyStateHandler.cpp | 8 ++------ 5 files changed, 16 insertions(+), 21 deletions(-) diff --git a/src/Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.cpp b/src/Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.cpp index 8add46530968..57eda58d38da 100644 --- a/src/Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.cpp +++ b/src/Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.cpp @@ -33,7 +33,7 @@ KeyValuePairExtractorBuilder & KeyValuePairExtractorBuilder::withEscaping() return *this; } -std::shared_ptr KeyValuePairExtractorBuilder::build() +std::shared_ptr KeyValuePairExtractorBuilder::build() const { if (with_escaping) { @@ -43,7 +43,7 @@ std::shared_ptr KeyValuePairExtractorBuilder::build() return buildWithoutEscaping(); } -std::shared_ptr KeyValuePairExtractorBuilder::buildWithoutEscaping() +std::shared_ptr KeyValuePairExtractorBuilder::buildWithoutEscaping() const { auto configuration = ConfigurationFactory::createWithoutEscaping(key_value_pair_delimiter, quoting_character, item_delimiters); @@ -58,7 +58,7 @@ std::shared_ptr KeyValuePairExtractorBuilder::buildWithou return std::make_shared>(key_state_handler, value_state_handler); } -std::shared_ptr KeyValuePairExtractorBuilder::buildWithEscaping() +std::shared_ptr KeyValuePairExtractorBuilder::buildWithEscaping() const { auto configuration = ConfigurationFactory::createWithEscaping(key_value_pair_delimiter, quoting_character, item_delimiters); diff --git a/src/Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.h b/src/Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.h index 781e75cf73b8..a6d7e9c1b65a 100644 --- a/src/Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.h +++ b/src/Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.h @@ -24,7 +24,7 @@ class KeyValuePairExtractorBuilder KeyValuePairExtractorBuilder & withEscaping(); - std::shared_ptr build(); + std::shared_ptr build() const; private: bool with_escaping = false; @@ -32,9 +32,9 @@ class KeyValuePairExtractorBuilder char quoting_character = '"'; std::vector item_delimiters = {' ', ',', ';'}; - std::shared_ptr buildWithEscaping(); + std::shared_ptr buildWithEscaping() const; - std::shared_ptr buildWithoutEscaping(); + std::shared_ptr buildWithoutEscaping() const; }; } diff --git a/src/Functions/keyvaluepair/src/impl/CHKeyValuePairExtractor.h b/src/Functions/keyvaluepair/src/impl/CHKeyValuePairExtractor.h index 2c1c78a62330..ce868c411e50 100644 --- a/src/Functions/keyvaluepair/src/impl/CHKeyValuePairExtractor.h +++ b/src/Functions/keyvaluepair/src/impl/CHKeyValuePairExtractor.h @@ -76,12 +76,13 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor data.remove_prefix(next_state.position_in_string); state = next_state.state; + // No state expects empty input if (data.size() == 0) - state = State::END; + break; } // if break occured earlier, consume previously generated pair - if (next_state.state == State::FLUSH_PAIR || !(key.empty() && value.empty())) + if (state == State::FLUSH_PAIR || !(key.empty() && value.empty())) flushPair(data, key, value, keys, values, row_offset); return row_offset; diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.cpp b/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.cpp index a68e3f06e615..775ed8caba22 100644 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.cpp +++ b/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.cpp @@ -143,15 +143,13 @@ NextState InlineEscapingKeyStateHandler::readQuoted(std::string_view file, Eleme NextState InlineEscapingKeyStateHandler::readKeyValueDelimiter(std::string_view file) const { - if (file.size() == 0) + const auto current_character = file[0]; + if (current_character == extractor_configuration.key_value_delimiter) { - return {0, State::END}; - } - else - { - const auto current_character = file[0]; - return {0, extractor_configuration.key_value_delimiter == current_character ? State::WAITING_VALUE : State::WAITING_KEY}; + return {1, WAITING_VALUE}; } + + return {0, State::WAITING_KEY}; } diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.cpp b/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.cpp index eeffef881e6e..b7d5843f9bdc 100644 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.cpp +++ b/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.cpp @@ -1,6 +1,7 @@ #include "NoEscapingKeyStateHandler.h" #include #include +#include "Functions/keyvaluepair/src/impl/state/State.h" namespace DB { @@ -114,15 +115,10 @@ NextState NoEscapingKeyStateHandler::readQuoted(std::string_view file, ElementTy NextState NoEscapingKeyStateHandler::readKeyValueDelimiter(std::string_view file) const { - if (file.size() == 0) - { - return {0, State::END}; - } - const auto current_character = file[0]; if (current_character == extractor_configuration.key_value_delimiter) { - return {1, State::WAITING_VALUE}; + return {1, WAITING_VALUE}; } return {0, State::WAITING_KEY}; From ee8209ca0143aa9c4d7d598a8ea51350a990d4b4 Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Tue, 28 Mar 2023 14:37:28 +0200 Subject: [PATCH 06/12] Updated test case Invalid escape sequence is removed from key or value, while corresponding key or value are not discarded --- .../noescaping/NoEscapingKeyStateHandler.cpp | 127 ------------------ .../noescaping/NoEscapingKeyStateHandler.h | 32 ----- ...t_key_value_pairs_multiple_input.reference | 4 +- ...extract_key_value_pairs_multiple_input.sql | 2 +- 4 files changed, 3 insertions(+), 162 deletions(-) delete mode 100644 src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.cpp delete mode 100644 src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.h diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.cpp b/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.cpp deleted file mode 100644 index b7d5843f9bdc..000000000000 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.cpp +++ /dev/null @@ -1,127 +0,0 @@ -#include "NoEscapingKeyStateHandler.h" -#include -#include -#include "Functions/keyvaluepair/src/impl/state/State.h" - -namespace DB -{ - -NoEscapingKeyStateHandler::NoEscapingKeyStateHandler(Configuration extractor_configuration_) -: extractor_configuration(std::move(extractor_configuration_)) -{ - wait_needles = NeedleFactory::getWaitNeedles(extractor_configuration); - read_needles = NeedleFactory::getReadNeedles(extractor_configuration); - read_quoted_needles = NeedleFactory::getReadQuotedNeedles(extractor_configuration); -} - -NextState NoEscapingKeyStateHandler::wait(std::string_view file) const -{ - BoundsSafeCharacterFinder finder; - - const auto quoting_character = extractor_configuration.quoting_character; - size_t pos = 0; - while (auto character_position_opt = finder.findFirstNot(file, pos, wait_needles)) - { - auto character_position = *character_position_opt; - auto character = file[character_position]; - - if (quoting_character == character) - { - return {character_position + 1u, State::READING_QUOTED_KEY}; - } - else - { - return {character_position, State::READING_KEY}; - } - } - - return {file.size(), State::END}; -} - -NextState NoEscapingKeyStateHandler::read(std::string_view file, ElementType & key) const -{ - BoundsSafeCharacterFinder finder; - - const auto & [key_value_delimiter, quoting_character, pair_delimiters] - = extractor_configuration; - - key = {}; - - size_t pos = 0; - auto start_index = pos; - - while (auto character_position_opt = finder.findFirst(file, pos, read_needles)) - { - auto character_position = *character_position_opt; - auto character = file[character_position]; - auto next_pos = character_position + 1u; - - if (character == key_value_delimiter) - { - key = createElement(file, start_index, character_position); - - if (key.empty()) - { - return {next_pos, State::WAITING_KEY}; - } - - return {next_pos, State::WAITING_VALUE}; - } - else if (std::find(pair_delimiters.begin(), pair_delimiters.end(), character) != pair_delimiters.end()) - { - return {next_pos, State::WAITING_KEY}; - } - - pos = next_pos; - } - - return {file.size(), State::END}; -} - -NextState NoEscapingKeyStateHandler::readQuoted(std::string_view file, ElementType & key) const -{ - BoundsSafeCharacterFinder finder; - - const auto quoting_character = extractor_configuration.quoting_character; - - key = {}; - - size_t pos = 0; - auto start_index = pos; - - while (auto character_position_opt = finder.findFirst(file, pos, read_quoted_needles)) - { - auto character_position = *character_position_opt; - auto character = file[character_position]; - auto next_pos = character_position + 1u; - - if (quoting_character == character) - { - key = createElement(file, start_index, character_position); - - if (key.empty()) - { - return {next_pos, State::WAITING_KEY}; - } - - return {next_pos, State::READING_KV_DELIMITER}; - } - - pos = next_pos; - } - - return {file.size(), State::END}; -} - -NextState NoEscapingKeyStateHandler::readKeyValueDelimiter(std::string_view file) const -{ - const auto current_character = file[0]; - if (current_character == extractor_configuration.key_value_delimiter) - { - return {1, WAITING_VALUE}; - } - - return {0, State::WAITING_KEY}; -} - -} diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.h b/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.h deleted file mode 100644 index e47eb482aecd..000000000000 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.h +++ /dev/null @@ -1,32 +0,0 @@ -#pragma once - -#include "Functions/keyvaluepair/src/impl/state/Configuration.h" -#include "Functions/keyvaluepair/src/impl/state/StateHandler.h" - -namespace DB -{ - -class NoEscapingKeyStateHandler : public StateHandler -{ -public: - using ElementType = std::string_view; - - explicit NoEscapingKeyStateHandler(Configuration extractor_configuration_); - - [[nodiscard]] NextState wait(std::string_view file) const; - - [[nodiscard]] NextState read(std::string_view file, ElementType & key) const; - - [[nodiscard]] NextState readQuoted(std::string_view file, ElementType & key) const; - - [[nodiscard]] NextState readKeyValueDelimiter(std::string_view file) const; - -private: - Configuration extractor_configuration; - - std::vector wait_needles; - std::vector read_needles; - std::vector read_quoted_needles; -}; - -} diff --git a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference index 5fb9998181cc..a8047cd01cb5 100644 --- a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference +++ b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference @@ -67,7 +67,7 @@ SELECT x; {'amount\\z':'$5\\h','currency':'\\$USD'} -- invalid escape sequence, should be discarded --- expected output: {'valid_key':'valid_value'} +-- expected output: {'key':'invalid_val','valid_key':'valid_value'} WITH extractKeyValuePairs('valid_key:valid_value key:invalid_val\\') AS s_map, CAST( @@ -78,7 +78,7 @@ WITH ) AS x SELECT x; -{'valid_key':'valid_value'} +{'key':'invalid_val','valid_key':'valid_value'} -- standard escape sequences are covered by unit tests -- simple quoting diff --git a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql index 5f96fd9306b9..ee49455c8c1d 100644 --- a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql +++ b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql @@ -67,7 +67,7 @@ SELECT x; -- invalid escape sequence, should be discarded --- expected output: {'valid_key':'valid_value'} +-- expected output: {'key':'invalid_val','valid_key':'valid_value'} WITH extractKeyValuePairs('valid_key:valid_value key:invalid_val\\') AS s_map, CAST( From 0a039a960142e33447135db573ee6fb2824113ed Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Wed, 29 Mar 2023 08:43:03 +0200 Subject: [PATCH 07/12] Minor optimization of copying data into key --- .../strategies/escaping/InlineEscapingStateHandler.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.cpp b/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.cpp index 775ed8caba22..1695e2e5e853 100644 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.cpp +++ b/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.cpp @@ -121,11 +121,7 @@ NextState InlineEscapingKeyStateHandler::readQuoted(std::string_view file, Eleme } else if (*p == quoting_character) { - // todo try to optimize with resize and memcpy - for (size_t i = pos; i < character_position; ++i) - { - key.push_back(file[i]); - } + key.insert(key.end(), file.begin() + pos, file.begin() + character_position); if (key.empty()) { From afc6b66eb3257a3bf9935fb5d0e71f0a7b3566ea Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Wed, 29 Mar 2023 13:10:17 +0200 Subject: [PATCH 08/12] Fused KeyStateHandler and ValueState togther Removed some unneeded files and moved code around. --- src/Functions/CMakeLists.txt | 3 +- .../{api => }/ArgumentExtractor.cpp | 2 +- .../{api => }/ArgumentExtractor.h | 4 +- src/Functions/keyvaluepair/CMakeLists.txt | 9 +- src/Functions/keyvaluepair/api/CMakeLists.txt | 6 - .../{api => }/extractKeyValuePairs.cpp | 102 ++++---- .../{api => }/extractKeyValuePairs.h | 48 ++-- .../{src => }/impl/CHKeyValuePairExtractor.h | 46 ++-- .../keyvaluepair/impl/CMakeLists.txt | 7 + .../impl/state => impl}/Configuration.cpp | 8 +- .../{src/impl/state => impl}/Configuration.h | 4 +- .../InlineEscapingStateHandler.cpp | 82 +++---- .../impl/InlineEscapingStateHandler.h | 42 ++++ .../{src => impl}/KeyValuePairExtractor.h | 0 .../KeyValuePairExtractorBuilder.cpp | 39 ++-- .../KeyValuePairExtractorBuilder.h | 6 +- .../util => impl}/NeedleFactory.cpp | 11 +- .../strategies/util => impl}/NeedleFactory.h | 9 +- .../impl/NoEscapingStateHandler.cpp | 221 ++++++++++++++++++ .../impl/NoEscapingStateHandler.h | 42 ++++ .../keyvaluepair/impl/StateHandler.h | 90 +++++++ src/Functions/keyvaluepair/src/CMakeLists.txt | 12 - .../keyvaluepair/src/impl/state/State.h | 35 --- .../src/impl/state/StateHandler.cpp | 11 - .../src/impl/state/StateHandler.h | 101 -------- .../escaping/InlineEscapingStateHandler.h | 55 ----- .../NoEscapingValueStateHandler.cpp | 110 --------- .../noescaping/NoEscapingValueStateHandler.h | 29 --- .../state/strategies/util/CharacterFinder.cpp | 80 ------- .../state/strategies/util/CharacterFinder.h | 39 ---- .../tests/gtest_character_finder.cpp | 99 -------- .../tests/gtest_escaped_sequence_reader.cpp | 71 ------ ...test_escaping_key_value_pair_extractor.cpp | 2 +- .../tests/gtest_extractKeyValuePairs.cpp | 137 +++++++++++ ...test_inline_escaping_key_state_handler.cpp | 2 +- ...st_inline_escaping_value_state_handler.cpp | 2 +- .../tests/gtest_key_value_pair_extractor.cpp | 2 +- .../gtest_no_escaping_key_state_handler.cpp | 2 +- 38 files changed, 729 insertions(+), 841 deletions(-) rename src/Functions/keyvaluepair/{api => }/ArgumentExtractor.cpp (97%) rename src/Functions/keyvaluepair/{api => }/ArgumentExtractor.h (100%) delete mode 100644 src/Functions/keyvaluepair/api/CMakeLists.txt rename src/Functions/keyvaluepair/{api => }/extractKeyValuePairs.cpp (85%) rename src/Functions/keyvaluepair/{api => }/extractKeyValuePairs.h (54%) rename src/Functions/keyvaluepair/{src => }/impl/CHKeyValuePairExtractor.h (79%) create mode 100644 src/Functions/keyvaluepair/impl/CMakeLists.txt rename src/Functions/keyvaluepair/{src/impl/state => impl}/Configuration.cpp (97%) rename src/Functions/keyvaluepair/{src/impl/state => impl}/Configuration.h (97%) rename src/Functions/keyvaluepair/{src/impl/state/strategies/escaping => impl}/InlineEscapingStateHandler.cpp (73%) create mode 100644 src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.h rename src/Functions/keyvaluepair/{src => impl}/KeyValuePairExtractor.h (100%) rename src/Functions/keyvaluepair/{src => impl}/KeyValuePairExtractorBuilder.cpp (54%) rename src/Functions/keyvaluepair/{src => impl}/KeyValuePairExtractorBuilder.h (74%) rename src/Functions/keyvaluepair/{src/impl/state/strategies/util => impl}/NeedleFactory.cpp (86%) rename src/Functions/keyvaluepair/{src/impl/state/strategies/util => impl}/NeedleFactory.h (92%) create mode 100644 src/Functions/keyvaluepair/impl/NoEscapingStateHandler.cpp create mode 100644 src/Functions/keyvaluepair/impl/NoEscapingStateHandler.h create mode 100644 src/Functions/keyvaluepair/impl/StateHandler.h delete mode 100644 src/Functions/keyvaluepair/src/CMakeLists.txt delete mode 100644 src/Functions/keyvaluepair/src/impl/state/State.h delete mode 100644 src/Functions/keyvaluepair/src/impl/state/StateHandler.cpp delete mode 100644 src/Functions/keyvaluepair/src/impl/state/StateHandler.h delete mode 100644 src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.h delete mode 100644 src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingValueStateHandler.cpp delete mode 100644 src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingValueStateHandler.h delete mode 100644 src/Functions/keyvaluepair/src/impl/state/strategies/util/CharacterFinder.cpp delete mode 100644 src/Functions/keyvaluepair/src/impl/state/strategies/util/CharacterFinder.h delete mode 100644 src/Functions/keyvaluepair/tests/gtest_character_finder.cpp delete mode 100644 src/Functions/keyvaluepair/tests/gtest_escaped_sequence_reader.cpp create mode 100644 src/Functions/keyvaluepair/tests/gtest_extractKeyValuePairs.cpp diff --git a/src/Functions/CMakeLists.txt b/src/Functions/CMakeLists.txt index 5a3adbace22e..2f5c8a212f2d 100644 --- a/src/Functions/CMakeLists.txt +++ b/src/Functions/CMakeLists.txt @@ -104,8 +104,7 @@ add_subdirectory(JSONPath) list (APPEND PRIVATE_LIBS clickhouse_functions_jsonpath) add_subdirectory(keyvaluepair) -list (APPEND OBJECT_LIBS $) -list (APPEND OBJECT_LIBS $) +list (APPEND OBJECT_LIBS $) # Signed integer overflow on user-provided data inside boost::geometry - ignore. set_source_files_properties("pointInPolygon.cpp" PROPERTIES COMPILE_FLAGS -fno-sanitize=signed-integer-overflow) diff --git a/src/Functions/keyvaluepair/api/ArgumentExtractor.cpp b/src/Functions/keyvaluepair/ArgumentExtractor.cpp similarity index 97% rename from src/Functions/keyvaluepair/api/ArgumentExtractor.cpp rename to src/Functions/keyvaluepair/ArgumentExtractor.cpp index 0ba87fab5316..2799a9c5e0f1 100644 --- a/src/Functions/keyvaluepair/api/ArgumentExtractor.cpp +++ b/src/Functions/keyvaluepair/ArgumentExtractor.cpp @@ -1,4 +1,4 @@ -#include "ArgumentExtractor.h" +#include namespace DB { diff --git a/src/Functions/keyvaluepair/api/ArgumentExtractor.h b/src/Functions/keyvaluepair/ArgumentExtractor.h similarity index 100% rename from src/Functions/keyvaluepair/api/ArgumentExtractor.h rename to src/Functions/keyvaluepair/ArgumentExtractor.h index 5f5854ba5b94..171cc9d3caca 100644 --- a/src/Functions/keyvaluepair/api/ArgumentExtractor.h +++ b/src/Functions/keyvaluepair/ArgumentExtractor.h @@ -1,10 +1,10 @@ #pragma once -#include - #include #include +#include + namespace DB { diff --git a/src/Functions/keyvaluepair/CMakeLists.txt b/src/Functions/keyvaluepair/CMakeLists.txt index 6cedfe8bf10c..eeac4ce11d4d 100644 --- a/src/Functions/keyvaluepair/CMakeLists.txt +++ b/src/Functions/keyvaluepair/CMakeLists.txt @@ -1,2 +1,7 @@ -add_subdirectory(src) -add_subdirectory(api) \ No newline at end of file +include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake") +add_headers_and_sources(clickhouse_functions_extractkeyvaluepairs .) +add_headers_and_sources(clickhouse_functions_extractkeyvaluepairs impl) + +add_library(clickhouse_functions_extractkeyvaluepairs ${clickhouse_functions_extractkeyvaluepairs_sources} ${clickhouse_functions_extractkeyvaluepairs_headers}) + +target_link_libraries(clickhouse_functions_extractkeyvaluepairs PRIVATE dbms) diff --git a/src/Functions/keyvaluepair/api/CMakeLists.txt b/src/Functions/keyvaluepair/api/CMakeLists.txt deleted file mode 100644 index d5390eeb3e3a..000000000000 --- a/src/Functions/keyvaluepair/api/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake") -add_headers_and_sources(clickhouse_functions_extractkeyvaluepairs_api .) - -add_library(clickhouse_functions_extractkeyvaluepairs_api ${clickhouse_functions_extractkeyvaluepairs_api_sources} ${clickhouse_functions_extractkeyvaluepairs_api_headers}) - -target_link_libraries(clickhouse_functions_extractkeyvaluepairs_api PRIVATE dbms clickhouse_functions_extractkeyvaluepairs_core) \ No newline at end of file diff --git a/src/Functions/keyvaluepair/api/extractKeyValuePairs.cpp b/src/Functions/keyvaluepair/extractKeyValuePairs.cpp similarity index 85% rename from src/Functions/keyvaluepair/api/extractKeyValuePairs.cpp rename to src/Functions/keyvaluepair/extractKeyValuePairs.cpp index 1229f958c216..c9d0a1a5f3ce 100644 --- a/src/Functions/keyvaluepair/api/extractKeyValuePairs.cpp +++ b/src/Functions/keyvaluepair/extractKeyValuePairs.cpp @@ -1,59 +1,24 @@ -#include "extractKeyValuePairs.h" +#include + +#include +#include +#include #include +#include #include - -#include - #include #include +#include -#include - -namespace DB -{ - -ExtractKeyValuePairs::ExtractKeyValuePairs() - : return_type(std::make_shared(std::make_shared(), std::make_shared())) -{ -} +#include -String ExtractKeyValuePairs::getName() const -{ - return name; -} -FunctionPtr ExtractKeyValuePairs::create(ContextPtr) +namespace { - return std::make_shared(); -} +using namespace DB; -static ColumnPtr extract(ColumnPtr data_column, std::shared_ptr extractor) -{ - auto offsets = ColumnUInt64::create(); - - auto keys = ColumnString::create(); - auto values = ColumnString::create(); - - uint64_t offset = 0u; - - for (auto i = 0u; i < data_column->size(); i++) - { - auto row = data_column->getDataAt(i).toView(); - - auto pairs_count = extractor->extract(row, keys, values); - - offset += pairs_count; - - offsets->insert(offset); - } - - ColumnPtr keys_ptr = std::move(keys); - - return ColumnMap::create(keys_ptr, std::move(values), std::move(offsets)); -} - -auto ExtractKeyValuePairs::getExtractor(const ArgumentExtractor::ParsedArguments & parsed_arguments) +auto getExtractor(const ArgumentExtractor::ParsedArguments & parsed_arguments) { auto builder = KeyValuePairExtractorBuilder(); @@ -80,38 +45,53 @@ auto ExtractKeyValuePairs::getExtractor(const ArgumentExtractor::ParsedArguments return builder.build(); } -ColumnPtr ExtractKeyValuePairs::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const +ColumnPtr extract(ColumnPtr data_column, std::shared_ptr extractor) { - auto parsed_arguments = ArgumentExtractor::extract(arguments); + auto offsets = ColumnUInt64::create(); - auto extractor_without_escaping = getExtractor(parsed_arguments); + auto keys = ColumnString::create(); + auto values = ColumnString::create(); - return extract(parsed_arguments.data_column, extractor_without_escaping); + uint64_t offset = 0u; + + for (auto i = 0u; i < data_column->size(); i++) + { + auto row = data_column->getDataAt(i).toView(); + + auto pairs_count = extractor->extract(row, keys, values); + + offset += pairs_count; + + offsets->insert(offset); + } + + ColumnPtr keys_ptr = std::move(keys); + + return ColumnMap::create(keys_ptr, std::move(values), std::move(offsets)); } -bool ExtractKeyValuePairs::isVariadic() const -{ - return true; + } -bool ExtractKeyValuePairs::isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo &) const +namespace DB { - return false; -} -std::size_t ExtractKeyValuePairs::getNumberOfArguments() const +ExtractKeyValuePairs::ExtractKeyValuePairs() { - return 0u; } DataTypePtr ExtractKeyValuePairs::getReturnTypeImpl(const DataTypes &) const { - return return_type; + return std::make_shared(std::make_shared(), std::make_shared()); } -ColumnNumbers ExtractKeyValuePairs::getArgumentsThatAreAlwaysConstant() const +ColumnPtr ExtractKeyValuePairs::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const { - return {1, 2, 3, 4, 5}; + auto parsed_arguments = ArgumentExtractor::extract(arguments); + + auto extractor_without_escaping = getExtractor(parsed_arguments); + + return extract(parsed_arguments.data_column, extractor_without_escaping); } REGISTER_FUNCTION(ExtractKeyValuePairs) diff --git a/src/Functions/keyvaluepair/api/extractKeyValuePairs.h b/src/Functions/keyvaluepair/extractKeyValuePairs.h similarity index 54% rename from src/Functions/keyvaluepair/api/extractKeyValuePairs.h rename to src/Functions/keyvaluepair/extractKeyValuePairs.h index 082da40138f2..53ad46c0da37 100644 --- a/src/Functions/keyvaluepair/api/extractKeyValuePairs.h +++ b/src/Functions/keyvaluepair/extractKeyValuePairs.h @@ -1,13 +1,9 @@ #pragma once -#include - -#include #include + #include #include -#include -#include "ArgumentExtractor.h" namespace DB { @@ -19,26 +15,38 @@ class ExtractKeyValuePairs : public IFunction static constexpr auto name = "extractKeyValuePairs"; - static FunctionPtr create(ContextPtr); + String getName() const override + { + return name; + } - String getName() const override; + static FunctionPtr create(ContextPtr) + { + return std::make_shared(); + } ColumnPtr executeImpl(const ColumnsWithTypeAndName &, const DataTypePtr &, size_t) const override; - - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo &) const override; - - bool isVariadic() const override; - - size_t getNumberOfArguments() const override; - DataTypePtr getReturnTypeImpl(const DataTypes &) const override; -private: - DataTypePtr return_type; - - static auto getExtractor(const ArgumentExtractor::ParsedArguments & parsed_arguments); - - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override; + bool isVariadic() const override + { + return true; + } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo &) const override + { + return false; + } + + std::size_t getNumberOfArguments() const override + { + return 0u; + } + + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override + { + return {1, 2, 3, 4, 5}; + } }; } diff --git a/src/Functions/keyvaluepair/src/impl/CHKeyValuePairExtractor.h b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h similarity index 79% rename from src/Functions/keyvaluepair/src/impl/CHKeyValuePairExtractor.h rename to src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h index ce868c411e50..fe022551eb52 100644 --- a/src/Functions/keyvaluepair/src/impl/CHKeyValuePairExtractor.h +++ b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h @@ -3,25 +3,28 @@ #include #include #include -#include "Functions/keyvaluepair/src/impl/state/State.h" -#include "fmt/core.h" -#include "state/StateHandler.h" -#include "../KeyValuePairExtractor.h" +#include +#include + +#include #include namespace DB { -template +template class CHKeyValuePairExtractor : public KeyValuePairExtractor { - using Key = typename KeyStateHandler::ElementType; - using Value = typename ValueStateHandler::ElementType; + using Key = typename StateHandler::KeyType; + using Value = typename StateHandler::ValueType; + + using State = typename DB::extractKV::StateHandler::State; + using NextState = DB::extractKV::StateHandler::NextState; public: - CHKeyValuePairExtractor(KeyStateHandler key_state_handler_, ValueStateHandler value_state_handler_) - : key_state_handler(std::move(key_state_handler_)), value_state_handler(std::move(value_state_handler_)) + CHKeyValuePairExtractor(StateHandler state_handler_) + : state_handler(std::move(state_handler_)) {} uint64_t extract(const std::string & data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & values) override @@ -38,10 +41,9 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor Value value; uint64_t row_offset = 0; - const auto & config = key_state_handler.extractor_configuration; + const auto & config = state_handler.extractor_configuration; std::cerr << "CHKeyValuePairExtractor::extract with " - << typeid(key_state_handler).name() << " \\ " - << typeid(value_state_handler).name() + << typeid(state_handler).name() << "\nConfiguration" << "\n\tKV delimiter: '" << config.key_value_delimiter << "'" << "\n\tquote char : '" << config.quoting_character << "'" @@ -97,40 +99,41 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor switch (state) { case State::WAITING_KEY: - return key_state_handler.wait(file); + return state_handler.waitKey(file); case State::READING_KEY: { - auto result = key_state_handler.read(file, key); + auto result = state_handler.readKey(file, key); std::cerr << "CHKeyValuePairExtractor::processState key: " << fancyQuote(key) << std::endl; return result; } case State::READING_QUOTED_KEY: { - auto result = key_state_handler.readQuoted(file, key); + auto result = state_handler.readQuotedKey(file, key); std::cerr << "CHKeyValuePairExtractor::processState key: " << fancyQuote(key) << std::endl; return result; } case State::READING_KV_DELIMITER: - return key_state_handler.readKeyValueDelimiter(file); + return state_handler.readKeyValueDelimiter(file); case State::WAITING_VALUE: { - return value_state_handler.wait(file); + return state_handler.waitValue(file); } case State::READING_VALUE: { - auto result = value_state_handler.read(file, value); + auto result = state_handler.readValue(file, value); std::cerr << "CHKeyValuePairExtractor::processState value: " << fancyQuote(value) << std::endl; return result; } case State::READING_QUOTED_VALUE: { - auto result = value_state_handler.readQuoted(file, value); + auto result = state_handler.readQuotedValue(file, value); std::cerr << "CHKeyValuePairExtractor::processState value: " << fancyQuote(value) << std::endl; return result; } case State::FLUSH_PAIR: return flushPair(file, key, value, keys, values, row_offset); - case END: + + case State::END: return {0, state}; } } @@ -152,8 +155,7 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor return {0, file.size() == 0 ? State::END : State::WAITING_KEY}; } - KeyStateHandler key_state_handler; - ValueStateHandler value_state_handler; + StateHandler state_handler; }; } diff --git a/src/Functions/keyvaluepair/impl/CMakeLists.txt b/src/Functions/keyvaluepair/impl/CMakeLists.txt new file mode 100644 index 000000000000..c61fb520fcdf --- /dev/null +++ b/src/Functions/keyvaluepair/impl/CMakeLists.txt @@ -0,0 +1,7 @@ +include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake") +add_headers_and_sources(clickhouse_functions_extractkeyvaluepairs_core .) +add_headers_and_sources(clickhouse_functions_extractkeyvaluepairs_core impl) + +add_library(clickhouse_functions_extractkeyvaluepairs_core ${clickhouse_functions_extractkeyvaluepairs_core_sources} ${clickhouse_functions_extractkeyvaluepairs_core_headers}) + +target_link_libraries(clickhouse_functions_extractkeyvaluepairs_core PRIVATE dbms) diff --git a/src/Functions/keyvaluepair/src/impl/state/Configuration.cpp b/src/Functions/keyvaluepair/impl/Configuration.cpp similarity index 97% rename from src/Functions/keyvaluepair/src/impl/state/Configuration.cpp rename to src/Functions/keyvaluepair/impl/Configuration.cpp index 547b924e1df5..da5af6f946f3 100644 --- a/src/Functions/keyvaluepair/src/impl/state/Configuration.cpp +++ b/src/Functions/keyvaluepair/impl/Configuration.cpp @@ -1,4 +1,5 @@ -#include "Configuration.h" +#include + #include namespace DB @@ -9,6 +10,9 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } +namespace extractKV +{ + Configuration::Configuration(char key_value_delimiter_, char quoting_character_, std::vector pair_delimiters_) : key_value_delimiter(key_value_delimiter_), quoting_character(quoting_character_), pair_delimiters(std::move(pair_delimiters_)) { @@ -74,3 +78,5 @@ void ConfigurationFactory::validate(char key_value_delimiter, char quoting_chara } } + +} diff --git a/src/Functions/keyvaluepair/src/impl/state/Configuration.h b/src/Functions/keyvaluepair/impl/Configuration.h similarity index 97% rename from src/Functions/keyvaluepair/src/impl/state/Configuration.h rename to src/Functions/keyvaluepair/impl/Configuration.h index 36414ae50135..12607a78b5cf 100644 --- a/src/Functions/keyvaluepair/src/impl/state/Configuration.h +++ b/src/Functions/keyvaluepair/impl/Configuration.h @@ -4,7 +4,8 @@ namespace DB { - +namespace extractKV +{ struct ConfigurationFactory; class Configuration @@ -35,5 +36,6 @@ struct ConfigurationFactory static constexpr auto MAX_NUMBER_OF_PAIR_DELIMITERS = 8u; }; +} } diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.cpp b/src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.cpp similarity index 73% rename from src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.cpp rename to src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.cpp index 1695e2e5e853..5147bd24b2ac 100644 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.cpp +++ b/src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.cpp @@ -1,6 +1,5 @@ -#include "InlineEscapingStateHandler.h" -#include -#include +#include +#include #include #include @@ -17,27 +16,32 @@ size_t consumeWithEscapeSequence(std::string_view file, size_t start_pos, size_t return buf.getPosition(); } + +using NextState = DB::extractKV::StateHandler::NextState; + } namespace DB { -InlineEscapingKeyStateHandler::InlineEscapingKeyStateHandler(Configuration configuration_) - : extractor_configuration(std::move(configuration_)) +namespace extractKV +{ + +InlineEscapingStateHandler::InlineEscapingStateHandler(Configuration extractor_configuration_) + : extractor_configuration(std::move(extractor_configuration_)) { wait_needles = EscapingNeedleFactory::getWaitNeedles(extractor_configuration); read_needles = EscapingNeedleFactory::getReadNeedles(extractor_configuration); read_quoted_needles = EscapingNeedleFactory::getReadQuotedNeedles(extractor_configuration); } -NextState InlineEscapingKeyStateHandler::wait(std::string_view file) const +NextState InlineEscapingStateHandler::waitKey(std::string_view file) const { const auto quoting_character = extractor_configuration.quoting_character; - size_t pos = 0; - while (const auto * p = find_first_not_symbols_or_null({file.begin() + pos, file.end()}, wait_needles)) + + if (const auto * p = find_first_not_symbols_or_null(file, wait_needles)) { const size_t character_position = p - file.begin(); - if (*p == quoting_character) { return {character_position + 1u, State::READING_QUOTED_KEY}; @@ -57,7 +61,7 @@ NextState InlineEscapingKeyStateHandler::wait(std::string_view file) const * If I find a key value delimiter and that is empty, I do not need to copy? hm,m hm hm * */ -NextState InlineEscapingKeyStateHandler::read(std::string_view file, ElementType & key) const +NextState InlineEscapingStateHandler::readKey(std::string_view file, KeyType & key) const { const auto & [key_value_delimiter, quoting_character, pair_delimiters] = extractor_configuration; @@ -97,7 +101,7 @@ NextState InlineEscapingKeyStateHandler::read(std::string_view file, ElementType return {file.size(), State::END}; } -NextState InlineEscapingKeyStateHandler::readQuoted(std::string_view file, ElementType & key) const +NextState InlineEscapingStateHandler::readQuotedKey(std::string_view file, KeyType & key) const { const auto quoting_character = extractor_configuration.quoting_character; @@ -137,7 +141,7 @@ NextState InlineEscapingKeyStateHandler::readQuoted(std::string_view file, Eleme return {file.size(), State::END}; } -NextState InlineEscapingKeyStateHandler::readKeyValueDelimiter(std::string_view file) const +NextState InlineEscapingStateHandler::readKeyValueDelimiter(std::string_view file) const { const auto current_character = file[0]; if (current_character == extractor_configuration.key_value_delimiter) @@ -148,50 +152,35 @@ NextState InlineEscapingKeyStateHandler::readKeyValueDelimiter(std::string_view return {0, State::WAITING_KEY}; } - -InlineEscapingValueStateHandler::InlineEscapingValueStateHandler(Configuration extractor_configuration_) - : extractor_configuration(std::move(extractor_configuration_)) -{ - read_needles = EscapingNeedleFactory::getReadNeedles(extractor_configuration); - read_quoted_needles = EscapingNeedleFactory::getReadQuotedNeedles(extractor_configuration); -} - -NextState InlineEscapingValueStateHandler::wait(std::string_view file) const +NextState InlineEscapingStateHandler::waitValue(std::string_view file) const { - const auto & [key_value_delimiter, quoting_character, pair_delimiters] = extractor_configuration; + const auto & [key_value_delimiter, quoting_character, _] = extractor_configuration; size_t pos = 0; - if (pos < file.size()) - { - const auto current_character = file[pos]; + const auto current_character = file[pos]; - if (quoting_character == current_character) - { - return {pos + 1u, State::READING_QUOTED_VALUE}; - } - else if (key_value_delimiter == current_character) - { - return {pos, State::WAITING_KEY}; - } - else - { - return {pos, State::READING_VALUE}; - } + if (current_character == quoting_character) + { + return {pos + 1u, State::READING_QUOTED_VALUE}; + } + else if (current_character == key_value_delimiter) + { + return {pos, State::WAITING_KEY}; } return {pos, State::READING_VALUE}; } -NextState InlineEscapingValueStateHandler::read(std::string_view file, ElementType & value) const +NextState InlineEscapingStateHandler::readValue(std::string_view file, ValueType & value) const { - const auto & [key_value_delimiter, quoting_character, pair_delimiters] = extractor_configuration; + const auto & [key_value_delimiter, _, pair_delimiters] = extractor_configuration; value.clear(); size_t pos = 0; while (const auto * p = find_first_symbols_or_null({file.begin() + pos, file.end()}, read_needles)) { - auto character_position = p - file.begin(); + const size_t character_position = p - file.begin(); size_t next_pos = character_position + 1u; if (*p == '\\') @@ -221,21 +210,20 @@ NextState InlineEscapingValueStateHandler::read(std::string_view file, ElementTy // Reached end of input, consume rest of the file as value and make sure KV pair is produced. value.insert(value.end(), file.begin() + pos, file.end()); - return {pos, State::FLUSH_PAIR}; } -NextState InlineEscapingValueStateHandler::readQuoted(std::string_view file, ElementType & value) const +NextState InlineEscapingStateHandler::readQuotedValue(std::string_view file, ValueType & value) const { const auto quoting_character = extractor_configuration.quoting_character; - const std::string_view needles{read_quoted_needles.begin(), read_quoted_needles.end()}; + + size_t pos = 0; value.clear(); - size_t pos = 0; - while (const auto * p = find_first_symbols_or_null({file.begin() + pos, file.end()}, needles)) + while (const auto * p = find_first_symbols_or_null({file.begin() + pos, file.end()}, read_quoted_needles)) { - auto character_position = p - file.begin(); + const size_t character_position = p - file.begin(); size_t next_pos = character_position + 1u; if (*p == '\\') @@ -261,3 +249,5 @@ NextState InlineEscapingValueStateHandler::readQuoted(std::string_view file, Ele } } + +} diff --git a/src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.h b/src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.h new file mode 100644 index 000000000000..120d2f3d95bb --- /dev/null +++ b/src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.h @@ -0,0 +1,42 @@ +#pragma once + +#include +#include + +#include +#include +#include + +namespace DB +{ + +namespace extractKV +{ + +class InlineEscapingStateHandler : public StateHandler +{ +public: + using KeyType = std::string; + using ValueType = std::string; + + explicit InlineEscapingStateHandler(Configuration configuration_); + + [[nodiscard]] NextState waitKey(std::string_view file) const; + [[nodiscard]] NextState readKey(std::string_view file, KeyType & key) const; + [[nodiscard]] NextState readQuotedKey(std::string_view file, KeyType & key) const; + [[nodiscard]] NextState readKeyValueDelimiter(std::string_view file) const; + [[nodiscard]] NextState waitValue(std::string_view file) const; + [[nodiscard]] NextState readValue(std::string_view file, ValueType & value) const; + [[nodiscard]] NextState readQuotedValue(std::string_view file, ValueType & value) const; + + const Configuration extractor_configuration; + +private: + std::vector wait_needles; + std::vector read_needles; + std::vector read_quoted_needles; +}; + +} + +} diff --git a/src/Functions/keyvaluepair/src/KeyValuePairExtractor.h b/src/Functions/keyvaluepair/impl/KeyValuePairExtractor.h similarity index 100% rename from src/Functions/keyvaluepair/src/KeyValuePairExtractor.h rename to src/Functions/keyvaluepair/impl/KeyValuePairExtractor.h diff --git a/src/Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.cpp b/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.cpp similarity index 54% rename from src/Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.cpp rename to src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.cpp index 57eda58d38da..a71f38413d4d 100644 --- a/src/Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.cpp +++ b/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.cpp @@ -1,10 +1,9 @@ -#include "KeyValuePairExtractorBuilder.h" +#include -#include "Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.h" -#include "Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.h" -#include "Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingValueStateHandler.h" -#include "impl/CHKeyValuePairExtractor.h" -#include "impl/state/Configuration.h" +#include +#include +#include +#include namespace DB { @@ -43,30 +42,30 @@ std::shared_ptr KeyValuePairExtractorBuilder::build() con return buildWithoutEscaping(); } -std::shared_ptr KeyValuePairExtractorBuilder::buildWithoutEscaping() const +namespace { - auto configuration = ConfigurationFactory::createWithoutEscaping(key_value_pair_delimiter, quoting_character, item_delimiters); +using namespace extractKV; - CKeyStateHandler auto key_state_handler = NoEscapingKeyStateHandler( - configuration - ); +template +auto makeStateHandler(const T && handler) +{ + return std::make_shared>(handler); +} - CValueStateHandler auto value_state_handler = NoEscapingValueStateHandler( - configuration - ); +} - return std::make_shared>(key_state_handler, value_state_handler); +std::shared_ptr KeyValuePairExtractorBuilder::buildWithoutEscaping() const +{ + auto configuration = ConfigurationFactory::createWithoutEscaping(key_value_pair_delimiter, quoting_character, item_delimiters); + + return makeStateHandler(NoEscapingStateHandler(configuration)); } std::shared_ptr KeyValuePairExtractorBuilder::buildWithEscaping() const { auto configuration = ConfigurationFactory::createWithEscaping(key_value_pair_delimiter, quoting_character, item_delimiters); - CKeyStateHandler auto key_state_handler = InlineEscapingKeyStateHandler(configuration); - - CValueStateHandler auto value_state_handler = InlineEscapingValueStateHandler(configuration); - - return std::make_shared>(key_state_handler, value_state_handler); + return makeStateHandler(InlineEscapingStateHandler(configuration)); } } diff --git a/src/Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.h b/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h similarity index 74% rename from src/Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.h rename to src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h index a6d7e9c1b65a..4bed22962f8f 100644 --- a/src/Functions/keyvaluepair/src/KeyValuePairExtractorBuilder.h +++ b/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h @@ -1,10 +1,6 @@ #pragma once -#include "Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingKeyStateHandler.h" -#include "Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingValueStateHandler.h" -#include "KeyValuePairExtractor.h" -//#include "impl/CHKeyValuePairExtractor.h" - +#include #include namespace DB diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/util/NeedleFactory.cpp b/src/Functions/keyvaluepair/impl/NeedleFactory.cpp similarity index 86% rename from src/Functions/keyvaluepair/src/impl/state/strategies/util/NeedleFactory.cpp rename to src/Functions/keyvaluepair/impl/NeedleFactory.cpp index 6863e28984c4..f48487df36fc 100644 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/util/NeedleFactory.cpp +++ b/src/Functions/keyvaluepair/impl/NeedleFactory.cpp @@ -1,9 +1,12 @@ -#include "NeedleFactory.h" +#include namespace DB { -std::vector NeedleFactory::getWaitNeedles(const DB::Configuration & extractor_configuration) +namespace extractKV +{ + +std::vector NeedleFactory::getWaitNeedles(const Configuration & extractor_configuration) { const auto & [key_value_delimiter, quoting_character, pair_delimiters] = extractor_configuration; @@ -49,7 +52,7 @@ std::vector NeedleFactory::getReadQuotedNeedles(const Configuration & extr return needles; } -std::vector EscapingNeedleFactory::getWaitNeedles(const DB::Configuration & extractor_configuration) +std::vector EscapingNeedleFactory::getWaitNeedles(const Configuration & extractor_configuration) { auto needles = NeedleFactory::getWaitNeedles(extractor_configuration); @@ -77,3 +80,5 @@ std::vector EscapingNeedleFactory::getReadQuotedNeedles(const Configuratio } } + +} diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/util/NeedleFactory.h b/src/Functions/keyvaluepair/impl/NeedleFactory.h similarity index 92% rename from src/Functions/keyvaluepair/src/impl/state/strategies/util/NeedleFactory.h rename to src/Functions/keyvaluepair/impl/NeedleFactory.h index 43920091ab68..180e30bce02d 100644 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/util/NeedleFactory.h +++ b/src/Functions/keyvaluepair/impl/NeedleFactory.h @@ -1,12 +1,15 @@ #pragma once -#include +#include -#include +#include namespace DB { +namespace extractKV +{ + /* * Did not spend much time here, running against the clock :) * @@ -31,3 +34,5 @@ class EscapingNeedleFactory }; } + +} diff --git a/src/Functions/keyvaluepair/impl/NoEscapingStateHandler.cpp b/src/Functions/keyvaluepair/impl/NoEscapingStateHandler.cpp new file mode 100644 index 000000000000..63d5df45d5d4 --- /dev/null +++ b/src/Functions/keyvaluepair/impl/NoEscapingStateHandler.cpp @@ -0,0 +1,221 @@ +#include + +#include + +#include + +namespace +{ + +std::string_view createElement(std::string_view file, std::size_t begin, std::size_t end) +{ + return std::string_view{file.begin() + begin, file.begin() + end}; +} + +using NextState = DB::extractKV::StateHandler::NextState; + +} + +namespace DB +{ + +namespace extractKV +{ + + +NoEscapingStateHandler::NoEscapingStateHandler(Configuration extractor_configuration_) +: extractor_configuration(std::move(extractor_configuration_)) +{ + wait_needles = NeedleFactory::getWaitNeedles(extractor_configuration); + read_needles = NeedleFactory::getReadNeedles(extractor_configuration); + read_quoted_needles = NeedleFactory::getReadQuotedNeedles(extractor_configuration); +} + +NextState NoEscapingStateHandler::waitKey(std::string_view file) const +{ + const auto quoting_character = extractor_configuration.quoting_character; + + if (const auto * p = find_first_not_symbols_or_null(file, wait_needles)) + { + const size_t character_position = p - file.begin(); + if (*p == quoting_character) + { + return {character_position + 1u, State::READING_QUOTED_KEY}; + } + else + { + return {character_position, State::READING_KEY}; + } + } + + return {file.size(), State::END}; +} + +NextState NoEscapingStateHandler::readKey(std::string_view file, KeyType & key) const +{ + const auto & [key_value_delimiter, quoting_character, pair_delimiters] + = extractor_configuration; + + key = {}; + + size_t pos = 0; + auto start_index = pos; + + while (auto p = find_first_symbols_or_null({file.begin() + pos, file.end()}, read_needles)) + { + const size_t character_position = p - file.begin(); + auto next_pos = character_position + 1u; + + if (*p == key_value_delimiter) + { + key = createElement(file, start_index, character_position); + + if (key.empty()) + { + return {next_pos, State::WAITING_KEY}; + } + + return {next_pos, State::WAITING_VALUE}; + } + else if (std::find(pair_delimiters.begin(), pair_delimiters.end(), *p) != pair_delimiters.end()) + { + return {next_pos, State::WAITING_KEY}; + } + + pos = next_pos; + // TODO: add check to not read past end of `file`'s data? + } + + return {file.size(), State::END}; +} + +NextState NoEscapingStateHandler::readQuotedKey(std::string_view file, KeyType & key) const +{ + const auto quoting_character = extractor_configuration.quoting_character; + + key = {}; + + size_t pos = 0; + auto start_index = pos; + + while (const auto * p = find_first_symbols_or_null({file.begin() + pos, file.end()}, read_quoted_needles)) + { + const size_t character_position = p - file.begin(); + auto next_pos = character_position + 1u; + + if (*p == quoting_character) + { + key = createElement(file, start_index, character_position); + + if (key.empty()) + { + return {next_pos, State::WAITING_KEY}; + } + + return {next_pos, State::READING_KV_DELIMITER}; + } + + pos = next_pos; + } + + return {file.size(), State::END}; +} + +NextState NoEscapingStateHandler::readKeyValueDelimiter(std::string_view file) const +{ + const auto current_character = file[0]; + if (current_character == extractor_configuration.key_value_delimiter) + { + return {1, WAITING_VALUE}; + } + + return {0, State::WAITING_KEY}; +} + +NextState NoEscapingStateHandler::waitValue(std::string_view file) const +{ + const auto & [key_value_delimiter, quoting_character, _] = extractor_configuration; + + size_t pos = 0; + const auto current_character = file[pos]; + + if (current_character == quoting_character) + { + return {pos + 1u, State::READING_QUOTED_VALUE}; + } + else if (current_character == key_value_delimiter) + { + return {pos, State::WAITING_KEY}; + } + + return {pos, State::READING_VALUE}; + +// // TODO: can't bre reached, remove it? +// return {file.size(), State::READING_VALUE}; +} + +NextState NoEscapingStateHandler::readValue(std::string_view file, ValueType & value) const +{ + const auto & [key_value_delimiter, quoting_character, pair_delimiters] = extractor_configuration; + + value = {}; + + size_t pos = 0; + auto start_index = pos; + while (const auto * p = find_first_symbols_or_null({file.begin() + pos, file.end()}, read_needles)) + { + const size_t character_position = p - file.begin(); + size_t next_pos = character_position + 1u; + + if (*p == key_value_delimiter) + { + return {next_pos, State::WAITING_KEY}; + } + else if (std::find(pair_delimiters.begin(), pair_delimiters.end(), *p) != pair_delimiters.end()) + { + // reached next pair + value = createElement(file, start_index, character_position); + + return {next_pos, State::FLUSH_PAIR}; + } + + pos = next_pos; + } + + // TODO: do I really need the below logic? + // this allows empty values at the end + value = createElement(file, start_index, file.size()); + return {file.size(), State::FLUSH_PAIR}; +} + +NextState NoEscapingStateHandler::readQuotedValue(std::string_view file, ValueType & value) const +{ + const auto quoting_character = extractor_configuration.quoting_character; + + size_t pos = 0; + auto start_index = pos; + + value = {}; + + while (const auto * p = find_first_symbols_or_null({file.begin() + pos, file.end()}, read_quoted_needles)) + { + const size_t character_position = p - file.begin(); + size_t next_pos = character_position + 1u; + + if (*p == quoting_character) + { + value = createElement(file, start_index, character_position); + + std::cerr << "NoEscapingStateHandler::readQuoted Going to consume up to: «" << fancyQuote(file.substr(0, next_pos)) << " to " << fancyQuote(file.substr(next_pos)) << std::endl; + return {next_pos, State::FLUSH_PAIR}; + } + + pos = next_pos; + } + + return {file.size(), State::END}; +} + +} + +} diff --git a/src/Functions/keyvaluepair/impl/NoEscapingStateHandler.h b/src/Functions/keyvaluepair/impl/NoEscapingStateHandler.h new file mode 100644 index 000000000000..e547a503530f --- /dev/null +++ b/src/Functions/keyvaluepair/impl/NoEscapingStateHandler.h @@ -0,0 +1,42 @@ +#pragma once + +#include +#include + +#include +#include +#include + +namespace DB +{ + +namespace extractKV +{ + +class NoEscapingStateHandler : public StateHandler +{ +public: + using KeyType = std::string_view; + using ValueType = std::string_view; + + explicit NoEscapingStateHandler(Configuration extractor_configuration_); + + [[nodiscard]] NextState waitKey(std::string_view file) const; + [[nodiscard]] NextState readKey(std::string_view file, KeyType & key) const; + [[nodiscard]] NextState readQuotedKey(std::string_view file, KeyType & key) const; + [[nodiscard]] NextState readKeyValueDelimiter(std::string_view file) const; + [[nodiscard]] NextState waitValue(std::string_view file) const; + [[nodiscard]] NextState readValue(std::string_view file, ValueType & value) const; + [[nodiscard]] NextState readQuotedValue(std::string_view file, ValueType & value) const; + + const Configuration extractor_configuration; + +private: + std::vector wait_needles; + std::vector read_needles; + std::vector read_quoted_needles; +}; + +} + +} diff --git a/src/Functions/keyvaluepair/impl/StateHandler.h b/src/Functions/keyvaluepair/impl/StateHandler.h new file mode 100644 index 000000000000..362e2d9618ff --- /dev/null +++ b/src/Functions/keyvaluepair/impl/StateHandler.h @@ -0,0 +1,90 @@ +#pragma once + +#include + +#include + +namespace DB +{ + +namespace extractKV +{ + +class StateHandler +{ +public: + enum State + { + // Skip characters until it finds a valid first key character. Might jump to READING_KEY, READING_QUOTED_KEY or END. + WAITING_KEY, + // Tries to read a key. Might jump to WAITING_KEY, WAITING_VALUE or END. + READING_KEY, + // Tries to read an quoted/ quoted key. Might jump to WAITING_KEY, READING_KV_DELIMITER or END. + READING_QUOTED_KEY, + // Tries to read the key value pair delimiter. Might jump to WAITING_KEY, WAITING_VALUE or END. + READING_KV_DELIMITER, + // Skip characters until it finds a valid first value character. Might jump to READING_QUOTED_VALUE or READING_VALUE. + WAITING_VALUE, + // Tries to read a value. Jumps to FLUSH_PAIR. + READING_VALUE, + // Tries to read an quoted/ quoted value. Might jump to FLUSH_PAIR or END. + READING_QUOTED_VALUE, + // In this state, both key and value have already been collected and should be flushed. Might jump to WAITING_KEY or END. + FLUSH_PAIR, + END + }; + + struct NextState + { + std::size_t position_in_string; + State state; + }; + + StateHandler() = default; + StateHandler(const StateHandler &) = default; + + virtual ~StateHandler() = default; +}; + +} + +// TODO(vnemkov): Debug stuff, remove before merging + +template +struct CustomQuoted +{ + const char * start_quote = "\""; + const char * end_quote = "\""; + + const T & value; +}; + +template +CustomQuoted customQuote(const char * start_quote, const T & value, const char * end_quote = nullptr) +{ + assert(start_quote != nullptr); + + return CustomQuoted{ + .start_quote = start_quote, + .end_quote = end_quote ? end_quote : start_quote, + .value = value + }; +} + +template +CustomQuoted fancyQuote(const T & value) +{ + return CustomQuoted{ + .start_quote = "«", + .end_quote = "»", + .value = value + }; +} + +template +std::ostream & operator<<(std::ostream & ostr, const CustomQuoted & val) +{ + return ostr << val.start_quote << val.value << val.end_quote; +} + +} diff --git a/src/Functions/keyvaluepair/src/CMakeLists.txt b/src/Functions/keyvaluepair/src/CMakeLists.txt deleted file mode 100644 index ae30de13f6a6..000000000000 --- a/src/Functions/keyvaluepair/src/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake") -add_headers_and_sources(clickhouse_functions_extractkeyvaluepairs_core .) -add_headers_and_sources(clickhouse_functions_extractkeyvaluepairs_core impl) -add_headers_and_sources(clickhouse_functions_extractkeyvaluepairs_core impl/state) -add_headers_and_sources(clickhouse_functions_extractkeyvaluepairs_core impl/state/strategies) -add_headers_and_sources(clickhouse_functions_extractkeyvaluepairs_core impl/state/strategies/escaping) -add_headers_and_sources(clickhouse_functions_extractkeyvaluepairs_core impl/state/strategies/noescaping) -add_headers_and_sources(clickhouse_functions_extractkeyvaluepairs_core impl/state/strategies/util) - -add_library(clickhouse_functions_extractkeyvaluepairs_core ${clickhouse_functions_extractkeyvaluepairs_core_sources} ${clickhouse_functions_extractkeyvaluepairs_core_headers}) - -target_link_libraries(clickhouse_functions_extractkeyvaluepairs_core PRIVATE dbms) \ No newline at end of file diff --git a/src/Functions/keyvaluepair/src/impl/state/State.h b/src/Functions/keyvaluepair/src/impl/state/State.h deleted file mode 100644 index 8e9c920e7c35..000000000000 --- a/src/Functions/keyvaluepair/src/impl/state/State.h +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once - -#include - -namespace DB -{ - -enum State -{ - // Skip characters until it finds a valid first key character. Might jump to READING_KEY, READING_QUOTED_KEY or END. - WAITING_KEY, - // Tries to read a key. Might jump to WAITING_KEY, WAITING_VALUE or END. - READING_KEY, - // Tries to read an quoted/ quoted key. Might jump to WAITING_KEY, READING_KV_DELIMITER or END. - READING_QUOTED_KEY, - // Tries to read the key value pair delimiter. Might jump to WAITING_KEY, WAITING_VALUE or END. - READING_KV_DELIMITER, - // Skip characters until it finds a valid first value character. Might jump to READING_QUOTED_VALUE or READING_VALUE. - WAITING_VALUE, - // Tries to read a value. Jumps to FLUSH_PAIR. - READING_VALUE, - // Tries to read an quoted/ quoted value. Might jump to FLUSH_PAIR or END. - READING_QUOTED_VALUE, - // In this state, both key and value have already been collected and should be flushed. Might jump to WAITING_KEY or END. - FLUSH_PAIR, - END -}; - -struct NextState -{ - std::size_t position_in_string; - State state; -}; - -} diff --git a/src/Functions/keyvaluepair/src/impl/state/StateHandler.cpp b/src/Functions/keyvaluepair/src/impl/state/StateHandler.cpp deleted file mode 100644 index a1beb077bb5e..000000000000 --- a/src/Functions/keyvaluepair/src/impl/state/StateHandler.cpp +++ /dev/null @@ -1,11 +0,0 @@ -#include "StateHandler.h" - -namespace DB -{ - -std::string_view StateHandler::createElement(std::string_view file, std::size_t begin, std::size_t end) -{ - return std::string_view{file.begin() + begin, file.begin() + end}; -} - -} diff --git a/src/Functions/keyvaluepair/src/impl/state/StateHandler.h b/src/Functions/keyvaluepair/src/impl/state/StateHandler.h deleted file mode 100644 index ae2bbfc61241..000000000000 --- a/src/Functions/keyvaluepair/src/impl/state/StateHandler.h +++ /dev/null @@ -1,101 +0,0 @@ -#pragma once - -#include -#include -#include -#include "State.h" - -#include - -namespace DB -{ - -template -concept CInlineEscapingKeyStateHandler = requires(KeyStateHandler handler) -{ - { handler.wait(std::string_view {}) } -> std::same_as; - { handler.read(std::string_view {}, std::declval()) } -> std::same_as; - { handler.readQuoted(std::string_view {}, std::declval()) } -> std::same_as; - { handler.readKeyValueDelimiter(std::string_view {}) } -> std::same_as; -}; - -template -concept CNoEscapingKeyStateHandler = requires(KeyStateHandler handler) -{ - { handler.wait(std::string_view {}) } -> std::same_as; - { handler.read(std::string_view {}, std::declval()) } -> std::same_as; - { handler.readQuoted(std::string_view {}, std::declval()) } -> std::same_as; - { handler.readKeyValueDelimiter(std::string_view {}) } -> std::same_as; -}; - -template -concept CKeyStateHandler = CInlineEscapingKeyStateHandler || CNoEscapingKeyStateHandler; - -template -concept CInlineEscapingValueStateHandler = requires(ValueStateHandler handler) -{ - { handler.wait(std::string_view {}) } -> std::same_as; - { handler.read(std::string_view {}, std::declval()) } -> std::same_as; - { handler.readQuoted(std::string_view {}, std::declval()) } -> std::same_as; -}; - -template -concept CNoEscapingValueStateHandler = requires(ValueStateHandler handler) -{ - { handler.wait(std::string_view {}) } -> std::same_as; - { handler.read(std::string_view {}, std::declval()) } -> std::same_as; - { handler.readQuoted(std::string_view {}, std::declval()) } -> std::same_as; -}; - -template -concept CValueStateHandler = CInlineEscapingValueStateHandler || CNoEscapingValueStateHandler; - -struct StateHandler -{ - StateHandler() = default; - StateHandler(const StateHandler &) = default; - - virtual ~StateHandler() = default; - -protected: - [[nodiscard]] static std::string_view createElement(std::string_view file, std::size_t begin, std::size_t end); -}; - -template -struct CustomQuoted -{ - const char * start_quote = "\""; - const char * end_quote = "\""; - - const T & value; -}; - -template -CustomQuoted customQuote(const char * start_quote, const T & value, const char * end_quote = nullptr) -{ - assert(start_quote != nullptr); - - return CustomQuoted{ - .start_quote = start_quote, - .end_quote = end_quote ? end_quote : start_quote, - .value = value - }; -} - -template -CustomQuoted fancyQuote(const T & value) -{ - return CustomQuoted{ - .start_quote = "«", - .end_quote = "»", - .value = value - }; -} - -template -std::ostream & operator<<(std::ostream & ostr, const CustomQuoted & val) -{ - return ostr << val.start_quote << val.value << val.end_quote; -} - -} diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.h b/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.h deleted file mode 100644 index 23bbd062284a..000000000000 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/escaping/InlineEscapingStateHandler.h +++ /dev/null @@ -1,55 +0,0 @@ -#pragma once - -#include -#include -#include "Functions/keyvaluepair/src/impl/state/Configuration.h" -#include "Functions/keyvaluepair/src/impl/state/State.h" -#include "Functions/keyvaluepair/src/impl/state/StateHandler.h" - -namespace DB -{ - -class InlineEscapingKeyStateHandler : public StateHandler -{ -public: - using ElementType = std::string; - - explicit InlineEscapingKeyStateHandler(Configuration configuration_); - - [[nodiscard]] NextState wait(std::string_view file) const; - - [[nodiscard]] NextState read(std::string_view file, ElementType & key) const; - - [[nodiscard]] NextState readQuoted(std::string_view file, ElementType & key) const; - - [[nodiscard]] NextState readKeyValueDelimiter(std::string_view file) const; - -private: - Configuration extractor_configuration; - - std::vector wait_needles; - std::vector read_needles; - std::vector read_quoted_needles; -}; - - -class InlineEscapingValueStateHandler : public StateHandler -{ -public: - using ElementType = std::string; - - explicit InlineEscapingValueStateHandler(Configuration extractor_configuration_); - - [[nodiscard]] NextState wait(std::string_view file) const; - - [[nodiscard]] NextState read(std::string_view file, ElementType & value) const; - - [[nodiscard]] NextState readQuoted(std::string_view file, ElementType & value) const; - -private: - Configuration extractor_configuration; - std::vector read_needles; - std::vector read_quoted_needles; -}; - -} diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingValueStateHandler.cpp b/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingValueStateHandler.cpp deleted file mode 100644 index 7bc90e9c7e0b..000000000000 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingValueStateHandler.cpp +++ /dev/null @@ -1,110 +0,0 @@ -#include "NoEscapingValueStateHandler.h" -#include -#include - -namespace DB -{ - -NoEscapingValueStateHandler::NoEscapingValueStateHandler(Configuration extractor_configuration_) - : extractor_configuration(std::move(extractor_configuration_)) -{ - read_needles = NeedleFactory::getReadNeedles(extractor_configuration); - read_quoted_needles = NeedleFactory::getReadQuotedNeedles(extractor_configuration); -} - -NextState NoEscapingValueStateHandler::wait(std::string_view file) const -{ - const auto & [key_value_delimiter, quoting_character, pair_delimiters] - = extractor_configuration; - - size_t pos = 0; - if (pos < file.size()) - { - const auto current_character = file[pos]; - - if (quoting_character == current_character) - { - return {pos + 1u, State::READING_QUOTED_VALUE}; - } - else if (key_value_delimiter == current_character) - { - return {pos, State::WAITING_KEY}; - } - else - { - return {pos, State::READING_VALUE}; - } - } - - return {file.size(), State::READING_VALUE}; -} - -NextState NoEscapingValueStateHandler::read(std::string_view file, ElementType & value) const -{ - size_t pos = 0; - auto start_index = pos; - - value = {}; - - BoundsSafeCharacterFinder finder; - - const auto & [key_value_delimiter, quoting_character, pair_delimiters] - = extractor_configuration; - - while (auto character_position_opt = finder.findFirst(file, pos, read_needles)) - { - auto character_position = *character_position_opt; - auto character = file[character_position]; - auto next_pos = character_position + 1u; - - if (key_value_delimiter == character) - { - return {next_pos, State::WAITING_KEY}; - } - else if (std::find(pair_delimiters.begin(), pair_delimiters.end(), character) != pair_delimiters.end()) - { - value = createElement(file, start_index, character_position); - return {next_pos, State::FLUSH_PAIR}; - } - - pos = next_pos; - } - - // TODO: do I really need the below logic? - // this allows empty values at the end - value = createElement(file, start_index, file.size()); - return {file.size(), State::FLUSH_PAIR}; -} - -NextState NoEscapingValueStateHandler::readQuoted(std::string_view file, ElementType & value) const -{ - size_t pos = 0; - auto start_index = pos; - - value = {}; - - BoundsSafeCharacterFinder finder; - - const auto quoting_character = extractor_configuration.quoting_character; - - while (auto character_position_opt = finder.findFirst(file, pos, read_quoted_needles)) - { - auto character_position = *character_position_opt; - auto character = file[character_position]; - auto next_pos = character_position + 1u; - - if (quoting_character == character) - { - value = createElement(file, start_index, character_position); - - std::cerr << "NoEscapingValueStateHandler::readQuoted Going to consume up to: «" << fancyQuote(file.substr(0, next_pos)) << " to " << fancyQuote(file.substr(next_pos)) << std::endl; - return {next_pos, State::FLUSH_PAIR}; - } - - pos = next_pos; - } - - return {file.size(), State::END}; -} - -} diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingValueStateHandler.h b/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingValueStateHandler.h deleted file mode 100644 index 25901156ca67..000000000000 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/noescaping/NoEscapingValueStateHandler.h +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once - -#include -#include "Functions/keyvaluepair/src/impl/state/Configuration.h" -#include "Functions/keyvaluepair/src/impl/state/StateHandler.h" - -namespace DB -{ - -class NoEscapingValueStateHandler : public StateHandler -{ -public: - using ElementType = std::string_view; - - explicit NoEscapingValueStateHandler(Configuration extractor_configuration_); - - [[nodiscard]] NextState wait(std::string_view file) const; - - [[nodiscard]] NextState read(std::string_view file, ElementType & value) const; - - [[nodiscard]] NextState readQuoted(std::string_view file, ElementType & value) const; - -private: - Configuration extractor_configuration; - std::vector read_needles; - std::vector read_quoted_needles; -}; - -} diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/util/CharacterFinder.cpp b/src/Functions/keyvaluepair/src/impl/state/strategies/util/CharacterFinder.cpp deleted file mode 100644 index a4c738cc3aa5..000000000000 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/util/CharacterFinder.cpp +++ /dev/null @@ -1,80 +0,0 @@ -#include "CharacterFinder.h" -#include - - -namespace DB -{ - -std::optional CharacterFinder::findFirst(std::string_view haystack, const std::vector & needles) -{ - if (!needles.empty()) - { - if (const auto * ptr = find_first_symbols_or_null(haystack, {needles.begin(), needles.end()})) - { - return ptr - haystack.begin(); - } - } - - return std::nullopt; -} - -std::optional CharacterFinder::findFirst(std::string_view haystack, std::size_t offset, const std::vector & needles) -{ - if (auto position = findFirst({haystack.begin() + offset, haystack.end()}, needles)) - { - return offset + *position; - } - - return std::nullopt; -} - -std::optional CharacterFinder::findFirstNot(std::string_view haystack, const std::vector & needles) -{ -// if (needles.empty()) -// { -// return 0u; -// } - - if (const auto * ptr = find_first_not_symbols_or_null(haystack, {needles.begin(), needles.end()})) - { - return ptr - haystack.begin(); - } - - return std::nullopt; -} - -std::optional CharacterFinder::findFirstNot(std::string_view haystack, std::size_t offset, const std::vector & needles) -{ - if (auto position = findFirstNot({haystack.begin() + offset, haystack.end()}, needles)) - { - return offset + *position; - } - - return std::nullopt; -} - -std::optional BoundsSafeCharacterFinder::findFirst( - std::string_view haystack, std::size_t offset, const std::vector & needles -) const -{ - if (haystack.size() > offset) - { - return CharacterFinder::findFirst(haystack, offset, needles); - } - - return std::nullopt; -} - -std::optional BoundsSafeCharacterFinder::findFirstNot( - std::string_view haystack, std::size_t offset, const std::vector & needles -) const -{ - if (haystack.size() > offset) - { - return CharacterFinder::findFirstNot(haystack, offset, needles); - } - - return std::nullopt; -} - -} diff --git a/src/Functions/keyvaluepair/src/impl/state/strategies/util/CharacterFinder.h b/src/Functions/keyvaluepair/src/impl/state/strategies/util/CharacterFinder.h deleted file mode 100644 index 3f518a51c418..000000000000 --- a/src/Functions/keyvaluepair/src/impl/state/strategies/util/CharacterFinder.h +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace DB -{ - -class CharacterFinder -{ -public: - using Position = std::size_t; - virtual ~CharacterFinder() = default; - - static std::optional findFirst(std::string_view haystack, const std::vector & needles); - - static std::optional findFirst(std::string_view haystack, std::size_t offset, const std::vector & needles); - - static std::optional findFirstNot(std::string_view haystack, const std::vector & needles); - - static std::optional findFirstNot(std::string_view haystack, std::size_t offset, const std::vector & needles); - -}; - -/* - * Maybe decorator would be better :) - * */ -class BoundsSafeCharacterFinder -{ - using Position = std::size_t; -public: - std::optional findFirst(std::string_view haystack, std::size_t offset, const std::vector & needles) const; - - std::optional findFirstNot(std::string_view haystack, std::size_t offset, const std::vector & needles) const; -}; - -} - diff --git a/src/Functions/keyvaluepair/tests/gtest_character_finder.cpp b/src/Functions/keyvaluepair/tests/gtest_character_finder.cpp deleted file mode 100644 index aba637079a94..000000000000 --- a/src/Functions/keyvaluepair/tests/gtest_character_finder.cpp +++ /dev/null @@ -1,99 +0,0 @@ -#include -#include - - -namespace DB -{ -void test_find_first( - const CharacterFinder& finder, const std::string_view& haystack, - const std::vector& needles, const std::optional>& expected_result) -{ - auto pos = finder.findFirst(haystack, needles); - - ASSERT_EQ(pos.has_value(), expected_result.has_value()); - - if (expected_result) - { - auto [expected_character, expected_position] = *expected_result; - - ASSERT_EQ(expected_position, *pos); - ASSERT_EQ(expected_character, haystack[*pos]); - } -} - -void test_find_first_not(const CharacterFinder& finder, const std::string_view& haystack, const std::vector& needles, const std::optional>& expected_result) -{ - auto pos = finder.findFirstNot(haystack, needles); - - ASSERT_EQ(pos.has_value(), expected_result.has_value()); - - if (expected_result) - { - auto [expected_character, expected_position] = *expected_result; - - ASSERT_EQ(expected_position, *pos); - ASSERT_EQ(expected_character, haystack[*pos]); - } -} - -TEST(extractKVPair_CharacterFinderTest, FindFirst) -{ - CharacterFinder finder; - - // Test empty haystack - std::string_view empty; - std::vector needles = {'a', 'b', 'c'}; - test_find_first(finder, empty, needles, std::nullopt); - - // Test haystack with no needles - std::string_view no_needles = "hello world"; - std::vector empty_needles = {}; - test_find_first(finder, no_needles, empty_needles, std::nullopt); - - // Test haystack with a single needle - std::string_view single_needle = "hello world"; - std::vector single_needles = {'o'}; - test_find_first(finder, single_needle, single_needles, std::make_optional>('o', 4)); - - // Test haystack with multiple needles - std::string_view multiple_needles = "hello world"; - std::vector all_needles = {'a', 'e', 'i', 'o', 'u'}; - test_find_first(finder, multiple_needles, all_needles, std::make_optional>('e', 1)); - - // Test haystack with special characters - std::string_view special_needle = R"(this=that\other"thing")"; - std::vector special_needles = {'=', '\\', '"'}; - test_find_first(finder, special_needle, special_needles, std::make_optional>('=', 4)); -} - -TEST(extractKVPair_CharacterFinderTest, FindFirstNot) -{ - CharacterFinder finder; - -// Test empty haystack - std::string_view empty; - std::vector needles = {'a', 'b', 'c'}; - test_find_first_not(finder, empty, needles, std::nullopt); - - // Test haystack with no needles - std::string_view no_needles = "hello world"; - std::vector empty_needles = {}; - test_find_first_not(finder, no_needles, empty_needles, std::make_optional>('h', 0)); - - // Test haystack with a single needle - std::string_view single_needle = "hello world"; - std::vector single_needles = {'o'}; - test_find_first_not(finder, single_needle, single_needles, std::make_optional>('h', 0)); - -// Test haystack with multiple needles - std::string_view multiple_needles = "hello world"; - std::vector all_needles = {'h', 'e', 'l', 'o', 'w', 'r', 'd', ' '}; - test_find_first_not(finder, multiple_needles, all_needles, std::nullopt); - - // 17 byte long string, only loop takes place - std::string_view multiple_needles_haystack = "helloworldddddd"; - - test_find_first_not(finder, multiple_needles_haystack, {'h', 'e', 'l', 'o', 'w', 'r', 'd'}, std::nullopt); -} - -} diff --git a/src/Functions/keyvaluepair/tests/gtest_escaped_sequence_reader.cpp b/src/Functions/keyvaluepair/tests/gtest_escaped_sequence_reader.cpp deleted file mode 100644 index 8e9595aa2735..000000000000 --- a/src/Functions/keyvaluepair/tests/gtest_escaped_sequence_reader.cpp +++ /dev/null @@ -1,71 +0,0 @@ -//#include -//#include - -//namespace DB -//{ - -//void test(std::string_view input, const std::vector & expected_characters, std::size_t expected_bytes_read) -//{ -// auto read_result = extractKVPair_EscapedCharacterReader::read(input); - -// ASSERT_EQ(expected_characters, read_result.characters); - -// auto bytes_read = read_result.ptr - input.begin(); - -// ASSERT_EQ(bytes_read, expected_bytes_read); -//} - -//void test(std::string_view input, const std::vector & expected_characters) -//{ -// return test(input, expected_characters, input.size()); -//} - -//TEST(extractKVPair_EscapedCharacterReader, HexDigits) -//{ -//// test("\\xA", {0xA}); -// test("\\xAA", {0xAA}); -// test("\\xff", {0xFF}); -//// test("\\x0", {0x0}); -// test("\\x00", {0x0}); - -//// test("\\xA$", {0xA}, 3); -// test("\\xAA$", {0xAA}, 4); -// test("\\xff$", {0xFF}, 4); -//// test("\\x0$", {0x0}, 3); -// test("\\x00$", {0x0}, 4); - -// test("\\x", {}); -//} - -//TEST(extractKVPair_EscapedCharacterReader, OctalDigits) -//{ -// test("\\0377", {0xFF}); -// test("\\0137", {0x5F}); -// test("\\013", {0xB}); -// test("\\0000", {0x0}); -// test("\\000", {0x0}); -// test("\\00", {0x0}); - -// test("\\0377$", {0xFF}, 5); -// test("\\0137$", {0x5F}, 5); -// test("\\013$", {0xB}, 4); -// test("\\0000$", {0x0}, 5); -// test("\\000$", {0x0}, 4); -// test("\\00$", {0x0}, 3); -//} - -//TEST(extractKVPair_EscapedCharacterReader, RegularEscapeSequences) -//{ -// test("\\n", {0xA}, 2); -// test("\\r", {0xD}, 2); -// test("\\r", {0xD}, 2); -//} - -//TEST(extractKVPair_EscapedCharacterReader, RandomEscapedCharacters) -//{ -// test("\\h", {'\\', 'h'}); -// test("\\g", {'\\', 'g'}); -// test("\\", {}); -//} - -//} diff --git a/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp b/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp index 1be9254519ef..11fcb03ad7e4 100644 --- a/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include diff --git a/src/Functions/keyvaluepair/tests/gtest_extractKeyValuePairs.cpp b/src/Functions/keyvaluepair/tests/gtest_extractKeyValuePairs.cpp new file mode 100644 index 000000000000..1b114a4d1f0e --- /dev/null +++ b/src/Functions/keyvaluepair/tests/gtest_extractKeyValuePairs.cpp @@ -0,0 +1,137 @@ +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace +{ +using namespace DB; +using namespace std::literals; + +// Print as a map with a single row +auto ToColumnMap(const auto & keys, const auto & values, const ColumnPtr offsets = nullptr) +{ + return ColumnMap::create( + std::move(keys->clone()), + std::move(values->clone()), + offsets ? offsets : ColumnUInt64::create(1, keys->size()) + ); +} + +// Print as a map with a single row +std::string PrintMap(const auto & keys, const auto & values) +{ + auto map_column = ToColumnMap(keys, values); + auto serialization = DataTypeFactory::instance().get("Map(String, String)")->getSerialization(ISerialization::Kind::DEFAULT); + + WriteBufferFromOwnString buff; + serialization->serializeTextJSON(*map_column, 0, buff, FormatSettings{}); + + return std::move(buff.str()); +} + +template +struct Dump +{ + const T & value; + + friend std::ostream & operator<<(std::ostream & ostr, const Dump & d) + { + return dumpValue(ostr, d.value); + } +}; + +template +auto print_with_dump(const T & value) +{ + return Dump{value}; +} + +} + +struct KeyValuePairExtractorTestParam +{ + KeyValuePairExtractorBuilder builder; + std::string input; + std::vector> expected; +}; + +struct extractKVPair_KeyValuePairExtractorTest : public ::testing::TestWithParam +{}; + +TEST_P(extractKVPair_KeyValuePairExtractorTest, Match) +{ + const auto & [builder, input, expected] = GetParam(); + SCOPED_TRACE(input); + + auto kv_parser = builder.build(); + SCOPED_TRACE(typeid(kv_parser).name()); + + auto keys = ColumnString::create(); + auto values = ColumnString::create(); + + auto pairs_found = kv_parser->extract(input, keys, values); + ASSERT_EQ(expected.size(), pairs_found) + << "\texpected: " << print_with_dump(expected) << "\n" + << "\tactual : " << print_with_dump(*ToColumnMap(keys, values)); + + size_t i = 0; + for (const auto & expected_kv : expected) + { + EXPECT_EQ(expected_kv.first, keys->getDataAt(i)) + << fancyQuote(expected_kv.first) << "\nvs\n" + << fancyQuote(keys->getDataAt(i)); + + EXPECT_EQ(expected_kv.second, values->getDataAt(i)) + << fancyQuote(expected_kv.second) << "\nvs\n" + << fancyQuote(values->getDataAt(i)); + + ++i; + } +} + +const std::vector> neymar_expected{ + {"name","neymar"}, + {"age","31"}, + {"team","psg"}, + {"nationality","brazil"}, + {"last_key","last_value"} +}; + +INSTANTIATE_TEST_SUITE_P(Simple, extractKVPair_KeyValuePairExtractorTest, + ::testing::ValuesIn(std::initializer_list + { + { + KeyValuePairExtractorBuilder().withQuotingCharacter('\''), + R"in(name:'neymar';'age':31;team:psg;nationality:brazil,last_key:last_value)in", + neymar_expected + }, + { + // Different escaping char + KeyValuePairExtractorBuilder().withQuotingCharacter('"'), + R"in(name:"neymar";"age":31;team:psg;nationality:brazil,last_key:last_value)in", + neymar_expected + }, + { + // same as case 1, but with another handler + KeyValuePairExtractorBuilder().withQuotingCharacter('\'').withEscaping(), + R"in(name:'neymar';'age':31;team:psg;nationality:brazil,last_key:last_value)in", + neymar_expected + } + } + ) +); diff --git a/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp b/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp index a8f4aaaa35cc..b439f7724164 100644 --- a/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp @@ -1,4 +1,4 @@ -#include +#include #include namespace DB diff --git a/src/Functions/keyvaluepair/tests/gtest_inline_escaping_value_state_handler.cpp b/src/Functions/keyvaluepair/tests/gtest_inline_escaping_value_state_handler.cpp index 6eeed4dc7b8e..fc8b453106a5 100644 --- a/src/Functions/keyvaluepair/tests/gtest_inline_escaping_value_state_handler.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_inline_escaping_value_state_handler.cpp @@ -1,4 +1,4 @@ -#include +#include #include namespace DB diff --git a/src/Functions/keyvaluepair/tests/gtest_key_value_pair_extractor.cpp b/src/Functions/keyvaluepair/tests/gtest_key_value_pair_extractor.cpp index c7245c8af27e..13b34ec48d32 100644 --- a/src/Functions/keyvaluepair/tests/gtest_key_value_pair_extractor.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_key_value_pair_extractor.cpp @@ -1,4 +1,4 @@ -//#include +//#include //#include // diff --git a/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp b/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp index 1bf2d71c1282..6a324afb9687 100644 --- a/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp @@ -1,4 +1,4 @@ -#include +#include #include namespace DB From 73430b8d57bf404693be569c5b1f33636ea67aa3 Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Wed, 29 Mar 2023 18:37:44 +0200 Subject: [PATCH 09/12] Minor adjustments --- .../keyvaluepair/impl/InlineEscapingStateHandler.cpp | 3 ++- src/Functions/keyvaluepair/impl/NoEscapingStateHandler.cpp | 5 ++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.cpp b/src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.cpp index 5147bd24b2ac..a55e81d1f964 100644 --- a/src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.cpp +++ b/src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.cpp @@ -44,6 +44,7 @@ NextState InlineEscapingStateHandler::waitKey(std::string_view file) const const size_t character_position = p - file.begin(); if (*p == quoting_character) { + // +1 to skip quoting character return {character_position + 1u, State::READING_QUOTED_KEY}; } else @@ -63,7 +64,7 @@ NextState InlineEscapingStateHandler::waitKey(std::string_view file) const NextState InlineEscapingStateHandler::readKey(std::string_view file, KeyType & key) const { - const auto & [key_value_delimiter, quoting_character, pair_delimiters] = extractor_configuration; + const auto & [key_value_delimiter, _, pair_delimiters] = extractor_configuration; key.clear(); diff --git a/src/Functions/keyvaluepair/impl/NoEscapingStateHandler.cpp b/src/Functions/keyvaluepair/impl/NoEscapingStateHandler.cpp index 63d5df45d5d4..0f34f7803291 100644 --- a/src/Functions/keyvaluepair/impl/NoEscapingStateHandler.cpp +++ b/src/Functions/keyvaluepair/impl/NoEscapingStateHandler.cpp @@ -53,8 +53,7 @@ NextState NoEscapingStateHandler::waitKey(std::string_view file) const NextState NoEscapingStateHandler::readKey(std::string_view file, KeyType & key) const { - const auto & [key_value_delimiter, quoting_character, pair_delimiters] - = extractor_configuration; + const auto & [key_value_delimiter, _, pair_delimiters] = extractor_configuration; key = {}; @@ -156,7 +155,7 @@ NextState NoEscapingStateHandler::waitValue(std::string_view file) const NextState NoEscapingStateHandler::readValue(std::string_view file, ValueType & value) const { - const auto & [key_value_delimiter, quoting_character, pair_delimiters] = extractor_configuration; + const auto & [key_value_delimiter, _, pair_delimiters] = extractor_configuration; value = {}; From 4b7005b8b17b261a87785018ace64807483f5072 Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Thu, 30 Mar 2023 09:24:41 +0200 Subject: [PATCH 10/12] Changes discussed with Arthur --- src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h | 2 +- .../keyvaluepair/impl/InlineEscapingStateHandler.cpp | 7 +++++-- src/Functions/keyvaluepair/impl/NoEscapingStateHandler.cpp | 3 --- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h index fe022551eb52..0a1bc4fe5102 100644 --- a/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h +++ b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h @@ -76,7 +76,7 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor } data.remove_prefix(next_state.position_in_string); - state = next_state.state; + state = next_state.state; // No state expects empty input if (data.size() == 0) diff --git a/src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.cpp b/src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.cpp index a55e81d1f964..198d09d28967 100644 --- a/src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.cpp +++ b/src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.cpp @@ -190,6 +190,9 @@ NextState InlineEscapingStateHandler::readValue(std::string_view file, ValueType next_pos = character_position + escape_seq_len; if (escape_seq_len == 0) { + // It is agreed that value with an invalid escape seqence in it + // is considered malformed and shoudn't be included in result. + value.clear(); return {next_pos, State::WAITING_KEY}; } } @@ -211,7 +214,7 @@ NextState InlineEscapingStateHandler::readValue(std::string_view file, ValueType // Reached end of input, consume rest of the file as value and make sure KV pair is produced. value.insert(value.end(), file.begin() + pos, file.end()); - return {pos, State::FLUSH_PAIR}; + return {file.size(), State::FLUSH_PAIR}; } NextState InlineEscapingStateHandler::readQuotedValue(std::string_view file, ValueType & value) const @@ -246,7 +249,7 @@ NextState InlineEscapingStateHandler::readQuotedValue(std::string_view file, Val pos = next_pos; } - return {pos, State::END}; + return {file.size(), State::END}; } } diff --git a/src/Functions/keyvaluepair/impl/NoEscapingStateHandler.cpp b/src/Functions/keyvaluepair/impl/NoEscapingStateHandler.cpp index 0f34f7803291..68f18ffca838 100644 --- a/src/Functions/keyvaluepair/impl/NoEscapingStateHandler.cpp +++ b/src/Functions/keyvaluepair/impl/NoEscapingStateHandler.cpp @@ -148,9 +148,6 @@ NextState NoEscapingStateHandler::waitValue(std::string_view file) const } return {pos, State::READING_VALUE}; - -// // TODO: can't bre reached, remove it? -// return {file.size(), State::READING_VALUE}; } NextState NoEscapingStateHandler::readValue(std::string_view file, ValueType & value) const From 31a8a7b8638d6f8645da9ddb755623981c2bc7d8 Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Thu, 30 Mar 2023 09:25:10 +0200 Subject: [PATCH 11/12] Fixed unit-tests --- ...test_escaping_key_value_pair_extractor.cpp | 4 ++ .../tests/gtest_extractKeyValuePairs.cpp | 65 ++++++++++++++++++- ...test_inline_escaping_key_state_handler.cpp | 61 +++++++++-------- ...st_inline_escaping_value_state_handler.cpp | 21 ++++-- .../gtest_no_escaping_key_state_handler.cpp | 44 ++++++++----- 5 files changed, 144 insertions(+), 51 deletions(-) diff --git a/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp b/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp index 11fcb03ad7e4..91ec1ca74a05 100644 --- a/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp @@ -1,4 +1,8 @@ #include +#include + +#include + #include #include diff --git a/src/Functions/keyvaluepair/tests/gtest_extractKeyValuePairs.cpp b/src/Functions/keyvaluepair/tests/gtest_extractKeyValuePairs.cpp index 1b114a4d1f0e..b259cf1a2efb 100644 --- a/src/Functions/keyvaluepair/tests/gtest_extractKeyValuePairs.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_extractKeyValuePairs.cpp @@ -104,7 +104,8 @@ TEST_P(extractKVPair_KeyValuePairExtractorTest, Match) } } -const std::vector> neymar_expected{ +using ExpectedValues = std::vector>; +const ExpectedValues neymar_expected{ {"name","neymar"}, {"age","31"}, {"team","psg"}, @@ -135,3 +136,65 @@ INSTANTIATE_TEST_SUITE_P(Simple, extractKVPair_KeyValuePairExtractorTest, } ) ); + +// It is agreed that value with an invalid escape seqence in it is considered malformed and shoudn't be included in result. +INSTANTIATE_TEST_SUITE_P(InvalidEscapeSeqInValue, extractKVPair_KeyValuePairExtractorTest, + ::testing::ValuesIn(std::initializer_list + { + { + KeyValuePairExtractorBuilder().withEscaping(), + R"in(valid_key:valid_value key:invalid_val\\ third_key:third_value)in", + ExpectedValues{ + {"valid_key", "valid_value"}, + {"third_key", "third_value"} + } + }, + { + // Special case when invalid seq is the last symbol + KeyValuePairExtractorBuilder().withEscaping(), + R"in(valid_key:valid_value key:invalid_val\\)in", + ExpectedValues{ + {"valid_key", "valid_value"} + } + }, + { + KeyValuePairExtractorBuilder().withEscaping().withQuotingCharacter('"'), + R"in(valid_key:valid_value key:"invalid val\\ " "third key":"third value")in", + ExpectedValues{ + {"valid_key", "valid_value"}, + {"third key", "third value"} + } + }, + // Not handling escape sequences == do not care of broken one, `invalid_val\` must be present + { + KeyValuePairExtractorBuilder(), + R"in(valid_key:valid_value key:invalid_val\ third_key:third_value)in", + ExpectedValues{ + {"valid_key", "valid_value"}, + {"key", "invalid_val\\"}, + {"third_key", "third_value"} + } + }, + { + // Special case when invalid seq is the last symbol + KeyValuePairExtractorBuilder(), + R"in(valid_key:valid_value key:invalid_val\)in", + ExpectedValues{ + {"valid_key", "valid_value"}, + {"key", "invalid_val\\"} + } + }, + { + KeyValuePairExtractorBuilder().withQuotingCharacter('"'), + R"in(valid_key:valid_value key:"invalid val\ " "third key":"third value")in", + ExpectedValues{ + {"valid_key", "valid_value"}, + {"key", "invalid val\\ "}, + {"third key", "third value"}, + } + }, + } + ) +); + + diff --git a/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp b/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp index b439f7724164..c812aac99851 100644 --- a/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp @@ -1,19 +1,28 @@ #include #include -namespace DB +#include +#include + +namespace { -void test_wait(const InlineEscapingKeyStateHandler & handler, std::string_view input, std::size_t expected_pos, State expected_state) +using namespace DB; +using namespace DB::extractKV; + +using State = extractKV::StateHandler::State; +using NextState = extractKV::StateHandler::NextState; + +void test_wait(const InlineEscapingStateHandler & handler, std::string_view input, std::size_t expected_pos, State expected_state) { - auto next_state = handler.wait(input); + auto next_state = handler.waitKey(input); ASSERT_EQ(next_state.position_in_string, expected_pos); ASSERT_EQ(next_state.state, expected_state); } template -void test_read(const InlineEscapingKeyStateHandler & handler, std::string_view input, std::string_view expected_element, +void test_read(const InlineEscapingStateHandler & handler, std::string_view input, std::string_view expected_element, std::size_t expected_pos, State expected_state) { NextState next_state; @@ -21,11 +30,11 @@ void test_read(const InlineEscapingKeyStateHandler & handler, std::string_view i if constexpr (quoted) { - next_state = handler.readQuoted(input, element); + next_state = handler.readQuotedKey(input, element); } else { - next_state = handler.read(input, element); + next_state = handler.readKey(input, element); } ASSERT_EQ(next_state.position_in_string, expected_pos); @@ -33,32 +42,34 @@ void test_read(const InlineEscapingKeyStateHandler & handler, std::string_view i ASSERT_EQ(element, expected_element); } -void test_read(const InlineEscapingKeyStateHandler & handler, std::string_view input, std::string_view expected_element, +void test_read(const InlineEscapingStateHandler & handler, std::string_view input, std::string_view expected_element, std::size_t expected_pos, State expected_state) { test_read(handler, input, expected_element, expected_pos, expected_state); } -void test_read_quoted(const InlineEscapingKeyStateHandler & handler, std::string_view input, std::string_view expected_element, +void test_read_quoted(const InlineEscapingStateHandler & handler, std::string_view input, std::string_view expected_element, std::size_t expected_pos, State expected_state) { test_read(handler, input, expected_element, expected_pos, expected_state); } +} + TEST(extractKVPair_InlineEscapingKeyStateHandler, Wait) { auto pair_delimiters = std::vector{',', ' '}; auto configuration = ConfigurationFactory::createWithEscaping(':', '"', pair_delimiters); - InlineEscapingKeyStateHandler handler(configuration); + InlineEscapingStateHandler handler(configuration); - test_wait(handler, "name", 0u, READING_KEY); - test_wait(handler, "\\:name", 2u, READING_KEY); - test_wait(handler, R"(\\"name)", 3u, READING_QUOTED_KEY); + test_wait(handler, "name", 0u, State::READING_KEY); + test_wait(handler, "\\:name", 2u, State::READING_KEY); + test_wait(handler, R"(\\"name)", 3u, State::READING_QUOTED_KEY); - test_wait(handler, "", 0u, END); - test_wait(handler, "\\\\", 2u, END); + test_wait(handler, "", 0u, State::END); + test_wait(handler, "\\\\", 2u, State::END); } TEST(extractKVPair_InlineEscapingKeyStateHandler, Read) @@ -67,7 +78,7 @@ TEST(extractKVPair_InlineEscapingKeyStateHandler, Read) auto configuration = ConfigurationFactory::createWithEscaping(':', '"', pair_delimiters); - InlineEscapingKeyStateHandler handler(configuration); + InlineEscapingStateHandler handler(configuration); std::string key_str = "name"; std::string key_with_delimiter_str = key_str + ':'; @@ -75,15 +86,15 @@ TEST(extractKVPair_InlineEscapingKeyStateHandler, Read) std::string key_with_delimiter_and_random_characters_str = key_str + ':' + "a$a\\:''\""; // no delimiter, should discard - test_read(handler, key_str, "", key_str.size(), END); + test_read(handler, key_str, "", key_str.size(), State::END); // valid - test_read(handler, key_with_delimiter_str, key_str, key_with_delimiter_str.size(), WAITING_VALUE); + test_read(handler, key_with_delimiter_str, key_str, key_with_delimiter_str.size(), State::WAITING_VALUE); // valid as well - test_read(handler, key_with_delimiter_and_random_characters_str, key_str, key_with_delimiter_str.size(), WAITING_VALUE); + test_read(handler, key_with_delimiter_and_random_characters_str, key_str, key_with_delimiter_str.size(), State::WAITING_VALUE); - test_read(handler, "", "", 0u, END); + test_read(handler, "", "", 0u, State::END); } TEST(extractKVPair_InlineEscapingKeyStateHandler, ReadEnclosed) @@ -92,7 +103,7 @@ TEST(extractKVPair_InlineEscapingKeyStateHandler, ReadEnclosed) auto configuration = ConfigurationFactory::createWithEscaping(':', '"', pair_delimiters); - InlineEscapingKeyStateHandler handler(configuration); + InlineEscapingStateHandler handler(configuration); std::string regular_key = "name"; std::string regular_key_with_end_quote = regular_key + "\""; @@ -101,10 +112,8 @@ TEST(extractKVPair_InlineEscapingKeyStateHandler, ReadEnclosed) std::string key_with_escape_character = regular_key + R"(\n\x4E")"; - test_read_quoted(handler, regular_key, "", regular_key.size(), END); - test_read_quoted(handler, regular_key_with_end_quote, regular_key, regular_key_with_end_quote.size(), READING_KV_DELIMITER); - test_read_quoted(handler, key_with_special_characters_with_end_quote, key_with_special_characters, key_with_special_characters_with_end_quote.size(), READING_KV_DELIMITER); - test_read_quoted(handler, key_with_escape_character, regular_key + "\nN", key_with_escape_character.size(), READING_KV_DELIMITER); -} - + test_read_quoted(handler, regular_key, "", regular_key.size(), State::END); + test_read_quoted(handler, regular_key_with_end_quote, regular_key, regular_key_with_end_quote.size(), State::READING_KV_DELIMITER); + test_read_quoted(handler, key_with_special_characters_with_end_quote, key_with_special_characters, key_with_special_characters_with_end_quote.size(), State::READING_KV_DELIMITER); + test_read_quoted(handler, key_with_escape_character, regular_key + "\nN", key_with_escape_character.size(), State::READING_KV_DELIMITER); } diff --git a/src/Functions/keyvaluepair/tests/gtest_inline_escaping_value_state_handler.cpp b/src/Functions/keyvaluepair/tests/gtest_inline_escaping_value_state_handler.cpp index fc8b453106a5..53382de320d7 100644 --- a/src/Functions/keyvaluepair/tests/gtest_inline_escaping_value_state_handler.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_inline_escaping_value_state_handler.cpp @@ -1,25 +1,34 @@ #include + +#include #include -namespace DB +namespace { +using namespace DB; +using namespace DB::extractKV; + +using State = extractKV::StateHandler::State; +using NextState = extractKV::StateHandler::NextState; + + void test_wait(const auto & handler, std::string_view input, std::size_t expected_pos, State expected_state) { - auto next_state = handler.wait(input); + auto next_state = handler.waitValue(input); ASSERT_EQ(next_state.position_in_string, expected_pos); ASSERT_EQ(next_state.state, expected_state); } +} + TEST(extractKVPair_InlineEscapingValueStateHandler, Wait) { auto pair_delimiters = std::vector {','}; auto configuration = ConfigurationFactory::createWithEscaping(':', '"', pair_delimiters); - InlineEscapingValueStateHandler handler(configuration); - - test_wait(handler, " los$ yours3lf", 0u, READING_VALUE); -} + InlineEscapingStateHandler handler(configuration); + test_wait(handler, " los$ yours3lf", 0u, State::READING_VALUE); } diff --git a/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp b/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp index 6a324afb9687..75cf33713932 100644 --- a/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp @@ -1,12 +1,19 @@ -#include +#include +#include + #include -namespace DB +namespace { +using namespace DB; +using namespace DB::extractKV; + +using State = extractKV::StateHandler::State; +using NextState = extractKV::StateHandler::NextState; void test_wait(const auto & handler, std::string_view input, std::size_t expected_pos, State expected_state) { - auto next_state = handler.wait(input); + auto next_state = handler.waitKey(input); ASSERT_EQ(next_state.position_in_string, expected_pos); ASSERT_EQ(next_state.state, expected_state); @@ -21,11 +28,11 @@ void test_read(const auto & handler, std::string_view input, std::string_view ex if constexpr (quoted) { - next_state = handler.readQuoted(input, element); + next_state = handler.readQuotedKey(input, element); } else { - next_state = handler.read(input, element); + next_state = handler.readKey(input, element); } ASSERT_EQ(next_state.position_in_string, expected_pos); @@ -45,23 +52,25 @@ void test_read_quoted(const auto & handler, std::string_view input, std::string_ test_read(handler, input, expected_element, expected_pos, expected_state); } +} + TEST(extractKVPair_NoEscapingKeyStateHandler, Wait) { auto pair_delimiters = std::vector{',', ' ', '$'}; auto configuration = ConfigurationFactory::createWithEscaping(':', '"', pair_delimiters); - NoEscapingKeyStateHandler handler(configuration); + NoEscapingStateHandler handler(configuration); - test_wait(handler, "name", 0u, READING_KEY); - test_wait(handler, "\\:name", 0u, READING_KEY); + test_wait(handler, "name", 0u, State::READING_KEY); + test_wait(handler, "\\:name", 0u, State::READING_KEY); // quoted expected pos is + 1 because as of now it is skipped, maybe I should change it - test_wait(handler, "\"name", 1u, READING_QUOTED_KEY); + test_wait(handler, "\"name", 1u, State::READING_QUOTED_KEY); - test_wait(handler, ", $name", 3u, READING_KEY); - test_wait(handler, ", $\"name", 4u, READING_QUOTED_KEY); + test_wait(handler, ", $name", 3u, State::READING_KEY); + test_wait(handler, ", $\"name", 4u, State::READING_QUOTED_KEY); - test_wait(handler, "", 0u, END); + test_wait(handler, "", 0u, State::END); } TEST(extractKVPair_NoEscapingKeyStateHandler, Read) @@ -70,7 +79,7 @@ TEST(extractKVPair_NoEscapingKeyStateHandler, Read) auto configuration = ConfigurationFactory::createWithEscaping(':', '"', pair_delimiters); - NoEscapingKeyStateHandler handler(configuration); + NoEscapingStateHandler handler(configuration); std::string key_str = "name"; std::string key_with_delimiter_str = key_str + ':'; @@ -78,15 +87,14 @@ TEST(extractKVPair_NoEscapingKeyStateHandler, Read) std::string key_with_delimiter_and_random_characters_str = key_str + ':' + "a$a\\:''\""; // no delimiter, should discard - test_read(handler, key_str, "", key_str.size(), END); + test_read(handler, key_str, "", key_str.size(), State::END); // valid - test_read(handler, key_with_delimiter_str, key_str, key_with_delimiter_str.size(), WAITING_VALUE); + test_read(handler, key_with_delimiter_str, key_str, key_with_delimiter_str.size(), State::WAITING_VALUE); // valid as well - test_read(handler, key_with_delimiter_and_random_characters_str, key_str, key_with_delimiter_str.size(), WAITING_VALUE); + test_read(handler, key_with_delimiter_and_random_characters_str, key_str, key_with_delimiter_str.size(), State::WAITING_VALUE); - test_read(handler, "", "", 0u, END); + test_read(handler, "", "", 0u, State::END); } -} From 89ebe61de851ee4993b823fc092eb621f40fd41a Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Thu, 30 Mar 2023 15:21:22 +0200 Subject: [PATCH 12/12] Implemented writing to ColumnString directly via StringWriter --- .../impl/CHKeyValuePairExtractor.h | 47 +++++----- .../impl/InlineEscapingStateHandler.cpp | 48 +++++----- .../impl/InlineEscapingStateHandler.h | 11 +-- .../impl/NoEscapingStateHandler.cpp | 40 ++++----- .../impl/NoEscapingStateHandler.h | 11 +-- .../keyvaluepair/impl/StateHandler.h | 88 +++++++++++++++++-- ...test_escaping_key_value_pair_extractor.cpp | 4 +- .../tests/gtest_extractKeyValuePairs.cpp | 4 +- ...test_inline_escaping_key_state_handler.cpp | 8 +- .../gtest_no_escaping_key_state_handler.cpp | 8 +- 10 files changed, 169 insertions(+), 100 deletions(-) diff --git a/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h index 0a1bc4fe5102..aa9e421c901a 100644 --- a/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h +++ b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h @@ -7,6 +7,7 @@ #include #include +// TODO: debug stuff, remove it before merging #include #include @@ -16,9 +17,6 @@ namespace DB template class CHKeyValuePairExtractor : public KeyValuePairExtractor { - using Key = typename StateHandler::KeyType; - using Value = typename StateHandler::ValueType; - using State = typename DB::extractKV::StateHandler::State; using NextState = DB::extractKV::StateHandler::NextState; @@ -37,8 +35,8 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor // std::cerr << "CHKeyValuePairExtractor::extract: \"" << data << "\"" << std::endl; auto state = State::WAITING_KEY; - Key key; - Value value; + extractKV::StringWriter key(*keys); + extractKV::StringWriter value(*values); uint64_t row_offset = 0; const auto & config = state_handler.extractor_configuration; @@ -59,7 +57,7 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor << fancyQuote(data) << std::endl; - next_state = processState(data, state, key, value, keys, values, row_offset); + next_state = processState(data, state, key, value, row_offset); std::cerr << "CHKeyValuePairExtractor::extract 2, new_state: " << magic_enum::enum_name(next_state.state) @@ -83,18 +81,21 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor break; } + // TODO (vnemkov): consider removing, we should reach FLUSH_PAIR state from state machine. // if break occured earlier, consume previously generated pair - if (state == State::FLUSH_PAIR || !(key.empty() && value.empty())) - flushPair(data, key, value, keys, values, row_offset); + if (state == State::FLUSH_PAIR || !(key.isEmpty() && value.isEmpty())) + flushPair(data, key, value, row_offset); + + keys->validate(); + values->validate(); return row_offset; } private: - NextState processState(std::string_view file, State state, Key & key, - Value & value, ColumnString::MutablePtr & keys, - ColumnString::MutablePtr & values, uint64_t & row_offset) + NextState processState(std::string_view file, State state, extractKV::StringWriter & key, + extractKV::StringWriter & value, uint64_t & row_offset) { switch (state) { @@ -103,13 +104,13 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor case State::READING_KEY: { auto result = state_handler.readKey(file, key); - std::cerr << "CHKeyValuePairExtractor::processState key: " << fancyQuote(key) << std::endl; + std::cerr << "CHKeyValuePairExtractor::processState key: " << fancyQuote(key.uncommittedChunk()) << std::endl; return result; } case State::READING_QUOTED_KEY: { auto result = state_handler.readQuotedKey(file, key); - std::cerr << "CHKeyValuePairExtractor::processState key: " << fancyQuote(key) << std::endl; + std::cerr << "CHKeyValuePairExtractor::processState key: " << fancyQuote(key.uncommittedChunk()) << std::endl; return result; } case State::READING_KV_DELIMITER: @@ -121,33 +122,29 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor case State::READING_VALUE: { auto result = state_handler.readValue(file, value); - std::cerr << "CHKeyValuePairExtractor::processState value: " << fancyQuote(value) << std::endl; + std::cerr << "CHKeyValuePairExtractor::processState value: " << fancyQuote(value.uncommittedChunk()) << std::endl; return result; } case State::READING_QUOTED_VALUE: { auto result = state_handler.readQuotedValue(file, value); - std::cerr << "CHKeyValuePairExtractor::processState value: " << fancyQuote(value) << std::endl; + std::cerr << "CHKeyValuePairExtractor::processState value: " << fancyQuote(value.uncommittedChunk()) << std::endl; return result; } case State::FLUSH_PAIR: - return flushPair(file, key, value, keys, values, row_offset); + return flushPair(file, key, value, row_offset); case State::END: return {0, state}; } } - NextState flushPair(const std::string_view & file, Key & key, - Value & value, ColumnString::MutablePtr & keys, - ColumnString::MutablePtr & values, uint64_t & row_offset) + NextState flushPair(const std::string_view & file, extractKV::StringWriter & key, + extractKV::StringWriter & value, uint64_t & row_offset) { - std::cerr << "CHKeyValuePairExtractor::flushPair key: " << fancyQuote(key) << ", value: " << fancyQuote(value) << std::endl; - keys->insertData(key.data(), key.size()); - values->insertData(value.data(), value.size()); - - key = {}; - value = {}; + std::cerr << "CHKeyValuePairExtractor::flushPair key: " << fancyQuote(key.uncommittedChunk()) << ", value: " << fancyQuote(value.uncommittedChunk()) << std::endl; + key.commit(); + value.commit(); ++row_offset; std::cerr << "CHKeyValuePairExtractor::flushPair total pairs: " << row_offset << std::endl; diff --git a/src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.cpp b/src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.cpp index 198d09d28967..06daf780378d 100644 --- a/src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.cpp +++ b/src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.cpp @@ -7,12 +7,15 @@ namespace { -size_t consumeWithEscapeSequence(std::string_view file, size_t start_pos, size_t character_pos, std::string & output) +size_t consumeWithEscapeSequence(std::string_view file, size_t start_pos, size_t character_pos, DB::extractKV::StringWriter & output) { - output.insert(output.end(), file.begin() + start_pos, file.begin() + character_pos); + output.append(file.begin() + start_pos, file.begin() + character_pos); + std::string tmp_out; DB::ReadBufferFromMemory buf(file.begin() + character_pos, file.size() - character_pos); - DB::parseComplexEscapeSequence(output, buf); + + DB::parseComplexEscapeSequence(tmp_out, buf); + output.append(tmp_out); return buf.getPosition(); } @@ -62,11 +65,11 @@ NextState InlineEscapingStateHandler::waitKey(std::string_view file) const * If I find a key value delimiter and that is empty, I do not need to copy? hm,m hm hm * */ -NextState InlineEscapingStateHandler::readKey(std::string_view file, KeyType & key) const +NextState InlineEscapingStateHandler::readKey(std::string_view file, StringWriter & key) const { const auto & [key_value_delimiter, _, pair_delimiters] = extractor_configuration; - key.clear(); + key.reset(); size_t pos = 0; while (const auto * p = find_first_symbols_or_null({file.begin() + pos, file.end()}, read_needles)) @@ -83,9 +86,10 @@ NextState InlineEscapingStateHandler::readKey(std::string_view file, KeyType & k return {next_pos, State::WAITING_KEY}; } } - else if (*p == key_value_delimiter) + else + if (*p == key_value_delimiter) { - key.insert(key.end(), file.begin() + pos, file.begin() + character_position); + key.append(file.begin() + pos, file.begin() + character_position); return {next_pos, State::WAITING_VALUE}; } @@ -102,11 +106,11 @@ NextState InlineEscapingStateHandler::readKey(std::string_view file, KeyType & k return {file.size(), State::END}; } -NextState InlineEscapingStateHandler::readQuotedKey(std::string_view file, KeyType & key) const +NextState InlineEscapingStateHandler::readQuotedKey(std::string_view file, StringWriter & key) const { const auto quoting_character = extractor_configuration.quoting_character; - key.clear(); + key.reset(); size_t pos = 0; while (const auto * p = find_first_symbols_or_null({file.begin() + pos, file.end()}, read_quoted_needles)) @@ -126,9 +130,9 @@ NextState InlineEscapingStateHandler::readQuotedKey(std::string_view file, KeyTy } else if (*p == quoting_character) { - key.insert(key.end(), file.begin() + pos, file.begin() + character_position); + key.append(file.begin() + pos, file.begin() + character_position); - if (key.empty()) + if (key.isEmpty()) { return {next_pos, State::WAITING_KEY}; } @@ -172,11 +176,11 @@ NextState InlineEscapingStateHandler::waitValue(std::string_view file) const return {pos, State::READING_VALUE}; } -NextState InlineEscapingStateHandler::readValue(std::string_view file, ValueType & value) const +NextState InlineEscapingStateHandler::readValue(std::string_view file, StringWriter & value) const { const auto & [key_value_delimiter, _, pair_delimiters] = extractor_configuration; - value.clear(); + value.reset(); size_t pos = 0; while (const auto * p = find_first_symbols_or_null({file.begin() + pos, file.end()}, read_needles)) @@ -192,11 +196,12 @@ NextState InlineEscapingStateHandler::readValue(std::string_view file, ValueType { // It is agreed that value with an invalid escape seqence in it // is considered malformed and shoudn't be included in result. - value.clear(); + value.reset(); return {next_pos, State::WAITING_KEY}; } } - else if (*p == key_value_delimiter) + else + if (*p == key_value_delimiter) { // reached new key return {next_pos, State::WAITING_KEY}; @@ -204,7 +209,7 @@ NextState InlineEscapingStateHandler::readValue(std::string_view file, ValueType else if (std::find(pair_delimiters.begin(), pair_delimiters.end(), *p) != pair_delimiters.end()) { // reached next pair - value.insert(value.end(), file.begin() + pos, file.begin() + character_position); + value.append(file.begin() + pos, file.begin() + character_position); return {next_pos, State::FLUSH_PAIR}; } @@ -213,17 +218,17 @@ NextState InlineEscapingStateHandler::readValue(std::string_view file, ValueType } // Reached end of input, consume rest of the file as value and make sure KV pair is produced. - value.insert(value.end(), file.begin() + pos, file.end()); + value.append(file.begin() + pos, file.end()); return {file.size(), State::FLUSH_PAIR}; } -NextState InlineEscapingStateHandler::readQuotedValue(std::string_view file, ValueType & value) const +NextState InlineEscapingStateHandler::readQuotedValue(std::string_view file, StringWriter & value) const { const auto quoting_character = extractor_configuration.quoting_character; size_t pos = 0; - value.clear(); + value.reset(); while (const auto * p = find_first_symbols_or_null({file.begin() + pos, file.end()}, read_quoted_needles)) { @@ -239,9 +244,10 @@ NextState InlineEscapingStateHandler::readQuotedValue(std::string_view file, Val return {next_pos, State::WAITING_KEY}; } } - else if (*p == quoting_character) + else + if (*p == quoting_character) { - value.insert(value.end(), file.begin() + pos, file.begin() + character_position); + value.append(file.begin() + pos, file.begin() + character_position); return {next_pos, State::FLUSH_PAIR}; } diff --git a/src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.h b/src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.h index 120d2f3d95bb..2d532a4938d9 100644 --- a/src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.h +++ b/src/Functions/keyvaluepair/impl/InlineEscapingStateHandler.h @@ -16,18 +16,15 @@ namespace extractKV class InlineEscapingStateHandler : public StateHandler { public: - using KeyType = std::string; - using ValueType = std::string; - explicit InlineEscapingStateHandler(Configuration configuration_); [[nodiscard]] NextState waitKey(std::string_view file) const; - [[nodiscard]] NextState readKey(std::string_view file, KeyType & key) const; - [[nodiscard]] NextState readQuotedKey(std::string_view file, KeyType & key) const; + [[nodiscard]] NextState readKey(std::string_view file, StringWriter & key) const; + [[nodiscard]] NextState readQuotedKey(std::string_view file, StringWriter & key) const; [[nodiscard]] NextState readKeyValueDelimiter(std::string_view file) const; [[nodiscard]] NextState waitValue(std::string_view file) const; - [[nodiscard]] NextState readValue(std::string_view file, ValueType & value) const; - [[nodiscard]] NextState readQuotedValue(std::string_view file, ValueType & value) const; + [[nodiscard]] NextState readValue(std::string_view file, StringWriter & value) const; + [[nodiscard]] NextState readQuotedValue(std::string_view file, StringWriter & value) const; const Configuration extractor_configuration; diff --git a/src/Functions/keyvaluepair/impl/NoEscapingStateHandler.cpp b/src/Functions/keyvaluepair/impl/NoEscapingStateHandler.cpp index 68f18ffca838..e3715bb1bca2 100644 --- a/src/Functions/keyvaluepair/impl/NoEscapingStateHandler.cpp +++ b/src/Functions/keyvaluepair/impl/NoEscapingStateHandler.cpp @@ -3,15 +3,11 @@ #include #include +#include "Functions/keyvaluepair/impl/StateHandler.h" namespace { -std::string_view createElement(std::string_view file, std::size_t begin, std::size_t end) -{ - return std::string_view{file.begin() + begin, file.begin() + end}; -} - using NextState = DB::extractKV::StateHandler::NextState; } @@ -51,11 +47,11 @@ NextState NoEscapingStateHandler::waitKey(std::string_view file) const return {file.size(), State::END}; } -NextState NoEscapingStateHandler::readKey(std::string_view file, KeyType & key) const +NextState NoEscapingStateHandler::readKey(std::string_view file, StringWriter & key) const { const auto & [key_value_delimiter, _, pair_delimiters] = extractor_configuration; - key = {}; + key.reset(); size_t pos = 0; auto start_index = pos; @@ -67,9 +63,9 @@ NextState NoEscapingStateHandler::readKey(std::string_view file, KeyType & key) if (*p == key_value_delimiter) { - key = createElement(file, start_index, character_position); + key.append(file.begin() + start_index, file.begin() + character_position); - if (key.empty()) + if (key.isEmpty()) { return {next_pos, State::WAITING_KEY}; } @@ -88,25 +84,25 @@ NextState NoEscapingStateHandler::readKey(std::string_view file, KeyType & key) return {file.size(), State::END}; } -NextState NoEscapingStateHandler::readQuotedKey(std::string_view file, KeyType & key) const +NextState NoEscapingStateHandler::readQuotedKey(std::string_view file, StringWriter & key) const { const auto quoting_character = extractor_configuration.quoting_character; - key = {}; + key.reset(); size_t pos = 0; auto start_index = pos; while (const auto * p = find_first_symbols_or_null({file.begin() + pos, file.end()}, read_quoted_needles)) { - const size_t character_position = p - file.begin(); - auto next_pos = character_position + 1u; + size_t character_position = p - file.begin(); + size_t next_pos = character_position + 1u; if (*p == quoting_character) { - key = createElement(file, start_index, character_position); + key.append(file.begin() + start_index, file.begin() + character_position); - if (key.empty()) + if (key.isEmpty()) { return {next_pos, State::WAITING_KEY}; } @@ -150,11 +146,11 @@ NextState NoEscapingStateHandler::waitValue(std::string_view file) const return {pos, State::READING_VALUE}; } -NextState NoEscapingStateHandler::readValue(std::string_view file, ValueType & value) const +NextState NoEscapingStateHandler::readValue(std::string_view file, StringWriter & value) const { const auto & [key_value_delimiter, _, pair_delimiters] = extractor_configuration; - value = {}; + value.reset(); size_t pos = 0; auto start_index = pos; @@ -170,7 +166,7 @@ NextState NoEscapingStateHandler::readValue(std::string_view file, ValueType & v else if (std::find(pair_delimiters.begin(), pair_delimiters.end(), *p) != pair_delimiters.end()) { // reached next pair - value = createElement(file, start_index, character_position); + value.append(file.begin() + start_index, file.begin() + character_position); return {next_pos, State::FLUSH_PAIR}; } @@ -180,18 +176,18 @@ NextState NoEscapingStateHandler::readValue(std::string_view file, ValueType & v // TODO: do I really need the below logic? // this allows empty values at the end - value = createElement(file, start_index, file.size()); + value.append(file.begin() + start_index, file.end()); return {file.size(), State::FLUSH_PAIR}; } -NextState NoEscapingStateHandler::readQuotedValue(std::string_view file, ValueType & value) const +NextState NoEscapingStateHandler::readQuotedValue(std::string_view file, StringWriter & value) const { const auto quoting_character = extractor_configuration.quoting_character; size_t pos = 0; auto start_index = pos; - value = {}; + value.reset(); while (const auto * p = find_first_symbols_or_null({file.begin() + pos, file.end()}, read_quoted_needles)) { @@ -200,7 +196,7 @@ NextState NoEscapingStateHandler::readQuotedValue(std::string_view file, ValueTy if (*p == quoting_character) { - value = createElement(file, start_index, character_position); + value.append(file.begin() + start_index, file.begin() + character_position); std::cerr << "NoEscapingStateHandler::readQuoted Going to consume up to: «" << fancyQuote(file.substr(0, next_pos)) << " to " << fancyQuote(file.substr(next_pos)) << std::endl; return {next_pos, State::FLUSH_PAIR}; diff --git a/src/Functions/keyvaluepair/impl/NoEscapingStateHandler.h b/src/Functions/keyvaluepair/impl/NoEscapingStateHandler.h index e547a503530f..fa426939d7e0 100644 --- a/src/Functions/keyvaluepair/impl/NoEscapingStateHandler.h +++ b/src/Functions/keyvaluepair/impl/NoEscapingStateHandler.h @@ -16,18 +16,15 @@ namespace extractKV class NoEscapingStateHandler : public StateHandler { public: - using KeyType = std::string_view; - using ValueType = std::string_view; - explicit NoEscapingStateHandler(Configuration extractor_configuration_); [[nodiscard]] NextState waitKey(std::string_view file) const; - [[nodiscard]] NextState readKey(std::string_view file, KeyType & key) const; - [[nodiscard]] NextState readQuotedKey(std::string_view file, KeyType & key) const; + [[nodiscard]] NextState readKey(std::string_view file, StringWriter & key) const; + [[nodiscard]] NextState readQuotedKey(std::string_view file, StringWriter & key) const; [[nodiscard]] NextState readKeyValueDelimiter(std::string_view file) const; [[nodiscard]] NextState waitValue(std::string_view file) const; - [[nodiscard]] NextState readValue(std::string_view file, ValueType & value) const; - [[nodiscard]] NextState readQuotedValue(std::string_view file, ValueType & value) const; + [[nodiscard]] NextState readValue(std::string_view file, StringWriter & value) const; + [[nodiscard]] NextState readQuotedValue(std::string_view file, StringWriter & value) const; const Configuration extractor_configuration; diff --git a/src/Functions/keyvaluepair/impl/StateHandler.h b/src/Functions/keyvaluepair/impl/StateHandler.h index 362e2d9618ff..6feb1977c736 100644 --- a/src/Functions/keyvaluepair/impl/StateHandler.h +++ b/src/Functions/keyvaluepair/impl/StateHandler.h @@ -1,5 +1,7 @@ #pragma once +#include + #include #include @@ -46,11 +48,75 @@ class StateHandler virtual ~StateHandler() = default; }; + +class StringWriter +{ + ColumnString & col; + ColumnString::Chars & chars; + UInt64 prev_commit_pos = 0; + +public: + StringWriter(ColumnString & col_) + : col(col_), + chars(col.getChars()) + {} + + ~StringWriter() + { + // Make sure that ColumnString invariants are not broken. + if (!isEmpty()) + reset(); + } + + void append(std::string_view new_data) + { + chars.insert(new_data.begin(), new_data.end()); + } + + template + void append(const T * begin, const T * end) + { + chars.insert(begin, end); + } + + void reset() + { + chars.resize_assume_reserved(prev_commit_pos); + } + + bool isEmpty() const + { + return chars.size() == prev_commit_pos; + } + + void commit() + { + if (isEmpty()) + return; + + chars.push_back('\0'); + std::cerr << "Committing on " << &col << " prev_commit_pos: " << prev_commit_pos << " new offset: " << chars.size() << " there will be " << col.getOffsets().size() + 1 << " rows" << std::endl; + col.getOffsets().emplace_back(chars.size()); + prev_commit_pos = chars.size(); + } + + std::string_view uncommittedChunk() const + { + return std::string_view(chars.raw_data() + prev_commit_pos, chars.raw_data() + chars.size()); + } + +// const ColumnString & data() const +// { +// return col; +// } +}; + + } // TODO(vnemkov): Debug stuff, remove before merging -template +template struct CustomQuoted { const char * start_quote = "\""; @@ -59,31 +125,35 @@ struct CustomQuoted const T & value; }; -template -CustomQuoted customQuote(const char * start_quote, const T & value, const char * end_quote = nullptr) +template +CustomQuoted customQuote(const char * start_quote, const T & value, const char * end_quote = nullptr) { assert(start_quote != nullptr); - return CustomQuoted{ + return CustomQuoted{ .start_quote = start_quote, .end_quote = end_quote ? end_quote : start_quote, .value = value }; } -template -CustomQuoted fancyQuote(const T & value) +inline auto fancyQuote(const std::string_view & value) { - return CustomQuoted{ + return CustomQuoted{ .start_quote = "«", .end_quote = "»", .value = value }; } -template -std::ostream & operator<<(std::ostream & ostr, const CustomQuoted & val) +template +std::ostream & operator<<(std::ostream & ostr, const CustomQuoted & val) { + if constexpr (prepend_length) + { + ostr << "(" << val.value.length() << ") "; + } + return ostr << val.start_quote << val.value << val.end_quote; } diff --git a/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp b/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp index 91ec1ca74a05..784a259b3dc7 100644 --- a/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_escaping_key_value_pair_extractor.cpp @@ -30,8 +30,8 @@ TEST(extractKVPair_EscapingKeyValuePairExtractor, EscapeSequences) ASSERT_EQ(keys->size(), pairs_count); ASSERT_EQ(keys->size(), values->size()); - ASSERT_EQ(keys->getDataAt(0), "key1"); - ASSERT_EQ(keys->getDataAt(1), "key2"); + ASSERT_EQ(keys->getDataAt(0).toView(), "key1"); + ASSERT_EQ(keys->getDataAt(1).toView(), "key2"); assert_byte_equality(values->getDataAt(0), {0xFF}); assert_byte_equality(values->getDataAt(1), {0xA, 0x9, 0xD}); diff --git a/src/Functions/keyvaluepair/tests/gtest_extractKeyValuePairs.cpp b/src/Functions/keyvaluepair/tests/gtest_extractKeyValuePairs.cpp index b259cf1a2efb..ab06fb6e90fd 100644 --- a/src/Functions/keyvaluepair/tests/gtest_extractKeyValuePairs.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_extractKeyValuePairs.cpp @@ -94,11 +94,11 @@ TEST_P(extractKVPair_KeyValuePairExtractorTest, Match) { EXPECT_EQ(expected_kv.first, keys->getDataAt(i)) << fancyQuote(expected_kv.first) << "\nvs\n" - << fancyQuote(keys->getDataAt(i)); + << fancyQuote(keys->getDataAt(i).toView()); EXPECT_EQ(expected_kv.second, values->getDataAt(i)) << fancyQuote(expected_kv.second) << "\nvs\n" - << fancyQuote(values->getDataAt(i)); + << fancyQuote(values->getDataAt(i).toView()); ++i; } diff --git a/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp b/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp index c812aac99851..3a93079b92c8 100644 --- a/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_inline_escaping_key_state_handler.cpp @@ -1,5 +1,6 @@ #include -#include +#include + #include #include @@ -25,8 +26,9 @@ template void test_read(const InlineEscapingStateHandler & handler, std::string_view input, std::string_view expected_element, std::size_t expected_pos, State expected_state) { + auto str = ColumnString::create(); NextState next_state; - std::string element; + StringWriter element(*str); if constexpr (quoted) { @@ -39,7 +41,7 @@ void test_read(const InlineEscapingStateHandler & handler, std::string_view inpu ASSERT_EQ(next_state.position_in_string, expected_pos); ASSERT_EQ(next_state.state, expected_state); - ASSERT_EQ(element, expected_element); + ASSERT_EQ(element.uncommittedChunk(), expected_element); } void test_read(const InlineEscapingStateHandler & handler, std::string_view input, std::string_view expected_element, diff --git a/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp b/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp index 75cf33713932..b162982e9bba 100644 --- a/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp +++ b/src/Functions/keyvaluepair/tests/gtest_no_escaping_key_state_handler.cpp @@ -1,6 +1,8 @@ #include #include +#include + #include namespace @@ -24,7 +26,9 @@ void test_read(const auto & handler, std::string_view input, std::string_view ex std::size_t expected_pos, State expected_state) { NextState next_state; - std::string_view element; + + auto col = ColumnString::create(); + StringWriter element(*col); if constexpr (quoted) { @@ -37,7 +41,7 @@ void test_read(const auto & handler, std::string_view input, std::string_view ex ASSERT_EQ(next_state.position_in_string, expected_pos); ASSERT_EQ(next_state.state, expected_state); - ASSERT_EQ(element, expected_element); + ASSERT_EQ(element.uncommittedChunk(), expected_element); } void test_read(const auto & handler, std::string_view input, std::string_view expected_element,