Skip to content

Commit

Permalink
Merge pull request #2 from Enmk/parsey_key_value_function-refactoring
Browse files Browse the repository at this point in the history
Parsey key value function refactoring
  • Loading branch information
arthurpassos authored Mar 30, 2023
2 parents 487296c + 89ebe61 commit c02f883
Show file tree
Hide file tree
Showing 51 changed files with 1,618 additions and 1,572 deletions.
117 changes: 94 additions & 23 deletions base/base/find_symbols.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ template <char ...chars> constexpr bool is_in(char x) { return ((x == chars) ||

static bool is_in(char c, const char * symbols, size_t num_chars)
{
for (auto i = 0u; i < num_chars; i++)
for (size_t i = 0u; i < num_chars; ++i)
{
if (c == symbols[i])
{
Expand All @@ -66,6 +66,43 @@ inline __m128i mm_is_in(__m128i bytes)
__m128i eq = mm_is_in<s1, tail...>(bytes);
return _mm_or_si128(eq0, eq);
}

/// Builds a per-byte match mask for a needle known only at run time.
/// A lane of the result is all-ones when the corresponding byte of `bytes` equals
/// any of the first `num_chars` characters of `symbols`, and all-zeros otherwise.
inline __m128i mm_is_in(__m128i bytes, const char * symbols, size_t num_chars)
{
    __m128i matched = _mm_setzero_si128();

    /// OR together one equality mask per needle character.
    const char * const symbols_end = symbols + num_chars;
    for (const char * symbol = symbols; symbol != symbols_end; ++symbol)
        matched = _mm_or_si128(matched, _mm_cmpeq_epi8(bytes, _mm_set1_epi8(*symbol)));

    return matched;
}

/// Broadcasts each of the first `num_chars` characters of `symbols` into its own
/// 16-lane register, so a search loop can build the registers once and reuse them.
inline std::vector<__m128i> mm_is_in_prepare(const char * symbols, size_t num_chars)
{
    std::vector<__m128i> needles;
    needles.reserve(num_chars);

    const char * const symbols_end = symbols + num_chars;
    for (const char * symbol = symbols; symbol != symbols_end; ++symbol)
        needles.push_back(_mm_set1_epi8(*symbol));

    return needles;
}

/// Builds a per-byte match mask against pre-broadcast needle registers.
/// A lane of the result is all-ones when the corresponding byte of `bytes` equals
/// the byte held by at least one register in `needles`; an empty `needles` yields zero.
inline __m128i mm_is_in_execute(__m128i bytes, const std::vector<__m128i> & needles)
{
    __m128i matched = _mm_setzero_si128();

    for (size_t i = 0, count = needles.size(); i != count; ++i)
        matched = _mm_or_si128(matched, _mm_cmpeq_epi8(bytes, needles[i]));

    return matched;
}
#endif

template <bool positive>
Expand Down Expand Up @@ -112,6 +149,32 @@ inline const char * find_first_symbols_sse2(const char * const begin, const char
return return_mode == ReturnMode::End ? end : nullptr;
}

/// Run-time needle variant of the SSE2 search.
///
/// Scans [begin, end) for the first byte for which
/// `maybe_negate<positive>(byte is one of symbols[0 .. num_chars))` holds.
/// Full 16-byte blocks are checked with SSE2 when it is available; the remaining
/// tail (or the whole range without SSE2) is checked byte by byte.
///
/// Returns a pointer to the found byte. If nothing is found, returns `end` for
/// ReturnMode::End and nullptr otherwise.
template <bool positive, ReturnMode return_mode>
inline const char * find_first_symbols_sse2(const char * const begin, const char * const end, const char * symbols, size_t num_chars)
{
    const char * pos = begin;

#if defined(__SSE2__)
    /// The needle registers are only used by the vectorised loop, and the helper that
    /// builds them lives in a preprocessor-guarded SIMD section (presumably the SSE2 one),
    /// so the call must stay inside this guard to keep non-SSE2 builds compiling.
    const auto needles = mm_is_in_prepare(symbols, num_chars);

    for (; pos + 15 < end; pos += 16)
    {
        __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));

        __m128i eq = mm_is_in_execute(bytes, needles);

        /// One bit per byte of the block; the lowest set bit is the first hit.
        uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq)));
        if (bit_mask)
            return pos + __builtin_ctz(bit_mask);
    }
#endif

    /// Scalar tail (fewer than 16 bytes left) and non-SSE2 fallback.
    for (; pos < end; ++pos)
        if (maybe_negate<positive>(is_in(*pos, symbols, num_chars)))
            return pos;

    return return_mode == ReturnMode::End ? end : nullptr;
}


template <bool positive, ReturnMode return_mode, char... symbols>
inline const char * find_last_symbols_sse2(const char * const begin, const char * const end)
Expand Down Expand Up @@ -192,21 +255,6 @@ inline const char * find_first_symbols_sse42(const char * const begin, const cha
return return_mode == ReturnMode::End ? end : nullptr;
}


/// NOTE: There is no SSE 4.2 implementation for find_last_symbols_or_null; it is not worth doing.

/// Compile-time needle dispatcher (at most 16 symbols).
/// On SSE4.2 builds, needles of five or more symbols go to the SSE4.2 implementation;
/// everything else goes to the SSE2 implementation.
template <bool positive, ReturnMode return_mode, char... symbols>
inline const char * find_first_symbols_dispatch(const char * begin, const char * end)
    requires(0 <= sizeof...(symbols) && sizeof...(symbols) <= 16)
{
#if defined(__SSE4_2__)
    constexpr bool prefer_sse42 = sizeof...(symbols) >= 5;
    if (prefer_sse42)
        return find_first_symbols_sse42<positive, return_mode, sizeof...(symbols), symbols...>(begin, end);
#endif

    return find_first_symbols_sse2<positive, return_mode, symbols...>(begin, end);
}

template <bool positive, ReturnMode return_mode>
inline const char * find_first_symbols_sse42(const char * const begin, const char * const end, const char * symbols, size_t num_chars)
{
Expand All @@ -215,7 +263,10 @@ inline const char * find_first_symbols_sse42(const char * const begin, const cha
#if defined(__SSE4_2__)
constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT;

const __m128i set = _mm_loadu_si128(reinterpret_cast<const __m128i *>(symbols));
// This is to avoid read past end of `symbols` if `num_chars < 16`.
char buffer[16] = {'\0'};
memcpy(buffer, symbols, num_chars);
const __m128i set = _mm_loadu_si128(reinterpret_cast<const __m128i *>(buffer));

for (; pos + 15 < end; pos += 16)
{
Expand All @@ -241,10 +292,30 @@ inline const char * find_first_symbols_sse42(const char * const begin, const cha
return return_mode == ReturnMode::End ? end : nullptr;
}

/// NOTE: There is no SSE 4.2 implementation for find_last_symbols_or_null; it is not worth doing.

/// Compile-time needle dispatcher (at most 16 symbols).
/// On SSE4.2 builds, needles of five or more symbols go to the SSE4.2 implementation;
/// everything else goes to the SSE2 implementation.
template <bool positive, ReturnMode return_mode, char... symbols>
inline const char * find_first_symbols_dispatch(const char * begin, const char * end)
    requires(0 <= sizeof...(symbols) && sizeof...(symbols) <= 16)
{
#if defined(__SSE4_2__)
    constexpr bool prefer_sse42 = sizeof...(symbols) >= 5;
    if (prefer_sse42)
        return find_first_symbols_sse42<positive, return_mode, sizeof...(symbols), symbols...>(begin, end);
#endif

    return find_first_symbols_sse2<positive, return_mode, symbols...>(begin, end);
}

/// Run-time needle dispatcher taking the haystack and the needle as string views.
/// NOTE(review): this view interleaves the pre-change lines (the `find_first_symbols_sse42`
/// signature and the first `return`) with their replacements; confirm which set is current.
template <bool positive, ReturnMode return_mode>
auto find_first_symbols_sse42(std::string_view haystack, std::string_view symbols)
inline const char * find_first_symbols_dispatch(const std::string_view haystack, const std::string_view symbols)
{
return find_first_symbols_sse42<positive, return_mode>(haystack.begin(), haystack.end(), symbols.begin(), symbols.size());
/// Only the first 16 characters of `symbols` are passed on; any further characters are ignored.
const size_t num_chars = std::min<size_t>(symbols.size(), 16);
#if defined(__SSE4_2__)
/// Needles of five or more characters use the SSE4.2 implementation, shorter ones the SSE2 one.
if (num_chars >= 5)
return find_first_symbols_sse42<positive, return_mode>(haystack.begin(), haystack.end(), symbols.begin(), num_chars);
else
#endif
return find_first_symbols_sse2<positive, return_mode>(haystack.begin(), haystack.end(), symbols.begin(), num_chars);
}

}
Expand All @@ -266,7 +337,7 @@ inline char * find_first_symbols(char * begin, char * end)

/// Run-time needle overload: forwards `haystack` and `symbols` to the detail implementation
/// with positive = true and ReturnMode::End (no match yields the end of the haystack).
/// NOTE(review): this view shows the pre-change `find_first_symbols_sse42` call followed by
/// its `find_first_symbols_dispatch` replacement.
inline const char * find_first_symbols(std::string_view haystack, std::string_view symbols)
{
return detail::find_first_symbols_sse42<true, detail::ReturnMode::End>(haystack, symbols);
return detail::find_first_symbols_dispatch<true, detail::ReturnMode::End>(haystack, symbols);
}

template <char... symbols>
Expand All @@ -283,7 +354,7 @@ inline char * find_first_not_symbols(char * begin, char * end)

/// Run-time needle overload: forwards `haystack` and `symbols` to the detail implementation
/// with positive = false and ReturnMode::End (no hit yields the end of the haystack).
/// Presumably finds the first character that is NOT in `symbols`; confirm against `maybe_negate`.
/// NOTE(review): this view shows the pre-change `find_first_symbols_sse42` call followed by
/// its `find_first_symbols_dispatch` replacement.
inline const char * find_first_not_symbols(std::string_view haystack, std::string_view symbols)
{
return detail::find_first_symbols_sse42<false, detail::ReturnMode::End>(haystack, symbols);
return detail::find_first_symbols_dispatch<false, detail::ReturnMode::End>(haystack, symbols);
}

template <char... symbols>
Expand All @@ -300,7 +371,7 @@ inline char * find_first_symbols_or_null(char * begin, char * end)

/// Run-time needle overload: forwards `haystack` and `symbols` to the detail implementation
/// with positive = true and ReturnMode::Nullptr (no match yields nullptr).
/// NOTE(review): this view shows the pre-change `find_first_symbols_sse42` call followed by
/// its `find_first_symbols_dispatch` replacement.
inline const char * find_first_symbols_or_null(std::string_view haystack, std::string_view symbols)
{
return detail::find_first_symbols_sse42<true, detail::ReturnMode::Nullptr>(haystack, symbols);
return detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr>(haystack, symbols);
}

template <char... symbols>
Expand All @@ -317,7 +388,7 @@ inline char * find_first_not_symbols_or_null(char * begin, char * end)

/// Run-time needle overload: forwards `haystack` and `symbols` to the detail implementation
/// with positive = false and ReturnMode::Nullptr (no hit yields nullptr).
/// Presumably finds the first character that is NOT in `symbols`; confirm against `maybe_negate`.
/// NOTE(review): this view shows the pre-change `find_first_symbols_sse42` call followed by
/// its `find_first_symbols_dispatch` replacement.
inline const char * find_first_not_symbols_or_null(std::string_view haystack, std::string_view symbols)
{
return detail::find_first_symbols_sse42<false, detail::ReturnMode::Nullptr>(haystack, symbols);
return detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr>(haystack, symbols);
}

template <char... symbols>
Expand Down
140 changes: 139 additions & 1 deletion src/Common/tests/gtest_find_symbols.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ void test_find_first_not(const std::string & haystack, const std::string & symbo

TEST(FindSymbols, SimpleTest)
{
std::string s = "Hello, world! Goodbye...";
const std::string s = "Hello, world! Goodbye...";
const char * begin = s.data();
const char * end = s.data() + s.size();

Expand All @@ -34,6 +34,9 @@ TEST(FindSymbols, SimpleTest)
ASSERT_EQ(find_first_symbols<'H'>(begin, end), begin);
ASSERT_EQ((find_first_symbols<'a', 'e'>(begin, end)), begin + 1);

ASSERT_EQ((find_first_symbols<'a', 'e', 'w', 'x', 'z'>(begin, end)), begin + 1);
ASSERT_EQ((find_first_symbols<'p', 'q', 's', 'x', 'z'>(begin, end)), end);

ASSERT_EQ(find_last_symbols_or_null<'a'>(begin, end), nullptr);
ASSERT_EQ(find_last_symbols_or_null<'e'>(begin, end), end - 4);
ASSERT_EQ(find_last_symbols_or_null<'.'>(begin, end), end - 1);
Expand All @@ -54,6 +57,141 @@ TEST(FindSymbols, SimpleTest)
}
}

/// Test-local copy of the run-time needle SSE4.2 search.
///
/// Scans [begin, end) for the first byte that is (positive) or is not (!positive) one of
/// `symbols[0 .. num_chars)`. Full 16-byte blocks are handled with PCMPESTRI when SSE4.2 is
/// available; the tail is handled byte by byte.
///
/// Returns a pointer to the found byte. If nothing is found, returns `end` for
/// ReturnMode::End and nullptr otherwise.
template <bool positive, detail::ReturnMode return_mode>
inline const char * find_first_symbols_sse42_MY(const char * const begin, const char * const end, const char * symbols, size_t num_chars)
{
    using namespace detail;
    const char * pos = begin;

#if defined(__SSE4_2__)
    constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT;

    /// A single register holds at most 16 symbols. Clamp the length so the copy below can
    /// never overrun `buffer`; PCMPESTRI saturates explicit lengths at 16 anyway, so the
    /// vectorised comparison is unchanged for longer needles.
    const int set_size = static_cast<int>(num_chars < 16 ? num_chars : 16);

    /// Copy through a zeroed buffer to avoid reading past the end of `symbols` when it is
    /// shorter than 16 bytes. `set` is defined unconditionally: the loop below needs it on
    /// every SSE4.2 build, including those that also enable AVX.
    char buffer[16] = {'\0'};
    memcpy(buffer, symbols, static_cast<size_t>(set_size));
    const __m128i set = _mm_loadu_si128(reinterpret_cast<const __m128i *>(buffer));

    for (; pos + 15 < end; pos += 16)
    {
        __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));

        if constexpr (positive)
        {
            if (_mm_cmpestrc(set, set_size, bytes, 16, mode))
                return pos + _mm_cmpestri(set, set_size, bytes, 16, mode);
        }
        else
        {
            if (_mm_cmpestrc(set, set_size, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY))
                return pos + _mm_cmpestri(set, set_size, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY);
        }
    }
#endif

    /// Scalar tail (fewer than 16 bytes left) and non-SSE4.2 fallback.
    for (; pos < end; ++pos)
        if (maybe_negate<positive>(is_in(*pos, symbols, num_chars)))
            return pos;

    return return_mode == ReturnMode::End ? end : nullptr;
}

/// Test-local shortcut: calls the compile-time needle dispatcher in "positive" mode,
/// with ReturnMode::End as the not-found policy.
template <char... symbols>
inline const char * find_first_symbols_MY(const char * begin, const char * end)
{
    constexpr bool positive = true;
    constexpr auto return_mode = detail::ReturnMode::End;
    return detail::find_first_symbols_dispatch<positive, return_mode, symbols...>(begin, end);
}

// Checks the run-time needle overload of find_first_symbols against std::find_first_of
// for short/long haystacks and short/long needles.
TEST(FindSymbols, RunTimeNeedle)
{
auto test_haystack = [](const auto & haystack, const auto & unfindable_needle) {
// Compares find_first_symbols with std::find_first_of for one haystack/needle pair.
#define TEST_HAYSTACK_AND_NEEDLE(haystack_, needle_) \
do { \
const auto & h = haystack_; \
const auto & n = needle_; \
EXPECT_EQ( \
std::find_first_of(h.data(), h.data() + h.size(), n.data(), n.data() + n.size()), \
find_first_symbols(h, n) \
) << "haystack: \"" << h << "\" (" << static_cast<const void*>(h.data()) << ")" \
<< ", needle: \"" << n << "\""; \
} \
while (false)

// The needle cannot be found in the haystack at all.
TEST_HAYSTACK_AND_NEEDLE(haystack, unfindable_needle);

// Copies `in_needle` into a local `needle`, applies `needle_update_statement` to it,
// and then runs the comparison with the modified needle.
#define TEST_WITH_MODIFIED_NEEDLE(haystack, in_needle, needle_update_statement) \
do \
{ \
std::string needle = (in_needle); \
(needle_update_statement); \
TEST_HAYSTACK_AND_NEEDLE(haystack, needle); \
} \
while (false)

// The findable symbol is at the beginning of the needle.
// Can find at first pos of haystack
TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.front() = haystack.front());
// Can find at last pos of haystack
TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.front() = haystack.back());
// Can find in the middle of haystack
TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.front() = haystack[haystack.size() / 2]);

// The findable symbol is at the end of the needle.
// Can find at first pos of haystack
TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.back() = haystack.front());
// Can find at last pos of haystack
TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.back() = haystack.back());
// Can find in the middle of haystack
TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.back() = haystack[haystack.size() / 2]);

// The findable symbol is in the middle of the needle.
// Can find at first pos of haystack
TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle[needle.size() / 2] = haystack.front());
// Can find at last pos of haystack
TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle[needle.size() / 2] = haystack.back());
// Can find in the middle of haystack
TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle[needle.size() / 2] = haystack[haystack.size() / 2]);

#undef TEST_WITH_MODIFIED_NEEDLE
#undef TEST_HAYSTACK_AND_NEEDLE
};

// There are 4 major groups of cases:
// haystack < 16 bytes, haystack > 16 bytes
// needle < 5 bytes, needle >= 5 bytes

// First and last symbols of haystack should be unique
const std::string long_haystack = "Hello, world! Goodbye...?";
const std::string short_haystack = "Hello, world!";

// In sync with the find_first_symbols_dispatch code: long needles receive special treatment.
// As of now, "long" means >= 5.
const std::string unfindable_long_needle = "0123456789ABCDEF";
const std::string unfindable_short_needle = "0123";

{
SCOPED_TRACE("Long haystack");
test_haystack(long_haystack, unfindable_long_needle);
test_haystack(long_haystack, unfindable_short_needle);
}

{
SCOPED_TRACE("Short haystack");
test_haystack(short_haystack, unfindable_long_needle);
test_haystack(short_haystack, unfindable_short_needle);
}

// Check that nothing matches on the long haystack.
EXPECT_EQ(find_first_symbols(long_haystack, "ABCDEFIJKLMNOPQRSTUVWXYZacfghijkmnpqstuvxz"), long_haystack.data() + long_haystack.size());

// Only the first 16 bytes of the needle are checked, so nothing is found even though
// later needle characters do occur in the haystack.
EXPECT_EQ(find_first_symbols(long_haystack, "ABCDEFIJKLMNOPQR0helloworld"), long_haystack.data() + long_haystack.size());
}

TEST(FindNotSymbols, AllSymbolsPresent)
{
std::string str_with_17_bytes = "hello world hello";
Expand Down
3 changes: 1 addition & 2 deletions src/Functions/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,7 @@ add_subdirectory(JSONPath)
list (APPEND PRIVATE_LIBS clickhouse_functions_jsonpath)

add_subdirectory(keyvaluepair)
list (APPEND OBJECT_LIBS $<TARGET_OBJECTS:clickhouse_functions_extractkeyvaluepairs_core>)
list (APPEND OBJECT_LIBS $<TARGET_OBJECTS:clickhouse_functions_extractkeyvaluepairs_api>)
list (APPEND OBJECT_LIBS $<TARGET_OBJECTS:clickhouse_functions_extractkeyvaluepairs>)

# Signed integer overflow on user-provided data inside boost::geometry - ignore.
set_source_files_properties("pointInPolygon.cpp" PROPERTIES COMPILE_FLAGS -fno-sanitize=signed-integer-overflow)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#include "ArgumentExtractor.h"
#include <Functions/keyvaluepair/ArgumentExtractor.h>

namespace DB
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#pragma once

#include <optional>

#include <Columns/IColumn.h>
#include <Core/ColumnsWithTypeAndName.h>

#include <optional>

namespace DB
{

Expand Down
9 changes: 7 additions & 2 deletions src/Functions/keyvaluepair/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,7 @@
add_subdirectory(src)
add_subdirectory(api)
include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake")
add_headers_and_sources(clickhouse_functions_extractkeyvaluepairs .)
add_headers_and_sources(clickhouse_functions_extractkeyvaluepairs impl)

add_library(clickhouse_functions_extractkeyvaluepairs ${clickhouse_functions_extractkeyvaluepairs_sources} ${clickhouse_functions_extractkeyvaluepairs_headers})

target_link_libraries(clickhouse_functions_extractkeyvaluepairs PRIVATE dbms)
6 changes: 0 additions & 6 deletions src/Functions/keyvaluepair/api/CMakeLists.txt

This file was deleted.

Loading

0 comments on commit c02f883

Please sign in to comment.