diff --git a/CHANGELOG.md b/CHANGELOG.md index 1da17b96ce6..b378c5dab46 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ * Fixed `realm_query_parse_for_list` ignoring existing query ([#5850](https://github.com/realm/realm-core/pull/5850)). * Fixed not allowing asymmetric tables in partition based sync ([#5691](https://github.com/realm/realm-core/issues/5691)). * Disable auto refresh for old realm instance passed to migration callbacks. ([#5856](https://github.com/realm/realm-core/pull/5856)). +* If a case-insensitive query searched for a string including a 4-byte UTF-8 character, the program would crash ([#5825](https://github.com/realm/realm-core/issues/5825), since v2.3.0) ### Breaking changes * None. diff --git a/Package.swift b/Package.swift index 1f11542b1ae..1aef10edadf 100644 --- a/Package.swift +++ b/Package.swift @@ -120,6 +120,7 @@ let notSyncServerSources: [String] = [ "realm/table_cluster_tree.cpp", "realm/table_ref.cpp", "realm/table_view.cpp", + "realm/transaction.cpp", "realm/unicode.cpp", "realm/util", "realm/utilities.cpp", diff --git a/src/realm/exceptions.hpp b/src/realm/exceptions.hpp index 549023aec65..a560cb405a8 100644 --- a/src/realm/exceptions.hpp +++ b/src/realm/exceptions.hpp @@ -176,6 +176,25 @@ class NoSubscriptionForWrite : public std::runtime_error { NoSubscriptionForWrite(const std::string& msg); }; +namespace query_parser { + +/// Exception thrown when parsing fails due to invalid syntax. +struct SyntaxError : std::runtime_error { + using std::runtime_error::runtime_error; +}; + +/// Exception thrown when binding a syntactically valid query string in a +/// context where it does not make sense. 
+struct InvalidQueryError : std::runtime_error { + using std::runtime_error::runtime_error; +}; + +/// Exception thrown when there is a problem accessing the arguments in a query string +struct InvalidQueryArgError : std::invalid_argument { + using std::invalid_argument::invalid_argument; +}; + +} // namespace query_parser /// The \c LogicError exception class is intended to be thrown only when /// applications (or bindings) violate rules that are stated (or ought to have diff --git a/src/realm/parser/query_parser.hpp b/src/realm/parser/query_parser.hpp index 19999887a8c..f4338a66b62 100644 --- a/src/realm/parser/query_parser.hpp +++ b/src/realm/parser/query_parser.hpp @@ -31,22 +31,6 @@ namespace realm::query_parser { -/// Exception thrown when parsing fails due to invalid syntax. -struct SyntaxError : std::runtime_error { - using std::runtime_error::runtime_error; -}; - -/// Exception thrown when binding a syntactically valid query string in a -/// context where it does not make sense. 
-struct InvalidQueryError : std::runtime_error { - using std::runtime_error::runtime_error; -}; - -/// Exception thrown when there is a problem accessing the arguments in a query string -struct InvalidQueryArgError : std::invalid_argument { - using std::invalid_argument::invalid_argument; -}; - struct AnyContext { template T unbox(const std::any& wrapper) diff --git a/src/realm/query.cpp b/src/realm/query.cpp index dd88c46eb97..150a5ca77dd 100644 --- a/src/realm/query.cpp +++ b/src/realm/query.cpp @@ -1827,20 +1827,6 @@ void* Query::query_thread(void* arg) #endif // REALM_MULTITHREADQUERY -std::string Query::validate() const -{ - if (!m_groups.size()) - return ""; - - if (error_code != "") // errors detected by QueryInterface - return error_code; - - if (!root_node()) - return "Syntax error"; - - return root_node()->validate(); // errors detected by QueryEngine -} - std::string Query::get_description(util::serializer::SerialisationState& state) const { std::string description; diff --git a/src/realm/query.hpp b/src/realm/query.hpp index 85b8d9368ee..fc45606f8ce 100644 --- a/src/realm/query.hpp +++ b/src/realm/query.hpp @@ -320,8 +320,6 @@ class Query final { // or empty vector if the query is not associated with a table. 
TableVersions sync_view_if_needed() const; - std::string validate() const; - std::string get_description(const std::string& class_prefix = "") const; std::string get_description(util::serializer::SerialisationState& state) const; diff --git a/src/realm/query_engine.hpp b/src/realm/query_engine.hpp index 9a854e82367..ab49c6ab70c 100644 --- a/src/realm/query_engine.hpp +++ b/src/realm/query_engine.hpp @@ -225,16 +225,6 @@ class ParentNode { ArrayPayload* source_column); - virtual std::string validate() - { - if (error_code != "") - return error_code; - if (m_child == nullptr) - return ""; - else - return m_child->validate(); - } - ParentNode(const ParentNode& from); void add_child(std::unique_ptr child) @@ -320,7 +310,6 @@ class ParentNode { ConstTableRef m_table = ConstTableRef(); const Cluster* m_cluster = nullptr; QueryStateBase* m_state = nullptr; - std::string error_code; static std::vector s_dummy_keys; ColumnType get_real_column_type(ColKey key) @@ -1582,7 +1571,7 @@ class StringNode : public StringNodeBase { auto upper = case_map(v, true); auto lower = case_map(v, false); if (!upper || !lower) { - error_code = "Malformed UTF-8: " + std::string(v); + throw query_parser::InvalidQueryError(util::format("Malformed UTF-8: %1", v)); } else { m_ucase = std::move(*upper); @@ -1707,7 +1696,7 @@ class StringNode : public StringNodeBase { auto upper = case_map(v, true); auto lower = case_map(v, false); if (!upper || !lower) { - error_code = "Malformed UTF-8: " + std::string(v); + throw query_parser::InvalidQueryError(util::format("Malformed UTF-8: %1", v)); } else { m_ucase = std::move(*upper); @@ -1921,7 +1910,7 @@ class StringNode : public StringNodeEqualBase { auto upper = case_map(v, true); auto lower = case_map(v, false); if (!upper || !lower) { - error_code = "Malformed UTF-8: " + std::string(v); + throw query_parser::InvalidQueryError(util::format("Malformed UTF-8: %1", v)); } else { m_ucase = std::move(*upper); @@ -2106,27 +2095,6 @@ class OrNode : public ParentNode { return 
index; } - std::string validate() override - { - if (error_code != "") - return error_code; - if (m_conditions.size() == 0) - return "Missing left-hand side of OR"; - if (m_conditions.size() == 1) - return "Missing right-hand side of OR"; - std::string s; - if (m_child != 0) - s = m_child->validate(); - if (s != "") - return s; - for (size_t i = 0; i < m_conditions.size(); ++i) { - s = m_conditions[i]->validate(); - if (s != "") - return s; - } - return ""; - } - std::unique_ptr clone() const override { return std::unique_ptr(new OrNode(*this)); @@ -2166,6 +2134,9 @@ class NotNode : public ParentNode { : m_condition(std::move(condition)) { m_dT = 50.0; + if (!m_condition) { + throw query_parser::InvalidQueryError("Missing argument to Not"); + } } void table_changed() override @@ -2194,23 +2165,6 @@ class NotNode : public ParentNode { size_t find_first_local(size_t start, size_t end) override; - std::string validate() override - { - if (error_code != "") - return error_code; - if (m_condition == 0) - return "Missing argument to Not"; - std::string s; - if (m_child != 0) - s = m_child->validate(); - if (s != "") - return s; - s = m_condition->validate(); - if (s != "") - return s; - return ""; - } - std::string describe(util::serializer::SerialisationState& state) const override { if (m_condition) { diff --git a/src/realm/unicode.cpp b/src/realm/unicode.cpp index cac213f821d..d7553619dca 100644 --- a/src/realm/unicode.cpp +++ b/src/realm/unicode.cpp @@ -274,79 +274,6 @@ bool utf8_compare(StringData string1, StringData string2) return false; } -// Here is a version for Windows that may be closer to what is ultimately needed. 
-/* -bool case_map(const char* begin, const char* end, StringBuffer& dest, bool upper) -{ -const int wide_buffer_size = 32; -wchar_t wide_buffer[wide_buffer_size]; - -dest.resize(end-begin); -size_t dest_offset = 0; - -for (;;) { -int num_out; - -// Decode -{ -size_t num_in = end - begin; -if (size_t(32) <= num_in) { -num_out = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, begin, 32, wide_buffer, wide_buffer_size); -if (num_out != 0) { -begin += 32; -goto convert; -} -if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) return false; -} -if (num_in == 0) break; -int n = num_in < size_t(8) ? int(num_in) : 8; -num_out = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, begin, n, wide_buffer, wide_buffer_size); -if (num_out != 0) { -begin += n; -goto convert; -} -return false; -} - -convert: -if (upper) { -for (int i=0; i::max(), free)) free = std::numeric_limits::max(); -int n = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, wide_buffer, num_out, -dest.data() + dest_offset, int(free), 0, 0); -if (i != 0) { -dest_offset += n; -continue; -} -if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) return false; -size_t dest_size = dest.size(); -if (int_multiply_with_overflow_detect(dest_size, 2)) { -if (dest_size == std::numeric_limits::max()) return false; -dest_size = std::numeric_limits::max(); -} -dest.resize(dest_size); -goto encode; -} -} - -dest.resize(dest_offset); -return true; -} -*/ - - // Converts UTF-8 source into upper or lower case. 
This function // preserves the byte length of each UTF-8 character in following way: // If an output character differs in size, it is simply substituded by @@ -358,29 +285,42 @@ util::Optional case_map(StringData source, bool upper) result.resize(source.size()); #if defined(_WIN32) + constexpr int tmp_buffer_size = 32; const char* begin = source.data(); const char* end = begin + source.size(); auto output = result.begin(); while (begin != end) { - int n = static_cast(sequence_length(*begin)); - if (n == 0 || end - begin < n) - return util::none; + auto n = end - begin; + if (n > tmp_buffer_size) { + // Break the input string into chunks - but don't break in the middle of a multibyte character + const char* p = begin; + const char* buffer_end = begin + tmp_buffer_size; + while (p < buffer_end) { + size_t len = sequence_length(*p); + if (len == 0) + return util::none; // a zero length would never advance p, so treat it as malformed input + if (len > size_t(buffer_end - p)) + break; // next character would straddle the chunk boundary; leave it for the next chunk + p += len; + } + n = p - begin; + } - wchar_t tmp[2]; // FIXME: Why no room for UTF-16 surrogate + wchar_t tmp[tmp_buffer_size]; - int n2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, begin, n, tmp, 1); + int n2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, begin, int(n), tmp, tmp_buffer_size); if (n2 == 0) return util::none; - REALM_ASSERT(n2 == 1); - tmp[n2] = 0; + if (n2 < tmp_buffer_size) + tmp[n2] = 0; // Note: If tmp[0] == 0, it is because the string contains a // null-chacarcter, which is perfectly fine. - wchar_t mapped_tmp[2]; - LCMapStringEx(LOCALE_NAME_INVARIANT, upper ? LCMAP_UPPERCASE : LCMAP_LOWERCASE, tmp, 1, mapped_tmp, 2, - nullptr, nullptr, 0); + wchar_t mapped_tmp[tmp_buffer_size]; + LCMapStringEx(LOCALE_NAME_INVARIANT, upper ? LCMAP_UPPERCASE : LCMAP_LOWERCASE, tmp, n2, mapped_tmp, + tmp_buffer_size, nullptr, nullptr, 0); // FIXME: The intention is to use flag 'WC_ERR_INVALID_CHARS' // to catch invalid UTF-8. 
Even though the documentation says @@ -388,7 +328,8 @@ util::Optional case_map(StringData source, bool upper) // the flag is specified, the function fails with error // ERROR_INVALID_FLAGS. DWORD flags = 0; - int n3 = WideCharToMultiByte(CP_UTF8, flags, mapped_tmp, 1, &*output, static_cast(end - begin), 0, 0); + auto m = static_cast(end - begin); + int n3 = WideCharToMultiByte(CP_UTF8, flags, mapped_tmp, n2, &*output, m, 0, 0); if (n3 == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER) return util::none; @@ -402,32 +343,75 @@ util::Optional case_map(StringData source, bool upper) return result; #else - // FIXME: Implement this! Note that this is trivial in C++11 due - // to its built-in support for UTF-8. In C++03 it is trivial when - // __STDC_ISO_10646__ is defined. Also consider using ICU. Maybe - // GNU has something to offer too. - - // For now we handle just the ASCII subset + size_t sz = source.size(); typedef std::char_traits traits; - if (upper) { - size_t n = source.size(); - for (size_t i = 0; i < n; ++i) { - char c = source[i]; - if (traits::lt(0x60, c) && traits::lt(c, 0x7B)) - c = traits::to_char_type(traits::to_int_type(c) - 0x20); - result[i] = c; + for (size_t i = 0; i < sz; ++i) { + char c = source[i]; + auto int_val = traits::to_int_type(c); + + auto copy_bytes = [&](size_t n) { + if (i + n > sz) { + return false; + } + for (size_t j = 1; j < n; j++) { + result[i++] = c; + c = source[i]; + if ((c & 0xC0) != 0x80) { + return false; + } + } + return true; + }; + + if (int_val < 0x80) { + // Handle ASCII + if (upper && (c >= 'a' && c <= 'z')) { + c -= 0x20; + } + else if (!upper && (c >= 'A' && c <= 'Z')) { + c += 0x20; + } } - } - else { // lower - size_t n = source.size(); - for (size_t i = 0; i < n; ++i) { - char c = source[i]; - if (traits::lt(0x40, c) && traits::lt(c, 0x5B)) - c = traits::to_char_type(traits::to_int_type(c) + 0x20); - result[i] = c; + else { + if ((int_val & 0xE0) == 0xc0) { + // 2 byte utf-8 + if (i + 2 > sz) { + return {}; + 
} + c = source[i + 1]; + if ((c & 0xC0) != 0x80) { + return {}; + } + auto u = ((int_val << 6) + (traits::to_int_type(c) & 0x3F)) & 0x7FF; + // Handle some Latin-1 supplement characters + if (upper && (u >= 0xE0 && u <= 0xFE && u != 0xF7)) { + u -= 0x20; + } + else if (!upper && (u >= 0xC0 && u <= 0xDE && u != 0xD7)) { + u += 0x20; + } + + result[i++] = static_cast((u >> 6) | 0xC0); + c = static_cast((u & 0x3f) | 0x80); + } + else if ((int_val & 0xF0) == 0xE0) { + // 3 byte utf-8 + if (!copy_bytes(3)) { + return {}; + } + } + else if ((int_val & 0xF8) == 0xF0) { + // 4 byte utf-8 + if (!copy_bytes(4)) { + return {}; + } + } + else { + return {}; + } } + result[i] = c; } - return result; #endif } diff --git a/test/test_index_string.cpp b/test/test_index_string.cpp index db03d519f4a..5f5dee20914 100644 --- a/test/test_index_string.cpp +++ b/test/test_index_string.cpp @@ -1826,4 +1826,26 @@ TEST(StringIndex_MixedEqualBitPattern) CHECK_EQUAL(tv.get_object(1).get_any(col), val1); } +TEST(Unicode_Casemap) +{ + std::string inp = "A very old house 🏠 is on 🔥, we have to save the 🦄"; + auto out = case_map(inp, true); + if (CHECK(out)) { + CHECK_EQUAL(*out, "A VERY OLD HOUSE 🏠 IS ON 🔥, WE HAVE TO SAVE THE 🦄"); + } + + StringData trailing_garbage(inp.data(), 19); // String terminated inside icon + out = case_map(trailing_garbage, true); + CHECK_NOT(out); + + inp = "rødgrød med fløde"; + out = case_map(inp, true); + if (CHECK(out)) { + CHECK_EQUAL(*out, "RØDGRØD MED FLØDE"); + } + out = case_map(out, false); + if (CHECK(out)) { + CHECK_EQUAL(*out, inp); + } +} #endif // TEST_INDEX_STRING diff --git a/test/test_parser.cpp b/test/test_parser.cpp index 0c149a8b922..2df0660b4f6 100644 --- a/test/test_parser.cpp +++ b/test/test_parser.cpp @@ -824,7 +824,7 @@ TEST(Parser_StringOperations) TableRef t = g.add_table("person"); ColKey name_col = t->add_column(type_String, "name", true); ColKey link_col = t->add_column(*t, "father"); - std::vector names = {"Billy", "Bob", "Joe", "Jake", 
"Joel"}; + std::vector names = {"Billy", "Bob", "Joe", "Jake", "Joel", "Unicorn🦄"}; std::vector people_keys; t->create_objects(names.size(), people_keys); for (size_t i = 0; i < t->size(); ++i) { @@ -834,16 +834,17 @@ TEST(Parser_StringOperations) } t->create_object(); // null t->get_object(people_keys[4]).set_null(link_col); + size_t nb_names = names.size(); verify_query(test_context, t, "name == 'Bob'", 1); verify_query(test_context, t, "father.name == 'Bob'", 1); verify_query(test_context, t, "name ==[c] 'Bob'", 1); verify_query(test_context, t, "father.name ==[c] 'Bob'", 1); - verify_query(test_context, t, "name != 'Bob'", 5); - verify_query(test_context, t, "father.name != 'Bob'", 5); - verify_query(test_context, t, "name !=[c] 'bOB'", 5); - verify_query(test_context, t, "father.name !=[c] 'bOB'", 5); + verify_query(test_context, t, "name != 'Bob'", nb_names); + verify_query(test_context, t, "father.name != 'Bob'", nb_names); + verify_query(test_context, t, "name !=[c] 'bOB'", nb_names); + verify_query(test_context, t, "father.name !=[c] 'bOB'", nb_names); verify_query(test_context, t, "name contains \"oe\"", 2); verify_query(test_context, t, "father.name contains \"oe\"", 2); @@ -865,23 +866,25 @@ TEST(Parser_StringOperations) verify_query(test_context, t, "name like[c] \"?O?\"", 2); verify_query(test_context, t, "father.name like[c] \"?O?\"", 2); + verify_query(test_context, t, "name ==[c] 'unicorn🦄'", 1); + verify_query(test_context, t, "name == NULL", 1); verify_query(test_context, t, "name == nil", 1); verify_query(test_context, t, "NULL == name", 1); - verify_query(test_context, t, "name != NULL", 5); - verify_query(test_context, t, "NULL != name", 5); + verify_query(test_context, t, "name != NULL", nb_names); + verify_query(test_context, t, "NULL != name", nb_names); verify_query(test_context, t, "name ==[c] NULL", 1); verify_query(test_context, t, "NULL ==[c] name", 1); - verify_query(test_context, t, "name !=[c] NULL", 5); - verify_query(test_context, 
t, "NULL !=[c] name", 5); + verify_query(test_context, t, "name !=[c] NULL", nb_names); + verify_query(test_context, t, "NULL !=[c] name", nb_names); // for strings 'NULL' is also a synonym for the null string - verify_query(test_context, t, "name CONTAINS NULL", 6); - verify_query(test_context, t, "name CONTAINS[c] NULL", 6); - verify_query(test_context, t, "name BEGINSWITH NULL", 6); - verify_query(test_context, t, "name BEGINSWITH[c] NULL", 6); - verify_query(test_context, t, "name ENDSWITH NULL", 6); - verify_query(test_context, t, "name ENDSWITH[c] NULL", 6); + verify_query(test_context, t, "name CONTAINS NULL", t->size()); + verify_query(test_context, t, "name CONTAINS[c] NULL", t->size()); + verify_query(test_context, t, "name BEGINSWITH NULL", t->size()); + verify_query(test_context, t, "name BEGINSWITH[c] NULL", t->size()); + verify_query(test_context, t, "name ENDSWITH NULL", t->size()); + verify_query(test_context, t, "name ENDSWITH[c] NULL", t->size()); verify_query(test_context, t, "name LIKE NULL", 1); verify_query(test_context, t, "name LIKE[c] NULL", 1); diff --git a/test/test_query2.cpp b/test/test_query2.cpp index 50fe64b2439..d9bd7696d20 100644 --- a/test/test_query2.cpp +++ b/test/test_query2.cpp @@ -864,54 +864,6 @@ TEST(Query_FindAllContainsUnicode) CHECK_EQUAL(3, tv2[3].get(col_id)); } -TEST(Query_SyntaxCheck) -{ - Table table; - auto col_int = table.add_column(type_Int, "1"); - table.add_column(type_String, "2"); - - std::string s; - - table.create_object().set_all(1, "a"); - table.create_object().set_all(2, "a"); - table.create_object().set_all(3, "X"); - - Query q1 = table.where().equal(col_int, 2).end_group(); - s = q1.validate(); - CHECK(s != ""); - - Query q2 = table.where().group().group().equal(col_int, 2).end_group(); - s = q2.validate(); - CHECK(s != ""); - - Query q3 = table.where().equal(col_int, 2).Or(); - s = q3.validate(); - CHECK(s != ""); - - Query q4 = table.where().Or().equal(col_int, 2); - s = q4.validate(); - CHECK(s != 
""); - - Query q5 = table.where().equal(col_int, 2); - s = q5.validate(); - CHECK(s == ""); - - Query q6 = table.where().group().equal(col_int, 2); - s = q6.validate(); - CHECK(s != ""); - - // FIXME: Work is currently underway to fully support locale - // independent case folding as defined by Unicode. Reenable this test - // when is becomes available. - /* - Query q7 = ttt.where().equal(1, "\xa0", false); -#ifdef REALM_DEBUG - s = q7.verify(); - CHECK(s != ""); -#endif - */ -} - TEST(Query_TestTV_where) { // When using .where(&tv), tv can have any order, and the resulting view will retain its order