realm · jedelbo · Sep 15, 2022 · Aug 31, 2022 · Sep 6, 2022 · Sep 6, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,7 +6,7 @@
 
 ### Fixed
 * <How do the end-user experience this issue? what was the impact?> ([#????](https://github.com/realm/realm-core/issues/????), since v?.?.?)
-* None.
+* If a case-insensitive query searched for a string containing a 4-byte UTF-8 character, the program would crash ([#5825](https://github.com/realm/realm-core/issues/5825), since v2.3.0)
 
 ### Breaking changes
 * None.

diff --git a/src/realm/exceptions.hpp b/src/realm/exceptions.hpp
@@ -176,6 +176,25 @@ class NoSubscriptionForWrite : public std::runtime_error {
     NoSubscriptionForWrite(const std::string& msg);
 };
 
+namespace query_parser {
+
+/// Exception thrown when parsing fails due to invalid syntax.
+struct SyntaxError : std::runtime_error {
+    using std::runtime_error::runtime_error;
+};
+
+/// Exception thrown when binding a syntactically valid query string in a
+/// context where it does not make sense.
+struct InvalidQueryError : std::runtime_error {
+    using std::runtime_error::runtime_error;
+};
+
+/// Thrown on a problem accessing the arguments in a query string. Unlike the two above: a std::invalid_argument.
+struct InvalidQueryArgError : std::invalid_argument {
+    using std::invalid_argument::invalid_argument;
+};
+
+} // namespace query_parser
 
 /// The \c LogicError exception class is intended to be thrown only when
 /// applications (or bindings) violate rules that are stated (or ought to have

diff --git a/src/realm/parser/query_parser.hpp b/src/realm/parser/query_parser.hpp
@@ -31,22 +31,6 @@
 
 namespace realm::query_parser {
 
-/// Exception thrown when parsing fails due to invalid syntax.
-struct SyntaxError : std::runtime_error {
-    using std::runtime_error::runtime_error;
-};
-
-/// Exception thrown when binding a syntactically valid query string in a
-/// context where it does not make sense.
-struct InvalidQueryError : std::runtime_error {
-    using std::runtime_error::runtime_error;
-};
-
-/// Exception thrown when there is a problem accessing the arguments in a query string
-struct InvalidQueryArgError : std::invalid_argument {
-    using std::invalid_argument::invalid_argument;
-};
-
 struct AnyContext {
     template <typename T>
     T unbox(const std::any& wrapper)

diff --git a/src/realm/query.cpp b/src/realm/query.cpp
@@ -1827,20 +1827,6 @@ void* Query::query_thread(void* arg)
 
 #endif // REALM_MULTITHREADQUERY
 
-std::string Query::validate() const
-{
-    if (!m_groups.size())
-        return "";
-
-    if (error_code != "") // errors detected by QueryInterface
-        return error_code;
-
-    if (!root_node())
-        return "Syntax error";
-
-    return root_node()->validate(); // errors detected by QueryEngine
-}
-
 std::string Query::get_description(util::serializer::SerialisationState& state) const
 {
     std::string description;

diff --git a/src/realm/query.hpp b/src/realm/query.hpp
@@ -320,8 +320,6 @@ class Query final {
     // or empty vector if the query is not associated with a table.
     TableVersions sync_view_if_needed() const;
 
-    std::string validate() const;
-
     std::string get_description(const std::string& class_prefix = "") const;
     std::string get_description(util::serializer::SerialisationState& state) const;
 

diff --git a/src/realm/query_engine.hpp b/src/realm/query_engine.hpp
@@ -225,16 +225,6 @@ class ParentNode {
                                    ArrayPayload* source_column);
 
 
-    virtual std::string validate()
-    {
-        if (error_code != "")
-            return error_code;
-        if (m_child == nullptr)
-            return "";
-        else
-            return m_child->validate();
-    }
-
     ParentNode(const ParentNode& from);
 
     void add_child(std::unique_ptr<ParentNode> child)
@@ -320,7 +310,6 @@ class ParentNode {
     ConstTableRef m_table = ConstTableRef();
     const Cluster* m_cluster = nullptr;
     QueryStateBase* m_state = nullptr;
-    std::string error_code;
     static std::vector<ObjKey> s_dummy_keys;
 
     ColumnType get_real_column_type(ColKey key)
@@ -1582,7 +1571,7 @@ class StringNode : public StringNodeBase {
         auto upper = case_map(v, true);
         auto lower = case_map(v, false);
         if (!upper || !lower) {
-            error_code = "Malformed UTF-8: " + std::string(v);
+            throw query_parser::InvalidQueryError(util::format("Malformed UTF-8: %1", v));
         }
         else {
             m_ucase = std::move(*upper);
@@ -1707,7 +1696,7 @@ class StringNode<ContainsIns> : public StringNodeBase {
         auto upper = case_map(v, true);
         auto lower = case_map(v, false);
         if (!upper || !lower) {
-            error_code = "Malformed UTF-8: " + std::string(v);
+            throw query_parser::InvalidQueryError(util::format("Malformed UTF-8: %1", v));
         }
         else {
             m_ucase = std::move(*upper);
@@ -1921,7 +1910,7 @@ class StringNode<EqualIns> : public StringNodeEqualBase {
         auto upper = case_map(v, true);
         auto lower = case_map(v, false);
         if (!upper || !lower) {
-            error_code = "Malformed UTF-8: " + std::string(v);
+            throw query_parser::InvalidQueryError(util::format("Malformed UTF-8: %1", v));
         }
         else {
             m_ucase = std::move(*upper);
@@ -2106,27 +2095,6 @@ class OrNode : public ParentNode {
         return index;
     }
 
-    std::string validate() override
-    {
-        if (error_code != "")
-            return error_code;
-        if (m_conditions.size() == 0)
-            return "Missing left-hand side of OR";
-        if (m_conditions.size() == 1)
-            return "Missing right-hand side of OR";
-        std::string s;
-        if (m_child != 0)
-            s = m_child->validate();
-        if (s != "")
-            return s;
-        for (size_t i = 0; i < m_conditions.size(); ++i) {
-            s = m_conditions[i]->validate();
-            if (s != "")
-                return s;
-        }
-        return "";
-    }
-
     std::unique_ptr<ParentNode> clone() const override
     {
         return std::unique_ptr<ParentNode>(new OrNode(*this));
@@ -2166,6 +2134,9 @@ class NotNode : public ParentNode {
         : m_condition(std::move(condition))
     {
         m_dT = 50.0;
+        if (!m_condition) {
+            throw query_parser::InvalidQueryError("Missing argument to Not");
+        }
     }
 
     void table_changed() override
@@ -2194,23 +2165,6 @@ class NotNode : public ParentNode {
 
     size_t find_first_local(size_t start, size_t end) override;
 
-    std::string validate() override
-    {
-        if (error_code != "")
-            return error_code;
-        if (m_condition == 0)
-            return "Missing argument to Not";
-        std::string s;
-        if (m_child != 0)
-            s = m_child->validate();
-        if (s != "")
-            return s;
-        s = m_condition->validate();
-        if (s != "")
-            return s;
-        return "";
-    }
-
     std::string describe(util::serializer::SerialisationState& state) const override
     {
         if (m_condition) {

diff --git a/src/realm/unicode.cpp b/src/realm/unicode.cpp
@@ -274,79 +274,6 @@ bool utf8_compare(StringData string1, StringData string2)
     return false;
 }
 
-// Here is a version for Windows that may be closer to what is ultimately needed.
-/*
-bool case_map(const char* begin, const char* end, StringBuffer& dest, bool upper)
-{
-const int wide_buffer_size = 32;
-wchar_t wide_buffer[wide_buffer_size];
-
-dest.resize(end-begin);
-size_t dest_offset = 0;
-
-for (;;) {
-int num_out;
-
-// Decode
-{
-size_t num_in = end - begin;
-if (size_t(32) <= num_in) {
-num_out = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, begin, 32, wide_buffer, wide_buffer_size);
-if (num_out != 0) {
-begin += 32;
-goto convert;
-}
-if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) return false;
-}
-if (num_in == 0) break;
-int n = num_in < size_t(8) ? int(num_in) : 8;
-num_out = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, begin, n, wide_buffer, wide_buffer_size);
-if (num_out != 0) {
-begin += n;
-goto convert;
-}
-return false;
-}
-
-convert:
-if (upper) {
-for (int i=0; i<num_out; ++i) {
-CharUpperW(wide_buffer + i);
-}
-}
-else {
-for (int i=0; i<num_out; ++i) {
-CharLowerW(wide_buffer + i);
-}
-}
-
-encode:
-{
-size_t free = dest.size() - dest_offset;
-if (int_less_than(std::numeric_limits<int>::max(), free)) free = std::numeric_limits<int>::max();
-int n = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, wide_buffer, num_out,
-dest.data() + dest_offset, int(free), 0, 0);
-if (i != 0) {
-dest_offset += n;
-continue;
-}
-if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) return false;
-size_t dest_size = dest.size();
-if (int_multiply_with_overflow_detect(dest_size, 2)) {
-if (dest_size == std::numeric_limits<size_t>::max()) return false;
-dest_size = std::numeric_limits<size_t>::max();
-}
-dest.resize(dest_size);
-goto encode;
-}
-}
-
-dest.resize(dest_offset);
-return true;
-}
-*/
-
-
 // Converts UTF-8 source into upper or lower case. This function
 // preserves the byte length of each UTF-8 character in following way:
 // If an output character differs in size, it is simply substituded by
@@ -358,28 +285,44 @@ util::Optional<std::string> case_map(StringData source, bool upper)
     result.resize(source.size());
 
 #if defined(_WIN32)
+    constexpr size_t tmp_buffer_size = 32;
     const char* begin = source.data();
     const char* end = begin + source.size();
     auto output = result.begin();
     while (begin != end) {
-        int n = static_cast<int>(sequence_length(*begin));
-        if (n == 0 || end - begin < n)
-            return util::none;
+        size_t n = end - begin;
+        if (n > tmp_buffer_size) {
+            // Break the input string into chunks - but don't break in the middle of a multibyte character
+            const char* p = begin;
+            const char* buffer_end = begin + tmp_buffer_size;
+            while (p < buffer_end) {
+                size_t len = sequence_length(*p);
+                // A length of 0 means *p cannot start a UTF-8 sequence; without this check p would never advance
+                if (len == 0)
+                    return util::none;
+                p += len;
+                if (p > buffer_end) {
+                    p -= len;
+                    break;
+                }
+            }
+            n = p - begin;
+        }
 
-        wchar_t tmp[2]; // FIXME: Why no room for UTF-16 surrogate
+        wchar_t tmp[tmp_buffer_size];
 
-        int n2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, begin, n, tmp, 1);
+        int n2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, begin, n, tmp, tmp_buffer_size);
         if (n2 == 0)
             return util::none;
 
-        REALM_ASSERT(n2 == 1);
-        tmp[n2] = 0;
+        if (n2 < tmp_buffer_size)
+            tmp[n2] = 0;
 
         // Note: If tmp[0] == 0, it is because the string contains a
         // null-chacarcter, which is perfectly fine.
 
-        wchar_t mapped_tmp[2];
-        LCMapStringEx(LOCALE_NAME_INVARIANT, upper ? LCMAP_UPPERCASE : LCMAP_LOWERCASE, tmp, 1, mapped_tmp, 2,
+        wchar_t mapped_tmp[tmp_buffer_size];
+        LCMapStringEx(LOCALE_NAME_INVARIANT, upper ? LCMAP_UPPERCASE : LCMAP_LOWERCASE, tmp, n2, mapped_tmp, tmp_buffer_size,
                       nullptr, nullptr, 0);
 
         // FIXME: The intention is to use flag 'WC_ERR_INVALID_CHARS'
@@ -388,7 +331,8 @@ util::Optional<std::string> case_map(StringData source, bool upper)
         // the flag is specified, the function fails with error
         // ERROR_INVALID_FLAGS.
         DWORD flags = 0;
-        int n3 = WideCharToMultiByte(CP_UTF8, flags, mapped_tmp, 1, &*output, static_cast<int>(end - begin), 0, 0);
+        auto m = static_cast<int>(end - begin);
+        int n3 = WideCharToMultiByte(CP_UTF8, flags, mapped_tmp, n2, &*output, m, 0, 0);
         if (n3 == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER)
             return util::none;
 

diff --git a/test/test_index_string.cpp b/test/test_index_string.cpp
@@ -1826,4 +1826,12 @@ TEST(StringIndex_MixedEqualBitPattern)
     CHECK_EQUAL(tv.get_object(1).get_any(col), val1);
 }
 
+TEST(Unicode_Casemap)
+{
+    // The emoji are 4-byte UTF-8 sequences; upper-casing must leave them untouched
+    const std::string mixed_case = "A very old house 🏠 is on 🔥, we have to save the 🦄";
+    if (const auto out = case_map(mixed_case, true); CHECK(out)) {
+        CHECK_EQUAL(*out, "A VERY OLD HOUSE 🏠 IS ON 🔥, WE HAVE TO SAVE THE 🦄");
+    }
+}
 #endif // TEST_INDEX_STRING