From 7f3d7c3c82359a373fce6b892ddf4e8c0b25bf13 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B8rgen=20Edelbo?= <jorgen.edelbo@mongodb.com>
Date: Wed, 14 Sep 2022 10:53:42 +0200
Subject: [PATCH] Improve unicode handling on non-Windows

---
 src/realm/unicode.cpp      | 74 ++++++++++++++++++++++++++------------
 test/test_index_string.cpp | 14 ++++++++
 2 files changed, 66 insertions(+), 22 deletions(-)
diff --git a/src/realm/unicode.cpp b/src/realm/unicode.cpp
index a68f0c56ca5..d485f31984d 100644
--- a/src/realm/unicode.cpp
+++ b/src/realm/unicode.cpp
@@ -343,32 +343,62 @@ util::Optional<std::string> case_map(StringData source, bool upper)
 
     return result;
 #else
-    // FIXME: Implement this! Note that this is trivial in C++11 due
-    // to its built-in support for UTF-8. In C++03 it is trivial when
-    // __STDC_ISO_10646__ is defined. Also consider using ICU. Maybe
-    // GNU has something to offer too.
-
-    // For now we handle just the ASCII subset
+    size_t sz = source.size();
     typedef std::char_traits<char> traits;
-    if (upper) {
-        size_t n = source.size();
-        for (size_t i = 0; i < n; ++i) {
-            char c = source[i];
-            if (traits::lt(0x60, c) && traits::lt(c, 0x7B))
-                c = traits::to_char_type(traits::to_int_type(c) - 0x20);
-            result[i] = c;
+    for (size_t i = 0; i < sz; ++i) {
+        char c = source[i];
+
+        auto copy_bytes = [&](size_t n) { // pass an n-byte sequence starting at index i through unchanged
+            if (i + n > sz) { // the sequence would extend past the end of source
+                return false;
+            }
+            for (size_t j = 1; j < n; j++) { // emit the first n - 1 bytes, advancing the outer index i
+                result[i++] = c;
+                c = source[i];
+            }
+            return true; // c now holds the sequence's last byte and i is its index
+        };
+
+        if (upper && (c >= 'a' && c <= 'z')) {
+            c -= 0x20;
         }
-    }
-    else { // lower
-        size_t n = source.size();
-        for (size_t i = 0; i < n; ++i) {
-            char c = source[i];
-            if (traits::lt(0x40, c) && traits::lt(c, 0x5B))
-                c = traits::to_char_type(traits::to_int_type(c) + 0x20);
-            result[i] = c;
+        else if (!upper && (c >= 'A' && c <= 'Z')) {
+            c += 0x20;
         }
-    }
+        else if (traits::to_int_type(c) > 0x7f) {
+            auto int_val = traits::to_int_type(c);
+            if ((int_val & 0xE0) == 0xc0) {
+                // 2 byte utf-8
+                if (i + 2 > sz) {
+                    return {};
+                }
+                auto u = ((int_val << 6) + (traits::to_int_type(source[i + 1]) & 0x3F)) & 0x7FF;
+                // Handle some Latin-1 supplement characters
+                if (upper && (u >= 0xE0 && u <= 0xFE && u != 0xF7)) {
+                    u -= 0x20;
+                }
+                else if (!upper && (u >= 0xC0 && u <= 0xDE && u != 0xD7)) {
+                    u += 0x20;
+                }
 
+                result[i++] = static_cast<char>((u >> 6) | 0xC0);
+                c = static_cast<char>((u & 0x3f) | 0x80);
+            }
+            else if ((int_val & 0xF0) == 0xE0) {
+                // 3 byte utf-8
+                if (!copy_bytes(3)) {
+                    return {};
+                }
+            }
+            else if ((int_val & 0xF8) == 0xF0) {
+                // 4 byte utf-8
+                if (!copy_bytes(4)) {
+                    return {};
+                }
+            }
+        }
+        result[i] = c;
+    }
     return result;
 #endif
 }
diff --git a/test/test_index_string.cpp b/test/test_index_string.cpp
index 3e6306d11fc..5f5dee20914 100644
--- a/test/test_index_string.cpp
+++ b/test/test_index_string.cpp
@@ -1833,5 +1833,19 @@ TEST(Unicode_Casemap)
     if (CHECK(out)) {
         CHECK_EQUAL(*out, "A VERY OLD HOUSE 🏠 IS ON 🔥, WE HAVE TO SAVE THE 🦄");
     }
+
+    StringData trailing_garbage(inp.data(), 19); // String terminated inside icon
+    out = case_map(trailing_garbage, true);
+    CHECK_NOT(out);
+
+    inp = "rødgrød med fløde";
+    out = case_map(inp, true);
+    if (CHECK(out)) {
+        CHECK_EQUAL(*out, "RØDGRØD MED FLØDE");
+    }
+    out = case_map(out, false);
+    if (CHECK(out)) {
+        CHECK_EQUAL(*out, inp);
+    }
 }
 #endif // TEST_INDEX_STRING