From 7f3d7c3c82359a373fce6b892ddf4e8c0b25bf13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B8rgen=20Edelbo?= Date: Wed, 14 Sep 2022 10:53:42 +0200 Subject: [PATCH] Improve unicode handling on non-Windows --- src/realm/unicode.cpp | 74 ++++++++++++++++++++++++++------------ test/test_index_string.cpp | 14 ++++++++ 2 files changed, 66 insertions(+), 22 deletions(-) diff --git a/src/realm/unicode.cpp b/src/realm/unicode.cpp index a68f0c56ca5..d485f31984d 100644 --- a/src/realm/unicode.cpp +++ b/src/realm/unicode.cpp @@ -343,32 +343,62 @@ util::Optional case_map(StringData source, bool upper) return result; #else - // FIXME: Implement this! Note that this is trivial in C++11 due - // to its built-in support for UTF-8. In C++03 it is trivial when - // __STDC_ISO_10646__ is defined. Also consider using ICU. Maybe - // GNU has something to offer too. - - // For now we handle just the ASCII subset + size_t sz = source.size(); typedef std::char_traits traits; - if (upper) { - size_t n = source.size(); - for (size_t i = 0; i < n; ++i) { - char c = source[i]; - if (traits::lt(0x60, c) && traits::lt(c, 0x7B)) - c = traits::to_char_type(traits::to_int_type(c) - 0x20); - result[i] = c; + for (size_t i = 0; i < sz; ++i) { + char c = source[i]; + + auto copy_bytes = [&](size_t n) { + if (i + n > sz) { + return false; + } + for (size_t j = 1; j < n; j++) { + result[i++] = c; + c = source[i]; + } + return true; + }; + + if (upper && (c >= 'a' && c <= 'z')) { + c -= 0x20; } - } - else { // lower - size_t n = source.size(); - for (size_t i = 0; i < n; ++i) { - char c = source[i]; - if (traits::lt(0x40, c) && traits::lt(c, 0x5B)) - c = traits::to_char_type(traits::to_int_type(c) + 0x20); - result[i] = c; + else if (!upper && (c >= 'A' && c <= 'Z')) { + c += 0x20; } - } + else if (traits::to_int_type(c) > 0x7f) { + auto int_val = traits::to_int_type(c); + if ((int_val & 0xE0) == 0xc0) { + // 2 byte utf-8 + if (i + 2 > sz) { + return {}; + } + auto u = ((int_val << 6) + 
(traits::to_int_type(source[i + 1]) & 0x3F)) & 0x7FF; + // Handle some Latin-1 supplement characters + if (upper && (u >= 0xE0 && u <= 0xFE && u != 0xF7)) { + u -= 0x20; + } + else if (!upper && (u >= 0xC0 && u <= 0xDE && u != 0xD7)) { + u += 0x20; + } + result[i++] = static_cast((u >> 6) | 0xC0); + c = static_cast((u & 0x3f) | 0x80); + } + else if ((int_val & 0xF0) == 0xE0) { + // 3 byte utf-8 + if (!copy_bytes(3)) { + return {}; + } + } + else if ((int_val & 0xF8) == 0xF0) { + // 4 byte utf-8 + if (!copy_bytes(4)) { + return {}; + } + } + } + result[i] = c; + } return result; #endif } diff --git a/test/test_index_string.cpp b/test/test_index_string.cpp index 3e6306d11fc..5f5dee20914 100644 --- a/test/test_index_string.cpp +++ b/test/test_index_string.cpp @@ -1833,5 +1833,19 @@ TEST(Unicode_Casemap) if (CHECK(out)) { CHECK_EQUAL(*out, "A VERY OLD HOUSE 🏠 IS ON 🔥, WE HAVE TO SAVE THE 🦄"); } + + StringData trailing_garbage(inp.data(), 19); // String terminated inside icon + out = case_map(trailing_garbage, true); + CHECK_NOT(out); + + inp = "rødgrød med fløde"; + out = case_map(inp, true); + if (CHECK(out)) { + CHECK_EQUAL(*out, "RØDGRØD MED FLØDE"); + } + out = case_map(out, false); + if (CHECK(out)) { + CHECK_EQUAL(*out, inp); + } } #endif // TEST_INDEX_STRING