Skip to content

Commit

Permalink
Improve Unicode handling on non-Windows platforms
Browse files Browse the repository at this point in the history
  • Loading branch information
jedelbo committed Sep 14, 2022
1 parent a7697c5 commit 7f3d7c3
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 22 deletions.
74 changes: 52 additions & 22 deletions src/realm/unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -343,32 +343,62 @@ util::Optional<std::string> case_map(StringData source, bool upper)

return result;
#else
// FIXME: Implement this! Note that this is trivial in C++11 due
// to its built-in support for UTF-8. In C++03 it is trivial when
// __STDC_ISO_10646__ is defined. Also consider using ICU. Maybe
// GNU has something to offer too.

// For now we handle just the ASCII subset
size_t sz = source.size();
typedef std::char_traits<char> traits;
if (upper) {
size_t n = source.size();
for (size_t i = 0; i < n; ++i) {
char c = source[i];
if (traits::lt(0x60, c) && traits::lt(c, 0x7B))
c = traits::to_char_type(traits::to_int_type(c) - 0x20);
result[i] = c;
for (size_t i = 0; i < sz; ++i) {
char c = source[i];

auto copy_bytes = [&](size_t n) {
if (i + n > sz) {
return false;
}
for (size_t j = 1; j < n; j++) {
result[i++] = c;
c = source[i];
}
return true;
};

if (upper && (c >= 'a' && c <= 'z')) {
c -= 0x20;
}
}
else { // lower
size_t n = source.size();
for (size_t i = 0; i < n; ++i) {
char c = source[i];
if (traits::lt(0x40, c) && traits::lt(c, 0x5B))
c = traits::to_char_type(traits::to_int_type(c) + 0x20);
result[i] = c;
else if (!upper && (c >= 'A' && c <= 'Z')) {
c += 0x20;
}
}
else if (traits::to_int_type(c) > 0x7f) {
auto int_val = traits::to_int_type(c);
if ((int_val & 0xE0) == 0xc0) {
// 2 byte utf-8
if (i + 2 > sz) {
return {};
}
auto u = ((int_val << 6) + (traits::to_int_type(source[i + 1]) & 0x3F)) & 0x7FF;
// Handle some Latin-1 supplement characters
if (upper && (u >= 0xE0 && u <= 0xFE && u != 0xF7)) {
u -= 0x20;
}
else if (!upper && (u >= 0xC0 && u <= 0xDE && u != 0xD7)) {
u += 0x20;
}

result[i++] = static_cast<char>((u >> 6) | 0xC0);
c = static_cast<char>((u & 0x3f) | 0x80);
}
else if ((int_val & 0xF0) == 0xE0) {
// 3 byte utf-8
if (!copy_bytes(3)) {
return {};
}
}
else if ((int_val & 0xF8) == 0xF0) {
// 4 byte utf-8
if (!copy_bytes(4)) {
return {};
}
}
}
result[i] = c;
}
return result;
#endif
}
Expand Down
14 changes: 14 additions & 0 deletions test/test_index_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1833,5 +1833,19 @@ TEST(Unicode_Casemap)
if (CHECK(out)) {
CHECK_EQUAL(*out, "A VERY OLD HOUSE 🏠 IS ON 🔥, WE HAVE TO SAVE THE 🦄");
}

StringData trailing_garbage(inp.data(), 19); // String terminated inside icon
out = case_map(trailing_garbage, true);
CHECK_NOT(out);

inp = "rødgrød med fløde";
out = case_map(inp, true);
if (CHECK(out)) {
CHECK_EQUAL(*out, "RØDGRØD MED FLØDE");
}
out = case_map(out, false);
if (CHECK(out)) {
CHECK_EQUAL(*out, inp);
}
}
#endif // TEST_INDEX_STRING

0 comments on commit 7f3d7c3

Please sign in to comment.