Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix handling of 4-byte UTF8 values on Windows #5803

Merged
merged 12 commits into from
Sep 15, 2022
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

### Fixed
* <How does the end user experience this issue? What was the impact?> ([#????](https://github.com/realm/realm-core/issues/????), since v?.?.?)
* None.
* If a case-insensitive query searched for a string including a 4-byte UTF-8 character, the program would crash ([#5825](https://github.com/realm/realm-core/issues/5825), since v2.3.0)

### Breaking changes
* None.
Expand Down
19 changes: 19 additions & 0 deletions src/realm/exceptions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,25 @@ class NoSubscriptionForWrite : public std::runtime_error {
NoSubscriptionForWrite(const std::string& msg);
};

namespace query_parser {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I should have put my previous comment on naming here. Since these exceptions are thrown from the query engine now and not necessarily from the query parser, can we change this namespace to something more general such as query?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's not change this in this PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, sure. In fact, I'd expect all these exceptions will be changing soon with the upcoming error handling work.


/// Signals that parsing failed because the input was not syntactically valid.
/// Constructible exactly like std::runtime_error (constructors are inherited).
struct SyntaxError : public std::runtime_error {
    using std::runtime_error::runtime_error;
};

/// Signals that a query string, although syntactically valid, was bound in a
/// context where it does not make sense.
/// Constructible exactly like std::runtime_error (constructors are inherited).
struct InvalidQueryError : public std::runtime_error {
    using std::runtime_error::runtime_error;
};

/// Signals a problem accessing the arguments supplied for a query string.
/// Constructible exactly like std::invalid_argument (constructors are inherited).
struct InvalidQueryArgError : public std::invalid_argument {
    using std::invalid_argument::invalid_argument;
};

} // namespace query_parser

/// The \c LogicError exception class is intended to be thrown only when
/// applications (or bindings) violate rules that are stated (or ought to have
Expand Down
16 changes: 0 additions & 16 deletions src/realm/parser/query_parser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,22 +31,6 @@

namespace realm::query_parser {

/// Signals that parsing failed because the input was not syntactically valid.
/// Constructible exactly like std::runtime_error (constructors are inherited).
struct SyntaxError : public std::runtime_error {
    using std::runtime_error::runtime_error;
};

/// Signals that a query string, although syntactically valid, was bound in a
/// context where it does not make sense.
/// Constructible exactly like std::runtime_error (constructors are inherited).
struct InvalidQueryError : public std::runtime_error {
    using std::runtime_error::runtime_error;
};

/// Signals a problem accessing the arguments supplied for a query string.
/// Constructible exactly like std::invalid_argument (constructors are inherited).
struct InvalidQueryArgError : public std::invalid_argument {
    using std::invalid_argument::invalid_argument;
};

struct AnyContext {
template <typename T>
T unbox(const std::any& wrapper)
Expand Down
14 changes: 0 additions & 14 deletions src/realm/query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1827,20 +1827,6 @@ void* Query::query_thread(void* arg)

#endif // REALM_MULTITHREADQUERY

/// Returns a description of the first problem found in this query, or an
/// empty string when no problem is found.
///
/// Checks are made in this order: a query without groups is reported as
/// valid; an error already recorded on the query itself is returned as-is;
/// a missing root node is reported as "Syntax error"; otherwise the result
/// of validating the node tree from the root is returned.
std::string Query::validate() const
{
    // No groups: nothing to validate.
    if (m_groups.size() == 0)
        return std::string();

    // Errors detected by QueryInterface.
    if (error_code != "")
        return error_code;

    auto&& root = root_node();
    if (!root)
        return "Syntax error";

    // Errors detected by QueryEngine.
    return root->validate();
}

std::string Query::get_description(util::serializer::SerialisationState& state) const
{
std::string description;
Expand Down
2 changes: 0 additions & 2 deletions src/realm/query.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -320,8 +320,6 @@ class Query final {
// or empty vector if the query is not associated with a table.
TableVersions sync_view_if_needed() const;

std::string validate() const;

std::string get_description(const std::string& class_prefix = "") const;
std::string get_description(util::serializer::SerialisationState& state) const;

Expand Down
58 changes: 6 additions & 52 deletions src/realm/query_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -225,16 +225,6 @@ class ParentNode {
ArrayPayload* source_column);


/// Returns the first error message found on this node or on the nodes
/// chained after it via m_child; an empty string means no error was found.
virtual std::string validate()
{
    // An error recorded on this node wins over anything further down the chain.
    if (!error_code.empty())
        return error_code;

    // Delegate to the next node when there is one.
    return m_child ? m_child->validate() : std::string();
}

ParentNode(const ParentNode& from);

void add_child(std::unique_ptr<ParentNode> child)
Expand Down Expand Up @@ -320,7 +310,6 @@ class ParentNode {
ConstTableRef m_table = ConstTableRef();
const Cluster* m_cluster = nullptr;
QueryStateBase* m_state = nullptr;
std::string error_code;
static std::vector<ObjKey> s_dummy_keys;

ColumnType get_real_column_type(ColKey key)
Expand Down Expand Up @@ -1582,7 +1571,7 @@ class StringNode : public StringNodeBase {
auto upper = case_map(v, true);
auto lower = case_map(v, false);
if (!upper || !lower) {
error_code = "Malformed UTF-8: " + std::string(v);
throw query_parser::InvalidQueryError(util::format("Malformed UTF-8: %1", v));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are not "query parser" errors, they are general query errors. Can we use another name?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok - changed to std::runtime_error.

}
else {
m_ucase = std::move(*upper);
Expand Down Expand Up @@ -1707,7 +1696,7 @@ class StringNode<ContainsIns> : public StringNodeBase {
auto upper = case_map(v, true);
auto lower = case_map(v, false);
if (!upper || !lower) {
error_code = "Malformed UTF-8: " + std::string(v);
throw query_parser::InvalidQueryError(util::format("Malformed UTF-8: %1", v));
}
else {
m_ucase = std::move(*upper);
Expand Down Expand Up @@ -1921,7 +1910,7 @@ class StringNode<EqualIns> : public StringNodeEqualBase {
auto upper = case_map(v, true);
auto lower = case_map(v, false);
if (!upper || !lower) {
error_code = "Malformed UTF-8: " + std::string(v);
throw query_parser::InvalidQueryError(util::format("Malformed UTF-8: %1", v));
}
else {
m_ucase = std::move(*upper);
Expand Down Expand Up @@ -2106,27 +2095,6 @@ class OrNode : public ParentNode {
return index;
}

std::string validate() override
{
if (error_code != "")
return error_code;
if (m_conditions.size() == 0)
return "Missing left-hand side of OR";
if (m_conditions.size() == 1)
return "Missing right-hand side of OR";
std::string s;
if (m_child != 0)
s = m_child->validate();
if (s != "")
return s;
for (size_t i = 0; i < m_conditions.size(); ++i) {
s = m_conditions[i]->validate();
if (s != "")
return s;
}
return "";
}

/// Creates a copy of this node via OrNode's copy constructor and returns it
/// owned through a pointer to the ParentNode base.
std::unique_ptr<ParentNode> clone() const override
{
    std::unique_ptr<ParentNode> copy(new OrNode(*this));
    return copy;
}
Expand Down Expand Up @@ -2166,6 +2134,9 @@ class NotNode : public ParentNode {
: m_condition(std::move(condition))
{
m_dT = 50.0;
if (!m_condition) {
throw query_parser::InvalidQueryError("Missing argument to Not");
}
}

void table_changed() override
Expand Down Expand Up @@ -2194,23 +2165,6 @@ class NotNode : public ParentNode {

size_t find_first_local(size_t start, size_t end) override;

/// Returns the first error message found for this NOT node, or an empty
/// string when none is found.
///
/// Order of checks: the node's own recorded error, presence of the negated
/// condition, the chained child node, and finally the negated condition
/// itself.
std::string validate() override
{
    if (!error_code.empty())
        return error_code;

    if (!m_condition)
        return "Missing argument to Not";

    if (m_child) {
        std::string child_error = m_child->validate();
        if (!child_error.empty())
            return child_error;
    }

    // Whatever the condition reports (possibly an empty string) is the final verdict.
    return m_condition->validate();
}

std::string describe(util::serializer::SerialisationState& state) const override
{
if (m_condition) {
Expand Down
107 changes: 24 additions & 83 deletions src/realm/unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -274,79 +274,6 @@ bool utf8_compare(StringData string1, StringData string2)
return false;
}

// Here is a version for Windows that may be closer to what is ultimately needed.
/*
bool case_map(const char* begin, const char* end, StringBuffer& dest, bool upper)
{
const int wide_buffer_size = 32;
wchar_t wide_buffer[wide_buffer_size];

dest.resize(end-begin);
size_t dest_offset = 0;

for (;;) {
int num_out;

// Decode
{
size_t num_in = end - begin;
if (size_t(32) <= num_in) {
num_out = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, begin, 32, wide_buffer, wide_buffer_size);
if (num_out != 0) {
begin += 32;
goto convert;
}
if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) return false;
}
if (num_in == 0) break;
int n = num_in < size_t(8) ? int(num_in) : 8;
num_out = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, begin, n, wide_buffer, wide_buffer_size);
if (num_out != 0) {
begin += n;
goto convert;
}
return false;
}

convert:
if (upper) {
for (int i=0; i<num_out; ++i) {
CharUpperW(wide_buffer + i);
}
}
else {
for (int i=0; i<num_out; ++i) {
CharLowerW(wide_buffer + i);
}
}

encode:
{
size_t free = dest.size() - dest_offset;
if (int_less_than(std::numeric_limits<int>::max(), free)) free = std::numeric_limits<int>::max();
int n = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, wide_buffer, num_out,
dest.data() + dest_offset, int(free), 0, 0);
if (i != 0) {
dest_offset += n;
continue;
}
if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) return false;
size_t dest_size = dest.size();
if (int_multiply_with_overflow_detect(dest_size, 2)) {
if (dest_size == std::numeric_limits<size_t>::max()) return false;
dest_size = std::numeric_limits<size_t>::max();
}
dest.resize(dest_size);
goto encode;
}
}

dest.resize(dest_offset);
return true;
}
*/


// Converts UTF-8 source into upper or lower case. This function
// preserves the byte length of each UTF-8 character in following way:
// If an output character differs in size, it is simply substituted by
Expand All @@ -358,28 +285,41 @@ util::Optional<std::string> case_map(StringData source, bool upper)
result.resize(source.size());

#if defined(_WIN32)
constexpr size_t tmp_buffer_size = 32;
const char* begin = source.data();
const char* end = begin + source.size();
auto output = result.begin();
while (begin != end) {
int n = static_cast<int>(sequence_length(*begin));
if (n == 0 || end - begin < n)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We still need something like this check don't we? What if there are random bytes at the end of the string which appear to be a multibyte character but are just garbage such that sequence_length() returns 4 but there is only one character left in the string?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed the logic a bit so that trailing garbage should be handled. I did not have a chance to test it on a Windows machine today. I will do that tomorrow.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, I'd be more convinced that this works if there was a test for trailing garbage as well 😄

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps we should not specify how trailing garbage would be treated and fall back to "garbage in -> garbage out". This is anyway how the non-windows version works.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, let's match the non-windows version. Can you add a test with trailing garbage to check the consistency though?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think I can make a test that has the same outcome on both Windows and Linux. On Linux we will just copy over anything that is not an ASCII letter, but on Windows it will fail if there is something that cannot be translated. That was what I meant with my comment: We should not specify how it behaves, as it behaves differently.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I will concede platform dependent behaviour here. However, I'd still request a (platform dependent) test of garbage characters to verify that we do not crash.

return util::none;
size_t n = end - begin;
if (n > tmp_buffer_size) {
// Break the input string into chunks - but don't break in the middle of a multibyte character
const char* p = begin;
n = 0;
while (p != end) {
size_t len = sequence_length(*p);
p += len;
n += len;
if (n > tmp_buffer_size) {
n -= len;
break;
}
}
}

wchar_t tmp[2]; // FIXME: Why no room for UTF-16 surrogate
wchar_t tmp[tmp_buffer_size];

int n2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, begin, n, tmp, 1);
int n2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, begin, n, tmp, tmp_buffer_size);
if (n2 == 0)
return util::none;

REALM_ASSERT(n2 == 1);
tmp[n2] = 0;
if (n2 < tmp_buffer_size)
tmp[n2] = 0;

// Note: If tmp[0] == 0, it is because the string contains a
// null-character, which is perfectly fine.

wchar_t mapped_tmp[2];
LCMapStringEx(LOCALE_NAME_INVARIANT, upper ? LCMAP_UPPERCASE : LCMAP_LOWERCASE, tmp, 1, mapped_tmp, 2,
wchar_t mapped_tmp[tmp_buffer_size];
LCMapStringEx(LOCALE_NAME_INVARIANT, upper ? LCMAP_UPPERCASE : LCMAP_LOWERCASE, tmp, n2, mapped_tmp, tmp_buffer_size,
nullptr, nullptr, 0);

// FIXME: The intention is to use flag 'WC_ERR_INVALID_CHARS'
Expand All @@ -388,7 +328,8 @@ util::Optional<std::string> case_map(StringData source, bool upper)
// the flag is specified, the function fails with error
// ERROR_INVALID_FLAGS.
DWORD flags = 0;
int n3 = WideCharToMultiByte(CP_UTF8, flags, mapped_tmp, 1, &*output, static_cast<int>(end - begin), 0, 0);
auto m = static_cast<int>(end - begin);
int n3 = WideCharToMultiByte(CP_UTF8, flags, mapped_tmp, n2, &*output, m, 0, 0);
if (n3 == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER)
return util::none;

Expand Down
8 changes: 8 additions & 0 deletions test/test_index_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1826,4 +1826,12 @@ TEST(StringIndex_MixedEqualBitPattern)
CHECK_EQUAL(tv.get_object(1).get_any(col), val1);
}

// Upper-cases a string that mixes ASCII text with emoji and checks that the
// conversion yields a value in which the ASCII letters are upper-cased and
// the emoji are unchanged.
TEST(Unicode_Casemap)
{
    std::string mixed_input = "A very old house 🏠 is on 🔥, we have to save the 🦄";
    auto upper_cased = case_map(mixed_input, true);
    // Only dereference the result when the conversion actually produced a value.
    if (CHECK(upper_cased)) {
        CHECK_EQUAL(*upper_cased, "A VERY OLD HOUSE 🏠 IS ON 🔥, WE HAVE TO SAVE THE 🦄");
    }
}
#endif // TEST_INDEX_STRING
Loading