From b0f59837a6afc7a145a8fd008fdfece78973e179 Mon Sep 17 00:00:00 2001 From: Andrei Litvin Date: Thu, 9 Nov 2023 14:10:17 -0500 Subject: [PATCH] Define a `IsValidUtf8` equivalent with unit tests (#30386) * Add a completely untested (but compilable) utf8 tester * Start adding unit tests * More test cases * tests * More tests * Fix typo * Slightly better formatting on test * Add comment about embedded zeroes * Restyle * Allow embedded zeroes (even if it pains me to do so) * Update copyright year and added 2 more tests * Restyle * Some comments and added another set of tests * Use fromCharString * Typo fixes * Fix dates for copyrights * Restyle --- src/lib/support/BUILD.gn | 2 + src/lib/support/tests/BUILD.gn | 1 + src/lib/support/tests/TestUtf8.cpp | 170 +++++++++++++++++++++++++ src/lib/support/utf8.cpp | 194 +++++++++++++++++++++++++++++ src/lib/support/utf8.h | 47 +++++++ 5 files changed, 414 insertions(+) create mode 100644 src/lib/support/tests/TestUtf8.cpp create mode 100644 src/lib/support/utf8.cpp create mode 100644 src/lib/support/utf8.h diff --git a/src/lib/support/BUILD.gn b/src/lib/support/BUILD.gn index c1118ef0aa588b..1c44f3191445d5 100644 --- a/src/lib/support/BUILD.gn +++ b/src/lib/support/BUILD.gn @@ -214,6 +214,8 @@ static_library("support") { "logging/BinaryLogging.cpp", "logging/BinaryLogging.h", "logging/CHIPLogging.h", + "utf8.cpp", + "utf8.h", "verhoeff/Verhoeff.cpp", "verhoeff/Verhoeff.h", "verhoeff/Verhoeff10.cpp", diff --git a/src/lib/support/tests/BUILD.gn b/src/lib/support/tests/BUILD.gn index 49147e7814dab8..1fd19e8d32190e 100644 --- a/src/lib/support/tests/BUILD.gn +++ b/src/lib/support/tests/BUILD.gn @@ -55,6 +55,7 @@ chip_test_suite_using_nltest("tests") { "TestTimeUtils.cpp", "TestTlvJson.cpp", "TestTlvToJson.cpp", + "TestUtf8.cpp", "TestVariant.cpp", "TestZclString.cpp", ] diff --git a/src/lib/support/tests/TestUtf8.cpp b/src/lib/support/tests/TestUtf8.cpp new file mode 100644 index 00000000000000..181fc377a38743 --- /dev/null +++ 
b/src/lib/support/tests/TestUtf8.cpp @@ -0,0 +1,170 @@ + +/* + * + * Copyright (c) 2023 Project CHIP Authors + * All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include + +namespace { + +using namespace chip; + +void TestValidStrings(nlTestSuite * inSuite, void * inContext) +{ + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan())); // empty span ok + + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString(""))); + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("abc"))); + + // Various tests from https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html + + // Generic UTF8 + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("κόσμε"))); + + // First possible sequence of a certain length + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("€"))); + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("ࠀ"))); + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("𐀀"))); + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("�����"))); + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("������"))); + + // Last possible sequence of a certain length + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString(""))); + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("߿"))); + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("￿"))); + NL_TEST_ASSERT(inSuite, 
Utf8::IsValid(CharSpan::fromCharString("����"))); + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("�����"))); + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("������"))); + + // Other boundary conditions + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("퟿"))); + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString(""))); + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("�"))); + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("􏿿"))); + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("����"))); + + // NOTE: UTF8 allows embeded NULLs + // even though strings like that are probably not ideal for handling + // Test that we allow this, but consider later to disallow them + // completely if the spec is updated as such + { + char zero[16] = { 0 }; + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 0))); + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 1))); + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 2))); + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 3))); + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 4))); + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 16))); + } + + { + char insideZero[] = "test\0zero"; + NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(insideZero))); + } +} + +#define TEST_INVALID_BYTES(...) 
\ + { \ + uint8_t _buff[] = { __VA_ARGS__ }; \ + CharSpan _span(reinterpret_cast<const char *>(_buff), sizeof(_buff)); \ + NL_TEST_ASSERT(inSuite, !Utf8::IsValid(_span)); \ + } \ + (void) 0 + +void TestInvalidStrings(nlTestSuite * inSuite, void * inContext) +{ + // Second byte outside the range allowed after its lead byte: overlong (A, C) or UTF-16 surrogate (B) + TEST_INVALID_BYTES(0xe0, 0b1001'1111, 0x80); // A + TEST_INVALID_BYTES(0xed, 0b1011'0000, 0x80); // B + TEST_INVALID_BYTES(0xf0, 0b1000'1111, 0x80); // C + + // Beyond the last code point (U+10FFFF) + TEST_INVALID_BYTES(0xf4, 0x90, 0x80, 0x80); // D + TEST_INVALID_BYTES(0xf4, 0x91, 0x82, 0x83); + TEST_INVALID_BYTES(0xf5, 0x81, 0x82, 0x83); + + // Missing continuation + TEST_INVALID_BYTES(0xC2); + TEST_INVALID_BYTES(0xE0); + TEST_INVALID_BYTES(0xE1); + TEST_INVALID_BYTES(0xE1, 0x9F); + TEST_INVALID_BYTES(0xED, 0x9F); + TEST_INVALID_BYTES(0xEE, 0x9F); + TEST_INVALID_BYTES(0xF0); + TEST_INVALID_BYTES(0xF0, 0x9F); + TEST_INVALID_BYTES(0xF0, 0x9F, 0x9F); + TEST_INVALID_BYTES(0xF1); + TEST_INVALID_BYTES(0xF1, 0x9F); + TEST_INVALID_BYTES(0xF1, 0x9F, 0x9F); + TEST_INVALID_BYTES(0xF4); + TEST_INVALID_BYTES(0xF4, 0x8F); + TEST_INVALID_BYTES(0xF4, 0x8F, 0x9F); + + // More tests from https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html + TEST_INVALID_BYTES(0x80); // First continuation byte + TEST_INVALID_BYTES(0xBF); // Last continuation byte + + // Impossible bytes + TEST_INVALID_BYTES(0xFE); + TEST_INVALID_BYTES(0xFF); + TEST_INVALID_BYTES(0xFE, 0xFE, 0xFF, 0xFF); + + // Overlong sequences + // 4.1 Examples of an overlong ASCII character (in w3c tests) + TEST_INVALID_BYTES(0xc0, 0xaf); + TEST_INVALID_BYTES(0xe0, 0x80, 0xaf); + TEST_INVALID_BYTES(0xf0, 0x80, 0x80, 0xaf); + TEST_INVALID_BYTES(0xf8, 0x80, 0x80, 0x80, 0xaf); + TEST_INVALID_BYTES(0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf); + // 4.2 Maximum overlong sequences (in w3c tests) + TEST_INVALID_BYTES(0xc1, 0xbf); + TEST_INVALID_BYTES(0xe0, 0x9f, 0xbf); + TEST_INVALID_BYTES(0xf0, 0x8f, 0xbf, 0xbf); + TEST_INVALID_BYTES(0xf8, 0x87, 0xbf, 0xbf, 0xbf); +
TEST_INVALID_BYTES(0xfc, 0x83, 0xbf, 0xbf, 0xbf, 0xbf); + // 4.3 Overlong representation of the NUL character (in w3c tests) + TEST_INVALID_BYTES(0xc0, 0x80); + TEST_INVALID_BYTES(0xe0, 0x80, 0x80); + TEST_INVALID_BYTES(0xf0, 0x80, 0x80, 0x80); + TEST_INVALID_BYTES(0xf8, 0x80, 0x80, 0x80, 0x80); + TEST_INVALID_BYTES(0xfc, 0x80, 0x80, 0x80, 0x80, 0x80); +} + +} // namespace + +// clang-format off +const nlTest sTests[] = +{ + NL_TEST_DEF("TestValidStrings", TestValidStrings), + NL_TEST_DEF("TestInvalidStrings", TestInvalidStrings), + NL_TEST_SENTINEL() +}; +// clang-format on + +int TestUtf8() +{ + nlTestSuite theSuite = { "CHIP UTF8 tests", &sTests[0], nullptr, nullptr }; + nlTestRunner(&theSuite, nullptr); + return nlTestRunnerStats(&theSuite); +} + +CHIP_REGISTER_TEST_SUITE(TestUtf8); diff --git a/src/lib/support/utf8.cpp b/src/lib/support/utf8.cpp new file mode 100644 index 00000000000000..d1511de08ede19 --- /dev/null +++ b/src/lib/support/utf8.cpp @@ -0,0 +1,194 @@ +/* + * + * Copyright (c) 2023 Project CHIP Authors + * All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "utf8.h" + +namespace chip { +namespace Utf8 { + +namespace { +/** + State machine for UTF8 valid bytes + +Table 3-7. 
Well-Formed UTF-8 Byte Sequences + +Code Points | First B | Second B | Third B | Fourth B +------------------+----------+------------+---------+--------- +U+0000..U+007F | 00..7F | | | +U+0080..U+07FF | C2..DF | 80..BF | | +U+0800..U+0FFF | E0 | A0..BF (A) | 80..BF | +U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | +U+D000..U+D7FF | ED | 80..9F (B) | 80..BF | +U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | +U+10000..U+3FFFF | F0 | 90..BF (C) | 80..BF | 80..BF +U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF +U+100000..U+10FFFF| F4 | 80..8F (D) | 80..BF | 80..BF +*/ + +enum class ParserState +{ + kFirstByte, + kSecondByte_A, + kSecondByte_B, + kSecondByte_C, + kSecondByte_D, + kExtraOneByte, // expects 1 more continuation byte (0x80 .. 0xBF) + kExtraTwoBytes, // expects 2 more continuation bytes (0x80 .. 0xBF) + kExtraThreeBytes, // expects 3 more continuation bytes (0x80 .. 0xBF) + // + kInvalid, // malformed input; NextState never leaves this state +}; + +ParserState NextState(ParserState state, uint8_t value) +{ + switch (state) + { + case ParserState::kFirstByte: + if (value <= 0x7F) + { + return ParserState::kFirstByte; + } + else if ((value >= 0xC2) && (value <= 0xDF)) + { + return ParserState::kExtraOneByte; + } + else if (value == 0xE0) + { + return ParserState::kSecondByte_A; + } + else if ((value >= 0xE1) && (value <= 0xEC)) + { + return ParserState::kExtraTwoBytes; + } + else if (value == 0xED) + { + return ParserState::kSecondByte_B; + } + else if ((value >= 0xEE) && (value <= 0xEF)) + { + return ParserState::kExtraTwoBytes; + } + else if (value == 0xF0) + { + return ParserState::kSecondByte_C; + } + else if ((value >= 0xF1) && (value <= 0xF3)) + { + return ParserState::kExtraThreeBytes; + } + else if (value == 0xF4) + { + return ParserState::kSecondByte_D; + } + else + { + return ParserState::kInvalid; + } + case ParserState::kSecondByte_A: + if (value >= 0xA0 && value <= 0xBF) + { + return ParserState::kExtraOneByte; + } + else + { + return ParserState::kInvalid; + } + case ParserState::kSecondByte_B: + if (value >= 0x80 && value <= 0x9F) + { + return
ParserState::kExtraOneByte; + } + else + { + return ParserState::kInvalid; + } + case ParserState::kSecondByte_C: + if (value >= 0x90 && value <= 0xBF) + { + return ParserState::kExtraTwoBytes; + } + else + { + return ParserState::kInvalid; + } + case ParserState::kSecondByte_D: + if (value >= 0x80 && value <= 0x8F) + { + return ParserState::kExtraTwoBytes; + } + else + { + return ParserState::kInvalid; + } + case ParserState::kExtraOneByte: + if (value >= 0x80 && value <= 0xBF) + { + return ParserState::kFirstByte; + } + else + { + return ParserState::kInvalid; + } + case ParserState::kExtraTwoBytes: + if (value >= 0x80 && value <= 0xBF) + { + return ParserState::kExtraOneByte; + } + else + { + return ParserState::kInvalid; + } + case ParserState::kExtraThreeBytes: + if (value >= 0x80 && value <= 0xBF) + { + return ParserState::kExtraTwoBytes; + } + else + { + return ParserState::kInvalid; + } + default: + return ParserState::kInvalid; + } +} + +} // namespace + +bool IsValid(CharSpan span) +{ + ParserState state = ParserState::kFirstByte; + + const char * data = span.data(); + const size_t kLength = span.size(); + + // Run each byte through the state machine; stop at the first invalid transition + for (size_t i = 0; i < kLength; i++) + { + state = NextState(state, static_cast<uint8_t>(data[i])); + + if (state == ParserState::kInvalid) + { + return false; + } + } + + // Well-formed input ends on a code point boundary (no continuation bytes pending) + return state == ParserState::kFirstByte; +} + +} // namespace Utf8 +} // namespace chip diff --git a/src/lib/support/utf8.h b/src/lib/support/utf8.h new file mode 100644 index 00000000000000..71a2bb6612e4fd --- /dev/null +++ b/src/lib/support/utf8.h @@ -0,0 +1,47 @@ +/* + * + * Copyright (c) 2023 Project CHIP Authors + * All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace chip { +namespace Utf8 { + +/** + * Return true if the bytes in `span` form a well-formed UTF-8 string (an empty span is valid) + * + * UTF-8 encoding described at + * https://www.unicode.org/versions/Unicode12.0.0/UnicodeStandard-12.0.pdf + * but TLDR is: + * + * | Scalar Value | First B | Second B | Third B | Fourth B | + * +----------------------------+----------+----------+----------+----------+ + * | 00000000 0xxxxxxx | 0xxxxxxx | | | | + * | 00000yyy yyxxxxxx | 110yyyyy | 10xxxxxx | | | + * | zzzzyyyy yyxxxxxx | 1110zzzz | 10yyyyyy | 10xxxxxx | | + * | 000uuuuu zzzzyyyy yyxxxxxx | 11110uuu | 10uuzzzz | 10yyyyyy | 10xxxxxx | + * +----------------------------+----------+----------+----------+----------+ + * + * Furthermore, every code point must use its shortest encoding, so + * 0xC0 is not a valid first byte (1100_0000 10xxxxxx could have been encoded + * as one byte). + */ +bool IsValid(CharSpan span); + +} // namespace Utf8 +} // namespace chip