-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Define a
IsValidUtf8
equivalent with unit tests (#30386)
* Add a completely untested (but compilable) utf8 tester * Start adding unit tests * More test cases * tests * More tests * Fix typo * Slightly better formatting on test * Add comment about embedded zeroes * Restyle * Allow embedded zeroes (even if it pains me to do so) * Update copyright year and added 2 more tests * Restyle * Some comments and added another set of tests * Use fromCharString * Typo fixes * Fix dates for copyrights * Restyle
- Loading branch information
Showing
5 changed files
with
414 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
|
||
/* | ||
* | ||
* Copyright (c) 2023 Project CHIP Authors | ||
* All rights reserved. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <functional> | ||
|
||
#include <lib/support/UnitTestRegistration.h> | ||
#include <lib/support/utf8.h> | ||
|
||
#include <nlunit-test.h> | ||
|
||
namespace { | ||
|
||
using namespace chip; | ||
|
||
// Checks that Utf8::IsValid accepts well-formed input: empty spans, plain
// ASCII, multi-byte text, code points at encoding-length boundaries and
// buffers that contain embedded NUL bytes.
void TestValidStrings(nlTestSuite * inSuite, void * inContext)
{
    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan())); // empty span ok

    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("")));
    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("abc")));

    // Various tests from https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html
    //
    // Code points that are invisible or unassigned are written as explicit
    // byte escapes so that editors, terminals and review tools cannot silently
    // drop or alter them.

    // Generic UTF8
    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("κόσμε")));

    // First possible sequence of a certain length
    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("\xC2\x80"))); // U+0080, first 2-byte sequence
    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("ࠀ")));        // U+0800, first 3-byte sequence
    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("𐀀")));        // U+10000, first 4-byte sequence
    // NOTE: the literals below are runs of U+FFFD (REPLACEMENT CHARACTER), which
    // are well-formed on their own. They are NOT the 5 and 6 byte sequences of
    // the referenced page; those forms are rejected (see the 0xf8/0xfc cases in
    // TestInvalidStrings).
    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("�����")));
    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("������")));

    // Last possible sequence of a certain length
    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("\x7F")));         // U+007F, last 1-byte sequence
    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("߿")));            // U+07FF, last 2-byte sequence
    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("\xEF\xBF\xBF"))); // U+FFFF, last 3-byte sequence
    // Runs of U+FFFD again (see the note above).
    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("����")));
    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("�����")));
    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("������")));

    // Other boundary conditions
    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("\xED\x9F\xBF")));     // U+D7FF, last before surrogates
    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("\xEE\x80\x80")));     // U+E000, first after surrogates
    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("�")));                // U+FFFD
    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("\xF4\x8F\xBF\xBF"))); // U+10FFFF, last code point
    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("����")));             // run of U+FFFD

    // NOTE: UTF8 allows embedded NULs
    //       even though strings like that are probably not ideal for handling.
    //       Test that we allow this, but consider later to disallow them
    //       completely if the spec is updated as such.
    {
        char zero[16] = { 0 };
        NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 0)));
        NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 1)));
        NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 2)));
        NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 3)));
        NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 4)));
        NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 16)));
    }

    {
        // NOTE(review): presumably the array constructor spans the whole
        // buffer, i.e. the embedded NUL and the terminator too; confirm
        // against the Span implementation.
        char insideZero[] = "test\0zero";
        NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(insideZero)));
    }
}
|
||
// Asserts that the given list of byte values is rejected by Utf8::IsValid.
//
// Requires `inSuite` to be in scope at the expansion site. The body is wrapped
// in do/while(0) so that the expansion is exactly one statement: it needs the
// trailing semicolon and stays correct inside an unbraced if/else.
#define TEST_INVALID_BYTES(...)                                                                                                    \
    do                                                                                                                             \
    {                                                                                                                              \
        uint8_t _buff[] = { __VA_ARGS__ };                                                                                         \
        CharSpan _span(reinterpret_cast<const char *>(_buff), sizeof(_buff));                                                      \
        NL_TEST_ASSERT(inSuite, !Utf8::IsValid(_span));                                                                            \
    } while (0)
|
||
// Checks that Utf8::IsValid rejects malformed input: restricted second bytes,
// code points beyond U+10FFFF, UTF-16 surrogates, truncated sequences, stray
// continuation bytes, impossible bytes and overlong encodings.
void TestInvalidStrings(nlTestSuite * inSuite, void * inContext)
{
    // Second byte outside the range permitted after a restricted lead byte.
    // The letters match rows (A)..(D) of the well-formed byte sequence table
    // documented with the Utf8 implementation.
    TEST_INVALID_BYTES(0xe0, 0b1001'1111, 0x80); // A: overlong 3-byte form
    TEST_INVALID_BYTES(0xed, 0b1011'0000, 0x80); // B: UTF-16 surrogate (U+DC00)
    TEST_INVALID_BYTES(0xf0, 0b1000'1111, 0x80); // C: overlong 4-byte form

    // Outside codepoint
    TEST_INVALID_BYTES(0xf4, 0x90, 0x80, 0x80); // D: U+110000, first value past U+10FFFF
    TEST_INVALID_BYTES(0xf4, 0x91, 0x82, 0x83);
    TEST_INVALID_BYTES(0xf5, 0x81, 0x82, 0x83);

    // UTF-16 surrogates: both ends of the U+D800..U+DFFF range
    TEST_INVALID_BYTES(0xed, 0xa0, 0x80); // U+D800
    TEST_INVALID_BYTES(0xed, 0xbf, 0xbf); // U+DFFF

    // Missing continuation
    TEST_INVALID_BYTES(0xC2);
    TEST_INVALID_BYTES(0xE0);
    TEST_INVALID_BYTES(0xE1);
    TEST_INVALID_BYTES(0xE1, 0x9F);
    TEST_INVALID_BYTES(0xED, 0x9F);
    TEST_INVALID_BYTES(0xEE, 0x9F);
    TEST_INVALID_BYTES(0xF0);
    TEST_INVALID_BYTES(0xF0, 0x9F);
    TEST_INVALID_BYTES(0xF0, 0x9F, 0x9F);
    TEST_INVALID_BYTES(0xF1);
    TEST_INVALID_BYTES(0xF1, 0x9F);
    TEST_INVALID_BYTES(0xF1, 0x9F, 0x9F);
    TEST_INVALID_BYTES(0xF4);
    TEST_INVALID_BYTES(0xF4, 0x9F);
    TEST_INVALID_BYTES(0xF4, 0x9F, 0x9F);

    // More tests from https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html
    TEST_INVALID_BYTES(0x80); // First continuation byte
    TEST_INVALID_BYTES(0xBF); // Last continuation byte

    // Impossible bytes
    TEST_INVALID_BYTES(0xFE);
    TEST_INVALID_BYTES(0xFF);
    TEST_INVALID_BYTES(0xFE, 0xFE, 0xFF, 0xFF);

    // Overlong sequences
    // 4.1 Examples of an overlong ASCII character (in w3c tests)
    TEST_INVALID_BYTES(0xc0, 0xaf);
    TEST_INVALID_BYTES(0xe0, 0x80, 0xaf);
    TEST_INVALID_BYTES(0xf0, 0x80, 0x80, 0xaf);
    TEST_INVALID_BYTES(0xf8, 0x80, 0x80, 0x80, 0xaf);
    TEST_INVALID_BYTES(0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf);
    // 4.2 Maximum overlong sequences (in w3c tests)
    TEST_INVALID_BYTES(0xc1, 0xbf);
    TEST_INVALID_BYTES(0xe0, 0x9f, 0xbf);
    TEST_INVALID_BYTES(0xf0, 0x8f, 0xbf, 0xbf);
    TEST_INVALID_BYTES(0xf8, 0x87, 0xbf, 0xbf, 0xbf);
    TEST_INVALID_BYTES(0xfc, 0x83, 0xbf, 0xbf, 0xbf, 0xbf);
    // 4.3 Overlong representation of the NUL character (in w3c tests)
    TEST_INVALID_BYTES(0xc0, 0x80);
    TEST_INVALID_BYTES(0xe0, 0x80, 0x80);
    TEST_INVALID_BYTES(0xf0, 0x80, 0x80, 0x80);
    TEST_INVALID_BYTES(0xf8, 0x80, 0x80, 0x80, 0x80);
    TEST_INVALID_BYTES(0xfc, 0x80, 0x80, 0x80, 0x80, 0x80);
}
|
||
} // namespace | ||
|
||
// clang-format off | ||
// Test cases handed to the nlunit-test suite; the list ends with the sentinel entry.
const nlTest sTests[] =
{
    NL_TEST_DEF("TestValidStrings", TestValidStrings),
    NL_TEST_DEF("TestInvalidStrings", TestInvalidStrings),
    NL_TEST_SENTINEL()
};
// clang-format on | ||
|
||
int TestUtf8() | ||
{ | ||
nlTestSuite theSuite = { "CHIP UTF8 tests", &sTests[0], nullptr, nullptr }; | ||
nlTestRunner(&theSuite, nullptr); | ||
return nlTestRunnerStats(&theSuite); | ||
} | ||
|
||
CHIP_REGISTER_TEST_SUITE(TestUtf8); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,194 @@ | ||
/* | ||
* | ||
* Copyright (c) 2023 Project CHIP Authors | ||
* All rights reserved. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
#include "utf8.h" | ||
|
||
namespace chip { | ||
namespace Utf8 { | ||
|
||
namespace { | ||
/**
   State machine for UTF8 valid bytes

   Table 3-7. Well-Formed UTF-8 Byte Sequences (The Unicode Standard, chapter 3)

   Code Points        | First B  | Second B   | Third B | Fourth B
   -------------------+----------+------------+---------+---------
   U+0000..U+007F     | 00..7F   |            |         |
   U+0080..U+07FF     | C2..DF   | 80..BF     |         |
   U+0800..U+0FFF     | E0       | A0..BF (A) | 80..BF  |
   U+1000..U+CFFF     | E1..EC   | 80..BF     | 80..BF  |
   U+D000..U+D7FF     | ED       | 80..9F (B) | 80..BF  |
   U+E000..U+FFFF     | EE..EF   | 80..BF     | 80..BF  |
   U+10000..U+3FFFF   | F0       | 90..BF (C) | 80..BF  | 80..BF
   U+40000..U+FFFFF   | F1..F3   | 80..BF     | 80..BF  | 80..BF
   U+100000..U+10FFFF | F4       | 80..8F (D) | 80..BF  | 80..BF
*/
|
||
// States of the UTF-8 validation state machine. Each state describes what the
// parser requires of the NEXT byte it is given.
enum class ParserState
{
    kFirstByte,       // start of a code point; any previous sequence is complete
    kSecondByte_A,    // lead byte was 0xE0: next must be 0xA0 .. 0xBF
    kSecondByte_B,    // lead byte was 0xED: next must be 0x80 .. 0x9F
    kSecondByte_C,    // lead byte was 0xF0: next must be 0x90 .. 0xBF
    kSecondByte_D,    // lead byte was 0xF4: next must be 0x80 .. 0x8F
    kExtraOneByte,    // 0x80 .. 0xBF once
    kExtraTwoBytes,   // 0x80 .. 0xBF twice
    kExtraThreeBytes, // 0x80 .. 0xBF three times
    //
    kInvalid, // some error
};

// Advances the state machine by one input byte.
//
// @param state  state reached after the bytes consumed so far
// @param value  next input byte
// @return the state after consuming `value`. ParserState::kInvalid is returned
//         when `value` is not acceptable in `state`, and it is sticky: once
//         reached, every further byte yields kInvalid again.
ParserState NextState(ParserState state, uint8_t value)
{
    // True when `value` lies within the inclusive range [low, high].
    const auto inRange = [value](uint8_t low, uint8_t high) { return (value >= low) && (value <= high); };
    // Moves on to `next` when the byte was acceptable, otherwise flags an error.
    const auto advanceIf = [](bool accepted, ParserState next) { return accepted ? next : ParserState::kInvalid; };

    switch (state)
    {
    case ParserState::kFirstByte:
        if (value <= 0x7F)
        {
            return ParserState::kFirstByte; // single-byte (ASCII) code point
        }
        if (inRange(0xC2, 0xDF))
        {
            return ParserState::kExtraOneByte;
        }
        if (value == 0xE0)
        {
            return ParserState::kSecondByte_A;
        }
        if (inRange(0xE1, 0xEC))
        {
            return ParserState::kExtraTwoBytes;
        }
        if (value == 0xED)
        {
            return ParserState::kSecondByte_B;
        }
        if (inRange(0xEE, 0xEF))
        {
            return ParserState::kExtraTwoBytes;
        }
        if (value == 0xF0)
        {
            return ParserState::kSecondByte_C;
        }
        if (inRange(0xF1, 0xF3))
        {
            return ParserState::kExtraThreeBytes;
        }
        if (value == 0xF4)
        {
            return ParserState::kSecondByte_D;
        }
        // 0x80..0xC1 and 0xF5..0xFF can never start a sequence.
        return ParserState::kInvalid;

    // Restricted second bytes: a narrower range than a plain continuation byte.
    case ParserState::kSecondByte_A:
        return advanceIf(inRange(0xA0, 0xBF), ParserState::kExtraOneByte);
    case ParserState::kSecondByte_B:
        return advanceIf(inRange(0x80, 0x9F), ParserState::kExtraOneByte);
    case ParserState::kSecondByte_C:
        return advanceIf(inRange(0x90, 0xBF), ParserState::kExtraTwoBytes);
    case ParserState::kSecondByte_D:
        return advanceIf(inRange(0x80, 0x8F), ParserState::kExtraTwoBytes);

    // Plain continuation bytes: each one consumed leaves one fewer to expect.
    case ParserState::kExtraOneByte:
        return advanceIf(inRange(0x80, 0xBF), ParserState::kFirstByte);
    case ParserState::kExtraTwoBytes:
        return advanceIf(inRange(0x80, 0xBF), ParserState::kExtraOneByte);
    case ParserState::kExtraThreeBytes:
        return advanceIf(inRange(0x80, 0xBF), ParserState::kExtraTwoBytes);

    case ParserState::kInvalid:
    default:
        return ParserState::kInvalid;
    }
}
|
||
} // namespace | ||
|
||
// Reports whether the bytes referenced by `span` form well-formed UTF-8.
//
// The bytes are fed one at a time through the parser state machine, and the
// scan stops at the first byte the machine rejects.
//
// @param span  bytes to validate; only data() and size() are used
// @return true when every byte was accepted AND the input does not end in the
//         middle of a multi-byte sequence. An empty span is valid, and NUL
//         bytes are accepted like any other value in 0x00 .. 0x7F.
bool IsValid(CharSpan span)
{
    const char * const bytes = span.data();
    const size_t length      = span.size();

    ParserState state = ParserState::kFirstByte;

    // Every byte should be valid
    for (size_t i = 0; i < length; i++)
    {
        state = NextState(state, static_cast<uint8_t>(bytes[i]));

        if (state == ParserState::kInvalid)
        {
            return false;
        }
    }

    // finally no continuation should be expected
    return state == ParserState::kFirstByte;
}
|
||
} // namespace Utf8 | ||
} // namespace chip |
Oops, something went wrong.