Define an IsValidUtf8 equivalent with unit tests (#30386)

* Add a completely untested (but compilable) utf8 tester * Start adding unit tests * More test cases * tests * More tests * Fix typo * Slightly better formatting on test * Add comment about embedded zeroes * Restyle * Allow embedded zeroes (even if it pains me to do so) * Update copyright year and added 2 more tests * Restyle * Some comments and added another set of tests * Use fromCharString * Typo fixes * Fix dates for copyrights * Restyle
project-chip · Nov 9, 2023 · b0f5983 · b0f5983
1 parent 491354a
commit b0f5983
Show file tree

Hide file tree

Showing 5 changed files with 414 additions and 0 deletions.
diff --git a/src/lib/support/BUILD.gn b/src/lib/support/BUILD.gn
@@ -214,6 +214,8 @@ static_library("support") {
     "logging/BinaryLogging.cpp",
     "logging/BinaryLogging.h",
     "logging/CHIPLogging.h",
+    "utf8.cpp",
+    "utf8.h",
     "verhoeff/Verhoeff.cpp",
     "verhoeff/Verhoeff.h",
     "verhoeff/Verhoeff10.cpp",

diff --git a/src/lib/support/tests/BUILD.gn b/src/lib/support/tests/BUILD.gn
@@ -55,6 +55,7 @@ chip_test_suite_using_nltest("tests") {
     "TestTimeUtils.cpp",
     "TestTlvJson.cpp",
     "TestTlvToJson.cpp",
+    "TestUtf8.cpp",
     "TestVariant.cpp",
     "TestZclString.cpp",
   ]

diff --git a/src/lib/support/tests/TestUtf8.cpp b/src/lib/support/tests/TestUtf8.cpp
@@ -0,0 +1,170 @@
+
+/*
+ *
+ *    Copyright (c) 2023 Project CHIP Authors
+ *    All rights reserved.
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+#include <functional>
+
+#include <lib/support/UnitTestRegistration.h>
+#include <lib/support/utf8.h>
+
+#include <nlunit-test.h>
+
+namespace {
+
+using namespace chip;
+
+// Verifies that Utf8::IsValid accepts well-formed input.
+//
+// Every non-ASCII input is written as explicit byte escapes, so the test does
+// not depend on the encoding of this source file or on tools preserving
+// unprintable characters.
+void TestValidStrings(nlTestSuite * inSuite, void * inContext)
+{
+    // U+FFFD REPLACEMENT CHARACTER repeated 4, 5 and 6 times. The W3C page lists
+    // byte sequences for code points above U+10FFFF (4, 5 and 6 bytes long);
+    // those are not valid UTF-8, and the literals used for these cases are runs
+    // of U+FFFD, one per byte of the listed sequence.
+    const char kReplacementX4[] = "\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD";
+    const char kReplacementX5[] = "\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD";
+    const char kReplacementX6[] = "\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD";
+
+    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan())); // empty span ok
+
+    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("")));
+    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("abc")));
+
+    // Various tests from https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html
+
+    // Generic UTF8: a Greek word mixing 2-byte and 3-byte sequences
+    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5")));
+
+    // First possible sequence of a certain length
+    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("\xC2\x80")));         // 2 bytes: U+0080
+    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("\xE0\xA0\x80")));     // 3 bytes: U+0800
+    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("\xF0\x90\x80\x80"))); // 4 bytes: U+10000
+    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString(kReplacementX5)));     // 5 bytes: U+FFFD x 5
+    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString(kReplacementX6)));     // 6 bytes: U+FFFD x 6
+
+    // Last possible sequence of a certain length
+    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("\x7F")));         // 1 byte:  U+007F
+    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("\xDF\xBF")));     // 2 bytes: U+07FF
+    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("\xEF\xBF\xBF"))); // 3 bytes: U+FFFF
+    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString(kReplacementX4))); // 4 bytes: U+FFFD x 4
+    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString(kReplacementX5))); // 5 bytes: U+FFFD x 5
+    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString(kReplacementX6))); // 6 bytes: U+FFFD x 6
+
+    // Other boundary conditions
+    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("\xED\x9F\xBF")));     // U+D7FF, last before surrogates
+    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("\xEE\x80\x80")));     // U+E000, first after surrogates
+    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("\xEF\xBF\xBD")));     // U+FFFD
+    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("\xF4\x8F\xBF\xBF"))); // U+10FFFF, last code point
+    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString(kReplacementX4)));     // U+110000 entry: U+FFFD x 4
+
+    // NOTE: UTF8 allows embedded NULs
+    //       even though strings like that are probably not ideal for handling.
+    //       Test that we allow this, but consider later to disallow them
+    //       completely if the spec is updated as such
+    {
+        char zero[16] = { 0 };
+        NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 0)));
+        NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 1)));
+        NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 2)));
+        NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 3)));
+        NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 4)));
+        NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 16)));
+    }
+
+    {
+        // The span is built from the whole array, so it covers the embedded NUL
+        // and the text after it rather than stopping at the first NUL.
+        char insideZero[] = "test\0zero";
+        NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(insideZero)));
+    }
+}
+
+#define TEST_INVALID_BYTES(...)                                                                                                    \
+    {                                                                                                                              \
+        uint8_t _buff[] = { __VA_ARGS__ };                                                                                         \
+        CharSpan _span(reinterpret_cast<const char *>(_buff), sizeof(_buff));                                                      \
+        NL_TEST_ASSERT(inSuite, !Utf8::IsValid(_span));                                                                            \
+    }                                                                                                                              \
+    (void) 0
+
+// Verifies that Utf8::IsValid rejects malformed byte sequences.
+//
+// The tags A..D refer to the restricted second-byte ranges in the table of
+// well-formed UTF-8 byte sequences documented in utf8.cpp.
+void TestInvalidStrings(nlTestSuite * inSuite, void * inContext)
+{
+    // Second byte just outside the restricted ranges A..C
+    TEST_INVALID_BYTES(0xe0, 0x9f, 0x80); // A: below A0, overlong encoding
+    TEST_INVALID_BYTES(0xed, 0xb0, 0x80); // B: above 9F, UTF-16 surrogate U+DC00
+    TEST_INVALID_BYTES(0xf0, 0x8f, 0x80); // C: below 90, overlong encoding
+
+    // Outside codepoint
+    TEST_INVALID_BYTES(0xf4, 0x90, 0x80, 0x80); // D: above 8F, U+110000
+    TEST_INVALID_BYTES(0xf4, 0x91, 0x82, 0x83);
+    TEST_INVALID_BYTES(0xf5, 0x81, 0x82, 0x83);
+
+    // UTF-16 surrogates U+D800..U+DFFF (both ends of the range)
+    TEST_INVALID_BYTES(0xed, 0xa0, 0x80); // U+D800
+    TEST_INVALID_BYTES(0xed, 0xbf, 0xbf); // U+DFFF
+
+    // Missing continuation
+    TEST_INVALID_BYTES(0xC2);
+    TEST_INVALID_BYTES(0xE0);
+    TEST_INVALID_BYTES(0xE1);
+    TEST_INVALID_BYTES(0xE1, 0x9F);
+    TEST_INVALID_BYTES(0xED, 0x9F);
+    TEST_INVALID_BYTES(0xEE, 0x9F);
+    TEST_INVALID_BYTES(0xF0);
+    TEST_INVALID_BYTES(0xF0, 0x9F);
+    TEST_INVALID_BYTES(0xF0, 0x9F, 0x9F);
+    TEST_INVALID_BYTES(0xF1);
+    TEST_INVALID_BYTES(0xF1, 0x9F);
+    TEST_INVALID_BYTES(0xF1, 0x9F, 0x9F);
+    TEST_INVALID_BYTES(0xF4);
+    TEST_INVALID_BYTES(0xF4, 0x9F);
+    TEST_INVALID_BYTES(0xF4, 0x9F, 0x9F);
+
+    // Lead byte followed by something that is not a continuation byte
+    TEST_INVALID_BYTES(0xC2, 0x41);
+    TEST_INVALID_BYTES(0xC2, 0xC2, 0x80);
+    TEST_INVALID_BYTES(0xE1, 0x80, 0x41);
+    TEST_INVALID_BYTES(0xF1, 0x80, 0x80, 0x41);
+
+    // Valid ASCII prefix followed by a malformed tail
+    TEST_INVALID_BYTES(0x41, 0x80);
+    TEST_INVALID_BYTES(0x41, 0xC2);
+
+    // More tests from  https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html
+    TEST_INVALID_BYTES(0x80); // First continuation byte
+    TEST_INVALID_BYTES(0xBF); // Last continuation byte
+
+    // Impossible bytes
+    TEST_INVALID_BYTES(0xFE);
+    TEST_INVALID_BYTES(0xFF);
+    TEST_INVALID_BYTES(0xFE, 0xFE, 0xFF, 0xFF);
+
+    // Overlong sequences
+    // 4.1  Examples of an overlong ASCII character (in w3c tests)
+    TEST_INVALID_BYTES(0xc0, 0xaf);
+    TEST_INVALID_BYTES(0xe0, 0x80, 0xaf);
+    TEST_INVALID_BYTES(0xf0, 0x80, 0x80, 0xaf);
+    TEST_INVALID_BYTES(0xf8, 0x80, 0x80, 0x80, 0xaf);
+    TEST_INVALID_BYTES(0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf);
+    // 4.2  Maximum overlong sequences (in w3c tests)
+    TEST_INVALID_BYTES(0xc1, 0xbf);
+    TEST_INVALID_BYTES(0xe0, 0x9f, 0xbf);
+    TEST_INVALID_BYTES(0xf0, 0x8f, 0xbf, 0xbf);
+    TEST_INVALID_BYTES(0xf8, 0x87, 0xbf, 0xbf, 0xbf);
+    TEST_INVALID_BYTES(0xfc, 0x83, 0xbf, 0xbf, 0xbf, 0xbf);
+    // 4.3  Overlong representation of the NUL character (in w3c tests)
+    TEST_INVALID_BYTES(0xc0, 0x80);
+    TEST_INVALID_BYTES(0xe0, 0x80, 0x80);
+    TEST_INVALID_BYTES(0xf0, 0x80, 0x80, 0x80);
+    TEST_INVALID_BYTES(0xf8, 0x80, 0x80, 0x80, 0x80);
+    TEST_INVALID_BYTES(0xfc, 0x80, 0x80, 0x80, 0x80, 0x80);
+}
+
+} // namespace
+
+// Table of the test cases in this file, terminated by NL_TEST_SENTINEL().
+// clang-format off
+const nlTest sTests[] = {
+    NL_TEST_DEF("TestValidStrings",   TestValidStrings),
+    NL_TEST_DEF("TestInvalidStrings", TestInvalidStrings),
+    NL_TEST_SENTINEL(),
+};
+// clang-format on
+
+// Suite entry point: runs every case listed in sTests and returns the value
+// reported by nlTestRunnerStats for the suite.
+int TestUtf8()
+{
+    nlTestSuite utf8Suite = { "CHIP UTF8 tests", sTests, nullptr, nullptr };
+
+    nlTestRunner(&utf8Suite, nullptr);
+
+    return nlTestRunnerStats(&utf8Suite);
+}
+
+CHIP_REGISTER_TEST_SUITE(TestUtf8);
diff --git a/src/lib/support/utf8.cpp b/src/lib/support/utf8.cpp
@@ -0,0 +1,194 @@
+/*
+ *
+ *    Copyright (c) 2023 Project CHIP Authors
+ *    All rights reserved.
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+#include "utf8.h"
+
+namespace chip {
+namespace Utf8 {
+
+namespace {
+/**
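+   State machine for validating UTF-8 byte sequences.
+
+   The states follow the table of well-formed sequences below. The markers
+   (A)..(D) flag the second-byte ranges that are narrower than 80..BF; they
+   are checked by the kSecondByte_A .. kSecondByte_D states.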
+
+Table 3-7. Well-Formed UTF-8 Byte Sequences
+
+Code Points       | First B  | Second B   | Third B | Fourth B
+------------------+----------+------------+---------+---------
+U+0000..U+007F    | 00..7F   |            |         |
+U+0080..U+07FF    | C2..DF   | 80..BF     |         |
+U+0800..U+0FFF    | E0       | A0..BF (A) | 80..BF  |
+U+1000..U+CFFF    | E1..EC   | 80..BF     | 80..BF  |
+U+D000..U+D7FF    | ED       | 80..9F (B) | 80..BF  |
+U+E000..U+FFFF    | EE..EF   | 80..BF     | 80..BF  |
+U+10000..U+3FFFF  | F0       | 90..BF (C) | 80..BF  | 80..BF
+U+40000..U+FFFFF  | F1..F3   | 80..BF     | 80..BF  | 80..BF
+U+100000..U+10FFFF| F4       | 80..8F (D) | 80..BF  | 80..BF
+*/
+
+enum class ParserState // what the validator expects from the next input byte
+{
+    kFirstByte,       // at a sequence boundary: next byte starts a new sequence (initial and accepting state)
+    kSecondByte_A,    // after lead byte E0: second byte must be A0..BF
+    kSecondByte_B,    // after lead byte ED: second byte must be 80..9F
+    kSecondByte_C,    // after lead byte F0: second byte must be 90..BF
+    kSecondByte_D,    // after lead byte F4: second byte must be 80..8F
+    kExtraOneByte,    // 0x80 .. 0xBF  once
+    kExtraTwoBytes,   // 0x80 .. 0xBF  twice
+    kExtraThreeBytes, // 0x80 .. 0xBF  three times
+    // Error state below: NextState never leaves it once entered
+    kInvalid, // some error
+};
+
+// Advances the validator by one input byte.
+//
+// Returns the state reached after consuming `value` while in `state`, or
+// ParserState::kInvalid when `value` may not appear at that position.
+// Once in kInvalid, every further byte yields kInvalid again.
+ParserState NextState(ParserState state, uint8_t value)
+{
+    // True when `value` lies in the inclusive range [low, high].
+    const auto within = [value](uint8_t low, uint8_t high) { return (low <= value) && (value <= high); };
+
+    // Moves on to `next` when the byte was acceptable, reports an error otherwise.
+    const auto advanceIf = [](bool accepted, ParserState next) { return accepted ? next : ParserState::kInvalid; };
+
+    switch (state)
+    {
+    case ParserState::kFirstByte:
+        // Single-byte (ASCII) code point: stay at a sequence boundary.
+        if (value < 0x80)
+        {
+            return ParserState::kFirstByte;
+        }
+        // Lead bytes whose second byte has a restricted range come first, so
+        // the generic ranges below never see them.
+        if (value == 0xE0)
+        {
+            return ParserState::kSecondByte_A;
+        }
+        if (value == 0xED)
+        {
+            return ParserState::kSecondByte_B;
+        }
+        if (value == 0xF0)
+        {
+            return ParserState::kSecondByte_C;
+        }
+        if (value == 0xF4)
+        {
+            return ParserState::kSecondByte_D;
+        }
+        // Remaining lead bytes: only plain continuation bytes follow.
+        if (within(0xC2, 0xDF))
+        {
+            return ParserState::kExtraOneByte;
+        }
+        if (within(0xE1, 0xEF)) // E1..EC and EE..EF; ED was handled above
+        {
+            return ParserState::kExtraTwoBytes;
+        }
+        if (within(0xF1, 0xF3))
+        {
+            return ParserState::kExtraThreeBytes;
+        }
+        // 80..C1 and F5..FF can never start a sequence.
+        return ParserState::kInvalid;
+
+    case ParserState::kSecondByte_A:
+        return advanceIf(within(0xA0, 0xBF), ParserState::kExtraOneByte);
+
+    case ParserState::kSecondByte_B:
+        return advanceIf(within(0x80, 0x9F), ParserState::kExtraOneByte);
+
+    case ParserState::kSecondByte_C:
+        return advanceIf(within(0x90, 0xBF), ParserState::kExtraTwoBytes);
+
+    case ParserState::kSecondByte_D:
+        return advanceIf(within(0x80, 0x8F), ParserState::kExtraTwoBytes);
+
+    case ParserState::kExtraOneByte:
+        return advanceIf(within(0x80, 0xBF), ParserState::kFirstByte);
+
+    case ParserState::kExtraTwoBytes:
+        return advanceIf(within(0x80, 0xBF), ParserState::kExtraOneByte);
+
+    case ParserState::kExtraThreeBytes:
+        return advanceIf(within(0x80, 0xBF), ParserState::kExtraTwoBytes);
+
+    default:
+        return ParserState::kInvalid;
+    }
+}
+
+} // namespace
+
+// Returns true when `span` consists solely of well-formed UTF-8 sequences and
+// does not end in the middle of one. An empty span is valid, and so are
+// embedded NUL bytes.
+bool IsValid(CharSpan span)
+{
+    ParserState current = ParserState::kFirstByte;
+    size_t remaining    = span.size();
+
+    // Feed the bytes one at a time; the data pointer is only touched while
+    // there is input left.
+    for (const char * cursor = span.data(); remaining > 0; ++cursor, --remaining)
+    {
+        current = NextState(current, static_cast<uint8_t>(*cursor));
+
+        if (current == ParserState::kInvalid)
+        {
+            return false; // malformed byte, no need to look further
+        }
+    }
+
+    // Ending anywhere but on a sequence boundary means a truncated sequence.
+    return current == ParserState::kFirstByte;
+}
+
+} // namespace Utf8
+} // namespace chip