From 8ece89c9a4bbf96a1125cdc47f8573d27cb022f1 Mon Sep 17 00:00:00 2001 From: Michael Niksa Date: Wed, 30 Oct 2019 13:36:42 -0700 Subject: [PATCH 1/4] Allow Mb2Wc to substitute U+FFFD (unicode replacement) for non-minimal forms of characters that get past our initial invalid-sequence screening. --- src/host/utf8ToWideCharParser.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/host/utf8ToWideCharParser.cpp b/src/host/utf8ToWideCharParser.cpp index 0e6143153bc..4ab12d845c1 100644 --- a/src/host/utf8ToWideCharParser.cpp +++ b/src/host/utf8ToWideCharParser.cpp @@ -377,8 +377,16 @@ unsigned int Utf8ToWideCharParser::_InvolvedParse(_In_reads_(cb) const byte* con _currentState = _State::AwaitingMoreBytes; return 0; } + + // By this point, all obviously invalid sequences have been removed. + // But non-minimal forms of sequences might still exist. + // MB2WC will fail non-minimal forms with MB_ERR_INVALID_CHARS at this point. + // So we call with flags = 0 such that non-minimal forms get the U+FFFD + // replacement character treatment. + // This issue and related concerns are fully captured in future work item GH#3378 + // for future cleanup and reconciliation. int bufferSize = MultiByteToWideChar(_currentCodePage, - MB_ERR_INVALID_CHARS, + 0, reinterpret_cast(validSequence.first.get()), validSequence.second, nullptr, From d2603079715303528df0f41cada9e62f3705041a Mon Sep 17 00:00:00 2001 From: Michael Niksa Date: Wed, 30 Oct 2019 16:34:12 -0700 Subject: [PATCH 2/4] Add test to ensure that non-minimal forms don't choke and come through as a replacement character (for now). --- .../ut_host/Utf8ToWideCharParserTests.cpp | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/src/host/ut_host/Utf8ToWideCharParserTests.cpp b/src/host/ut_host/Utf8ToWideCharParserTests.cpp index fbd28788be0..28d0227cf64 100644 --- a/src/host/ut_host/Utf8ToWideCharParserTests.cpp +++ b/src/host/ut_host/Utf8ToWideCharParserTests.cpp @@ -264,6 +264,54 @@ class Utf8ToWideCharParserTests } } + TEST_METHOD(NonMinimalFormTest) + { + Log::Comment(L"Testing that non-minimal forms of a character are tolerated don't stop the rest"); + + // clang-format off + + // Test data + const unsigned char data[] = { + 0x60, 0x12, 0x00, 0x7f, // single byte points + 0xc0, 0x80, // U+0000 as a 2-byte sequence (non-minimal) + 0x41, 0x48, 0x06, 0x55, // more single byte points + 0xe0, 0x80, 0x80, // U+0000 as a 3-byte sequence (non-minimal) + 0x18, 0x77, 0x40, 0x31, // more single byte points + 0xf0, 0x80, 0x80, 0x80, // U+0000 as a 4-byte sequence (non-minimal) + 0x59, 0x1f, 0x68, 0x20 // more single byte points + }; + + // Expected conversion + const wchar_t wideData[] = { + 0x0060, 0x0012, 0x0000, 0x007f, + 0xfffd, 0xfffd, + 0x0041, 0x0048, 0x0006, 0x0055, + 0xfffd, 0xfffd, + 0x0018, 0x0077, 0x0040, 0x0031, + 0xfffd, 0xfffd, 0xfffd, + 0x0059, 0x001f, 0x0068, 0x0020 + }; + + // clang-format on + + const unsigned int count = gsl::narrow_cast(ARRAYSIZE(data)); + const unsigned int wideCount = gsl::narrow_cast(ARRAYSIZE(wideData)); + unsigned int consumed = 0; + unsigned int generated = 0; + unique_ptr output{ nullptr }; + auto parser = Utf8ToWideCharParser{ utf8CodePage }; + + VERIFY_SUCCEEDED(parser.Parse(data, count, consumed, output, generated)); + VERIFY_ARE_EQUAL(count, consumed); + VERIFY_ARE_EQUAL(wideCount, generated); + VERIFY_IS_NOT_NULL(output.get()); + + for (int i = 0; i < wideCount; i++) + { + VERIFY_ARE_EQUAL(wideData[i], output.get()[i]); + } + } + TEST_METHOD(PartialBytesAreDroppedOnCodePageChangeTest) { Log::Comment(L"Testing that a saved partial sequence is cleared when the codepage changes"); From 28596240fbd4b359433561e22cfd515dacf5d074 Mon Sep 17 00:00:00 2001 From: Michael Niksa Date: Thu, 31 Oct 2019 09:32:02 -0700 Subject: [PATCH 3/4] PR feedback, use string comparison in tests, add some more commentary. --- src/host/ut_host/Utf8ToWideCharParserTests.cpp | 17 ++++++++--------- src/host/utf8ToWideCharParser.cpp | 1 + 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/host/ut_host/Utf8ToWideCharParserTests.cpp b/src/host/ut_host/Utf8ToWideCharParserTests.cpp index 28d0227cf64..af92685e0e5 100644 --- a/src/host/ut_host/Utf8ToWideCharParserTests.cpp +++ b/src/host/ut_host/Utf8ToWideCharParserTests.cpp @@ -272,7 +272,7 @@ class Utf8ToWideCharParserTests // Test data const unsigned char data[] = { - 0x60, 0x12, 0x00, 0x7f, // single byte points + 0x60, 0x12, 0x08, 0x7f, // single byte points 0xc0, 0x80, // U+0000 as a 2-byte sequence (non-minimal) 0x41, 0x48, 0x06, 0x55, // more single byte points 0xe0, 0x80, 0x80, // U+0000 as a 3-byte sequence (non-minimal) @@ -283,12 +283,12 @@ class Utf8ToWideCharParserTests // Expected conversion const wchar_t wideData[] = { - 0x0060, 0x0012, 0x0000, 0x007f, - 0xfffd, 0xfffd, + 0x0060, 0x0012, 0x0008, 0x007f, + 0xfffd, 0xfffd, // The number of replacements per invalid sequence is not intended to be load-bearing 0x0041, 0x0048, 0x0006, 0x0055, - 0xfffd, 0xfffd, + 0xfffd, 0xfffd, 0xfffd, // It is just representative of what it looked like when fixing this for GH#3380 0x0018, 0x0077, 0x0040, 0x0031, - 0xfffd, 0xfffd, 0xfffd, + 0xfffd, 0xfffd, 0xfffd, // Change if necessary when completing GH#3378 0x0059, 0x001f, 0x0068, 0x0020 }; @@ -306,10 +306,9 @@ class Utf8ToWideCharParserTests VERIFY_ARE_EQUAL(wideCount, generated); VERIFY_IS_NOT_NULL(output.get()); - for (int i = 0; i < wideCount; i++) - { - VERIFY_ARE_EQUAL(wideData[i], output.get()[i]); - } + const auto expected = WEX::Common::String(wideData, wideCount); + const auto actual = WEX::Common::String(output.get(), generated); + VERIFY_ARE_EQUAL(expected, actual); } TEST_METHOD(PartialBytesAreDroppedOnCodePageChangeTest) diff --git a/src/host/utf8ToWideCharParser.cpp b/src/host/utf8ToWideCharParser.cpp index 4ab12d845c1..fb3d76e7674 100644 --- a/src/host/utf8ToWideCharParser.cpp +++ b/src/host/utf8ToWideCharParser.cpp @@ -385,6 +385,7 @@ unsigned int Utf8ToWideCharParser::_InvolvedParse(_In_reads_(cb) const byte* con // replacement character treatment. // This issue and related concerns are fully captured in future work item GH#3378 // for future cleanup and reconciliation. + // The original issue introducing this was GH#3320. int bufferSize = MultiByteToWideChar(_currentCodePage, 0, reinterpret_cast(validSequence.first.get()), From 5ae708ee4aa80d766d0ffd1184f8104c1f97813c Mon Sep 17 00:00:00 2001 From: Michael Niksa Date: Thu, 31 Oct 2019 10:06:07 -0700 Subject: [PATCH 4/4] put the test back from how I intentionally broke it to check string comparison. oops --- src/host/ut_host/Utf8ToWideCharParserTests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/host/ut_host/Utf8ToWideCharParserTests.cpp b/src/host/ut_host/Utf8ToWideCharParserTests.cpp index af92685e0e5..a70265e2707 100644 --- a/src/host/ut_host/Utf8ToWideCharParserTests.cpp +++ b/src/host/ut_host/Utf8ToWideCharParserTests.cpp @@ -286,7 +286,7 @@ class Utf8ToWideCharParserTests 0x0060, 0x0012, 0x0008, 0x007f, 0xfffd, 0xfffd, // The number of replacements per invalid sequence is not intended to be load-bearing 0x0041, 0x0048, 0x0006, 0x0055, - 0xfffd, 0xfffd, 0xfffd, // It is just representative of what it looked like when fixing this for GH#3380 + 0xfffd, 0xfffd, // It is just representative of what it looked like when fixing this for GH#3380 0x0018, 0x0077, 0x0040, 0x0031, 0xfffd, 0xfffd, 0xfffd, // Change if necessary when completing GH#3378 0x0059, 0x001f, 0x0068, 0x0020