From 8ece89c9a4bbf96a1125cdc47f8573d27cb022f1 Mon Sep 17 00:00:00 2001
From: Michael Niksa <miniksa@microsoft.com>
Date: Wed, 30 Oct 2019 13:36:42 -0700
Subject: [PATCH 1/4] Allow Mb2Wc to substitute U+FFFD (unicode replacement)
 for non-minimal forms of characters that get past our initial
 invalid-sequence screening.

---
 src/host/utf8ToWideCharParser.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/src/host/utf8ToWideCharParser.cpp b/src/host/utf8ToWideCharParser.cpp
index 0e6143153bc..4ab12d845c1 100644
--- a/src/host/utf8ToWideCharParser.cpp
+++ b/src/host/utf8ToWideCharParser.cpp
@@ -377,8 +377,16 @@ unsigned int Utf8ToWideCharParser::_InvolvedParse(_In_reads_(cb) const byte* con
         _currentState = _State::AwaitingMoreBytes;
         return 0;
     }
+
+    // By this point, all obviously invalid sequences have been removed.
+    // But non-minimal forms of sequences might still exist.
+    // MB2WC will fail non-minimal forms with MB_ERR_INVALID_CHARS at this point.
+    // So we call with flags = 0 such that non-minimal forms get the U+FFFD
+    // replacement character treatment.
+    // This issue and related concerns are fully captured in future work item GH#3378
+    // for future cleanup and reconciliation.
     int bufferSize = MultiByteToWideChar(_currentCodePage,
-                                         MB_ERR_INVALID_CHARS,
+                                         0,
                                          reinterpret_cast<LPCCH>(validSequence.first.get()),
                                          validSequence.second,
                                          nullptr,

From d2603079715303528df0f41cada9e62f3705041a Mon Sep 17 00:00:00 2001
From: Michael Niksa <miniksa@microsoft.com>
Date: Wed, 30 Oct 2019 16:34:12 -0700
Subject: [PATCH 2/4] Add test to ensure that non-minimal forms don't choke and
 come through as a replacement character (for now).

---
 .../ut_host/Utf8ToWideCharParserTests.cpp     | 48 +++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/src/host/ut_host/Utf8ToWideCharParserTests.cpp b/src/host/ut_host/Utf8ToWideCharParserTests.cpp
index fbd28788be0..28d0227cf64 100644
--- a/src/host/ut_host/Utf8ToWideCharParserTests.cpp
+++ b/src/host/ut_host/Utf8ToWideCharParserTests.cpp
@@ -264,6 +264,54 @@ class Utf8ToWideCharParserTests
         }
     }
 
+    TEST_METHOD(NonMinimalFormTest)
+    {
+        Log::Comment(L"Testing that non-minimal forms of a character are tolerated don't stop the rest");
+
+        // clang-format off
+
+        // Test data
+        const unsigned char data[] = {
+            0x60, 0x12, 0x00, 0x7f, // single byte points
+            0xc0, 0x80, // U+0000 as a 2-byte sequence (non-minimal)
+            0x41, 0x48, 0x06, 0x55, // more single byte points
+            0xe0, 0x80, 0x80, // U+0000 as a 3-byte sequence (non-minimal)
+            0x18, 0x77, 0x40, 0x31, // more single byte points
+            0xf0, 0x80, 0x80, 0x80, // U+0000 as a 4-byte sequence (non-minimal)
+            0x59, 0x1f, 0x68, 0x20 // more single byte points
+        };
+
+        // Expected conversion
+        const wchar_t wideData[] = {
+            0x0060, 0x0012, 0x0000, 0x007f,
+            0xfffd, 0xfffd,
+            0x0041, 0x0048, 0x0006, 0x0055,
+            0xfffd, 0xfffd,
+            0x0018, 0x0077, 0x0040, 0x0031,
+            0xfffd, 0xfffd, 0xfffd,
+            0x0059, 0x001f, 0x0068, 0x0020
+        };
+
+        // clang-format on
+
+        const unsigned int count = gsl::narrow_cast<unsigned int>(ARRAYSIZE(data));
+        const unsigned int wideCount = gsl::narrow_cast<unsigned int>(ARRAYSIZE(wideData));
+        unsigned int consumed = 0;
+        unsigned int generated = 0;
+        unique_ptr<wchar_t[]> output{ nullptr };
+        auto parser = Utf8ToWideCharParser{ utf8CodePage };
+
+        VERIFY_SUCCEEDED(parser.Parse(data, count, consumed, output, generated));
+        VERIFY_ARE_EQUAL(count, consumed);
+        VERIFY_ARE_EQUAL(wideCount, generated);
+        VERIFY_IS_NOT_NULL(output.get());
+
+        for (int i = 0; i < wideCount; i++)
+        {
+            VERIFY_ARE_EQUAL(wideData[i], output.get()[i]);
+        }
+    }
+
     TEST_METHOD(PartialBytesAreDroppedOnCodePageChangeTest)
     {
         Log::Comment(L"Testing that a saved partial sequence is cleared when the codepage changes");

From 28596240fbd4b359433561e22cfd515dacf5d074 Mon Sep 17 00:00:00 2001
From: Michael Niksa <miniksa@microsoft.com>
Date: Thu, 31 Oct 2019 09:32:02 -0700
Subject: [PATCH 3/4] PR feedback, use string comparison in tests, add some
 more commentary.

---
 src/host/ut_host/Utf8ToWideCharParserTests.cpp | 17 ++++++++---------
 src/host/utf8ToWideCharParser.cpp              |  1 +
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/host/ut_host/Utf8ToWideCharParserTests.cpp b/src/host/ut_host/Utf8ToWideCharParserTests.cpp
index 28d0227cf64..af92685e0e5 100644
--- a/src/host/ut_host/Utf8ToWideCharParserTests.cpp
+++ b/src/host/ut_host/Utf8ToWideCharParserTests.cpp
@@ -272,7 +272,7 @@ class Utf8ToWideCharParserTests
 
         // Test data
         const unsigned char data[] = {
-            0x60, 0x12, 0x00, 0x7f, // single byte points
+            0x60, 0x12, 0x08, 0x7f, // single byte points
             0xc0, 0x80, // U+0000 as a 2-byte sequence (non-minimal)
             0x41, 0x48, 0x06, 0x55, // more single byte points
             0xe0, 0x80, 0x80, // U+0000 as a 3-byte sequence (non-minimal)
@@ -283,12 +283,12 @@ class Utf8ToWideCharParserTests
 
         // Expected conversion
         const wchar_t wideData[] = {
-            0x0060, 0x0012, 0x0000, 0x007f,
-            0xfffd, 0xfffd,
+            0x0060, 0x0012, 0x0008, 0x007f,
+            0xfffd, 0xfffd, // The number of replacements per invalid sequence is not intended to be load-bearing
             0x0041, 0x0048, 0x0006, 0x0055,
-            0xfffd, 0xfffd,
+            0xfffd, 0xfffd, 0xfffd, // It is just representative of what it looked like when fixing this for GH#3380
             0x0018, 0x0077, 0x0040, 0x0031,
-            0xfffd, 0xfffd, 0xfffd,
+            0xfffd, 0xfffd, 0xfffd, // Change if necessary when completing GH#3378
             0x0059, 0x001f, 0x0068, 0x0020
         };
 
@@ -306,10 +306,9 @@ class Utf8ToWideCharParserTests
         VERIFY_ARE_EQUAL(wideCount, generated);
         VERIFY_IS_NOT_NULL(output.get());
 
-        for (int i = 0; i < wideCount; i++)
-        {
-            VERIFY_ARE_EQUAL(wideData[i], output.get()[i]);
-        }
+        const auto expected = WEX::Common::String(wideData, wideCount);
+        const auto actual = WEX::Common::String(output.get(), generated);
+        VERIFY_ARE_EQUAL(expected, actual);
     }
 
     TEST_METHOD(PartialBytesAreDroppedOnCodePageChangeTest)
diff --git a/src/host/utf8ToWideCharParser.cpp b/src/host/utf8ToWideCharParser.cpp
index 4ab12d845c1..fb3d76e7674 100644
--- a/src/host/utf8ToWideCharParser.cpp
+++ b/src/host/utf8ToWideCharParser.cpp
@@ -385,6 +385,7 @@ unsigned int Utf8ToWideCharParser::_InvolvedParse(_In_reads_(cb) const byte* con
     // replacement character treatment.
     // This issue and related concerns are fully captured in future work item GH#3378
     // for future cleanup and reconciliation.
+    // The original issue introducing this was GH#3320.
     int bufferSize = MultiByteToWideChar(_currentCodePage,
                                          0,
                                          reinterpret_cast<LPCCH>(validSequence.first.get()),

From 5ae708ee4aa80d766d0ffd1184f8104c1f97813c Mon Sep 17 00:00:00 2001
From: Michael Niksa <miniksa@microsoft.com>
Date: Thu, 31 Oct 2019 10:06:07 -0700
Subject: [PATCH 4/4] put the test back from how I intentionally broke it to
 check string comparison. oops

---
 src/host/ut_host/Utf8ToWideCharParserTests.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/host/ut_host/Utf8ToWideCharParserTests.cpp b/src/host/ut_host/Utf8ToWideCharParserTests.cpp
index af92685e0e5..a70265e2707 100644
--- a/src/host/ut_host/Utf8ToWideCharParserTests.cpp
+++ b/src/host/ut_host/Utf8ToWideCharParserTests.cpp
@@ -286,7 +286,7 @@ class Utf8ToWideCharParserTests
             0x0060, 0x0012, 0x0008, 0x007f,
             0xfffd, 0xfffd, // The number of replacements per invalid sequence is not intended to be load-bearing
             0x0041, 0x0048, 0x0006, 0x0055,
-            0xfffd, 0xfffd, 0xfffd, // It is just representative of what it looked like when fixing this for GH#3380
+            0xfffd, 0xfffd, // It is just representative of what it looked like when fixing this for GH#3380
             0x0018, 0x0077, 0x0040, 0x0031,
             0xfffd, 0xfffd, 0xfffd, // Change if necessary when completing GH#3378
             0x0059, 0x001f, 0x0068, 0x0020