Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unify UTF-8 handling using til::u8u16 & revise WriteConsoleAImpl #4422

Merged
merged 4 commits into from
Feb 4, 2020
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 11 additions & 14 deletions src/host/VtInputThread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ VtInputThread::VtInputThread(_In_ wil::unique_hfile hPipe,
const bool inheritCursor) :
_hFile{ std::move(hPipe) },
_hThread{},
_utf8Parser{ CP_UTF8 },
_u8State{},
_dwThreadId{ 0 },
_exitRequested{ false },
_exitResult{ S_OK }
Expand All @@ -47,15 +47,14 @@ VtInputThread::VtInputThread(_In_ wil::unique_hfile hPipe,
}

// Method Description:
// - Processes a buffer of input characters. The characters should be utf-8
// encoded, and will get converted to wchar_t's to be processed by the
// - Processes a string of input characters. The characters should be UTF-8
// encoded, and will get converted to wstring to be processed by the
// input state machine.
// Arguments:
// - charBuffer - the UTF-8 characters recieved.
// - cch - number of UTF-8 characters in charBuffer
// - u8Str - the UTF-8 string received.
// Return Value:
// - S_OK on success, otherwise an appropriate failure.
[[nodiscard]] HRESULT VtInputThread::_HandleRunInput(_In_reads_(cch) const byte* const charBuffer, const int cch)
[[nodiscard]] HRESULT VtInputThread::_HandleRunInput(const std::string_view u8Str)
{
// Make sure to call the GLOBAL Lock/Unlock, not the gci's lock/unlock.
// Only the global unlock attempts to dispatch ctrl events. If you use the
Expand All @@ -67,16 +66,14 @@ VtInputThread::VtInputThread(_In_ wil::unique_hfile hPipe,

try
{
std::unique_ptr<wchar_t[]> pwsSequence;
unsigned int cchConsumed;
unsigned int cchSequence;
auto hr = _utf8Parser.Parse(charBuffer, cch, cchConsumed, pwsSequence, cchSequence);
std::wstring wstr{};
auto hr = til::u8u16(u8Str, wstr, _u8State);
// If we hit a parsing error, eat it. It's bad utf-8, we can't do anything with it.
if (FAILED(hr))
{
return S_FALSE;
}
_pInputStateMachine->ProcessString({ pwsSequence.get(), cchSequence });
_pInputStateMachine->ProcessString(wstr);
}
CATCH_RETURN();

Expand All @@ -100,12 +97,12 @@ DWORD WINAPI VtInputThread::StaticVtInputThreadProc(_In_ LPVOID lpParameter)
// failed, throw or log, depending on what the caller wants.
// Arguments:
// - throwOnFail: If true, throw an exception if there was an error processing
// the input recieved. Otherwise, log the error.
// the input received. Otherwise, log the error.
// Return Value:
// - <none>
void VtInputThread::DoReadInput(const bool throwOnFail)
{
byte buffer[256];
char buffer[256];
DWORD dwRead = 0;
bool fSuccess = !!ReadFile(_hFile.get(), buffer, ARRAYSIZE(buffer), &dwRead, nullptr);

Expand All @@ -120,7 +117,7 @@ void VtInputThread::DoReadInput(const bool throwOnFail)
return;
}

HRESULT hr = _HandleRunInput(buffer, dwRead);
HRESULT hr = _HandleRunInput({ buffer, gsl::narrow_cast<size_t>(dwRead) });
if (FAILED(hr))
{
if (throwOnFail)
Expand Down
5 changes: 2 additions & 3 deletions src/host/VtInputThread.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ Author(s):
#pragma once

#include "..\terminal\parser\StateMachine.hpp"
#include "utf8ToWideCharParser.hpp"

namespace Microsoft::Console
{
Expand All @@ -29,7 +28,7 @@ namespace Microsoft::Console
void DoReadInput(const bool throwOnFail);

private:
[[nodiscard]] HRESULT _HandleRunInput(_In_reads_(cch) const byte* const charBuffer, const int cch);
[[nodiscard]] HRESULT _HandleRunInput(const std::string_view u8Str);
DWORD _InputThread();

wil::unique_hfile _hFile;
Expand All @@ -40,6 +39,6 @@ namespace Microsoft::Console
HRESULT _exitResult;

std::unique_ptr<Microsoft::Console::VirtualTerminal::StateMachine> _pInputStateMachine;
Utf8ToWideCharParser _utf8Parser;
til::u8state _u8State;
};
}
42 changes: 15 additions & 27 deletions src/host/_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
#include "dbcs.h"
#include "handle.h"
#include "misc.h"
#include "utf8ToWidecharParser.hpp"

#include "../types/inc/convert.hpp"
#include "../types/inc/GlyphWidth.hpp"
Expand Down Expand Up @@ -491,9 +490,9 @@ constexpr unsigned int LOCAL_BUFFER_SIZE = 100;
CursorPosition = cursor.GetPosition();

// Make sure we don't write past the end of the buffer.
if (i > (ULONG)coordScreenBufferSize.X - CursorPosition.X)
if (i > gsl::narrow_cast<size_t>(coordScreenBufferSize.X) - CursorPosition.X)
{
i = (ULONG)coordScreenBufferSize.X - CursorPosition.X;
i = gsl::narrow_cast<size_t>(coordScreenBufferSize.X) - CursorPosition.X;
}

// line was wrapped if we're writing up to the end of the current row
Expand Down Expand Up @@ -683,7 +682,7 @@ constexpr unsigned int LOCAL_BUFFER_SIZE = 100;
if (CheckBisectProcessW(screenInfo,
pwchBufferBackupLimit,
pwchBuffer + 1 - pwchBufferBackupLimit,
coordScreenBufferSize.X - sOriginalXPosition,
gsl::narrow_cast<size_t>(coordScreenBufferSize.X) - sOriginalXPosition,
sOriginalXPosition,
dwFlags & WC_ECHO))
{
Expand All @@ -701,7 +700,7 @@ constexpr unsigned int LOCAL_BUFFER_SIZE = 100;
}
case UNICODE_TAB:
{
const size_t TabSize = NUMBER_OF_SPACES_IN_TAB(cursor.GetPosition().X);
const size_t TabSize = gsl::narrow_cast<size_t>(NUMBER_OF_SPACES_IN_TAB(cursor.GetPosition().X));
CursorPosition.X = (SHORT)(cursor.GetPosition().X + TabSize);

// move cursor forward to next tab stop. fill space with blanks.
Expand Down Expand Up @@ -1053,36 +1052,25 @@ constexpr unsigned int LOCAL_BUFFER_SIZE = 100;
const auto codepage = gci.OutputCP;

// Convert our input parameters to Unicode
std::unique_ptr<wchar_t[]> wideCharBuffer{ nullptr };
static Utf8ToWideCharParser parser{ gci.OutputCP };
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IGNORE, SEE BELOW
This one unfortunately cannot change just yet. gci.OutputCP must be the user's console output codepage for now and the foreseeable future.

This poses an interesting conundrum for u8u16: should we have "ASCII" versions that take a codepage? au16 and u16a? I'm not sure 😄

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe it's actually part of the u*state itself...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OH, I understand. You can almost entirely ignore this comment. I was confused because this was created outside the CP_UTF8 block.
I also understand that writing in another codepage means we need to kill the u8 state -- so we can't move it inside this block.

Copy link
Contributor Author

@german-one german-one Jan 31, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@DHowett-MSFT Your understanding is correct. The reset() method is called in the else branch where codepages other than UTF-8 are processed. I knew I would need the reset here, that's why I implemented it from the beginning.

This poses an interesting conundrum for u8u16: should we have "ASCII" versions that take a codepage? au16 and u16a? I'm not sure 😄

Interesting indeed. Well, we already have function ConvertToW (and ConvertToA). But that's not simply applicable in WriteConsoleAImpl() due to the fact that we may receive DBCS-encoded text where caching of partials is required, too. @miniksa briefly mentioned that in #386 (comment)
I only have a poor understanding of how DBCS has to be processed though. The manpage of IsDBCSLeadByte states that even if you validated a lead byte you may not rely on MultiByteToWideChar being able to process the substring correctly. So, unfortunately I don't know enough to revise the DBCS handling. And I don't know if there is any other function in the code base where DBCS has to be converted. Hence it might not be worth the effort to bring it into a separate function.
However, I offer to do my best to get rid of new and delete in WriteConsoleAImpl() and instead use the wstring that we already have for the UTF-8 conversion.
// EDIT done.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@german-one, if we theoretically made an au16 and a u16a to supersede ConvertToW and ConvertToA, there would probably be some sort of astate variable required. That variable could be stored on the assorted input handles to ensure that the problem briefly discussed in #386 is rectified.

Offhand, I believe there are probably several points in the code base that could be unified behind such a convergence function in a similar way to converging the u8/u16 problem. But I haven't enumerated them recently, so I may be incorrect.

I believe that the only circumstance where we'd really pay attention to IsDBCSLeadByte and hold onto it until the next call is if a string ends in a lead byte (then it would take the lead byte off the end and cache it for the next call in the astate or equivalent.) If it's in the middle of a run, we wouldn't care and just pass it into MultiByteToWideChar probably with the replacement character flag on so it would remove invalid DBCS representations. The next write would have the stored DBCS lead prefixed to whatever comes next, even if it's not valid together, and we'd let MB2WC sort it out and replace it.

The last provision to consider is that I believe the state would be discarded anytime the code page changed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@miniksa u16a would be easy. We only need to take care of split surrogates, and that's what u16state already does.
As to au16 - IsDBCSLeadByteEx isn't used in the current implementation of CheckBisectStringA. It would be my attempt to implement an astate though. The remarks found on the manpage still make me wonder if it would be bulletproof for DBCS.
But even if it was that simple, we would still only have conversions for SBCS, DBCS (the few mentioned on the manpage), and UTF-8. Guess what happens if we receive UTF-7 that was split inside of a base64 sequence 🤕


// update current codepage in case it was changed from last time
// this was called. We do this outside the UTF-8 check because the parser drops its state
// when the codepage changes.
parser.SetCodePage(gci.OutputCP);
std::wstring wstr{};
static til::u8state u8State{};

SCREEN_INFORMATION& ScreenInfo = context.GetActiveBuffer();
wchar_t* pwchBuffer;
size_t cchBuffer;
if (codepage == CP_UTF8)
{
wideCharBuffer.release();
unsigned int charCount;
unsigned int charsConsumed;
unsigned int charsGenerated;
RETURN_IF_FAILED(SizeTToUInt(buffer.size(), &charCount));
RETURN_IF_FAILED(parser.Parse(reinterpret_cast<const byte*>(buffer.data()),
charCount,
charsConsumed,
wideCharBuffer,
charsGenerated));

pwchBuffer = reinterpret_cast<wchar_t*>(wideCharBuffer.get());
cchBuffer = charsGenerated;
read = charsConsumed;
RETURN_IF_FAILED(til::u8u16(buffer, wstr, u8State));
pwchBuffer = wstr.data();
cchBuffer = wstr.length();
read = buffer.size();
DHowett-MSFT marked this conversation as resolved.
Show resolved Hide resolved
}
else
{
// In case the codepage changes from UTF-8 to another,
// we discard partials that might still be cached.
u8State.reset();

NTSTATUS Status = STATUS_SUCCESS;
PWCHAR TransBuffer;
PWCHAR TransBufferOriginalLocation;
Expand Down Expand Up @@ -1183,7 +1171,7 @@ constexpr unsigned int LOCAL_BUFFER_SIZE = 100;
}

pwchBuffer = TransBufferOriginalLocation;
cchBuffer = (dbcsNumBytes + BufPtrNumBytes) / sizeof(wchar_t);
cchBuffer = (gsl::narrow_cast<size_t>(dbcsNumBytes) + BufPtrNumBytes) / sizeof(wchar_t);
}

// Make the W version of the call
Expand Down
2 changes: 1 addition & 1 deletion src/host/_stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,6 @@ Return Value:
// NOTE: console lock must be held when calling this routine
// String has been translated to unicode at this point.
[[nodiscard]] NTSTATUS DoWriteConsole(_In_reads_bytes_(*pcbBuffer) PWCHAR pwchBuffer,
_In_ size_t* const pcbBuffer,
_Inout_ size_t* const pcbBuffer,
SCREEN_INFORMATION& screenInfo,
std::unique_ptr<WriteData>& waiter);