-
Notifications
You must be signed in to change notification settings - Fork 8.4k
/
Copy pathutf8ToWideCharParser.cpp
520 lines (496 loc) · 18.7 KB
/
utf8ToWideCharParser.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include "precomp.h"
#include "utf8ToWideCharParser.hpp"
#include <unicode.hpp>
#ifndef WIL_ENABLE_EXCEPTIONS
#error WIL exception helpers must be enabled
#endif
#define IsBitSet WI_IsFlagSet
// Bit patterns used to classify individual UTF8 bytes.
// An ASCII compatible byte looks like 0xxx xxxx, so this bit being set
// marks a byte as non-ASCII.
constexpr byte NonAsciiBytePrefix = 0x80;
// Selects the two most significant bits of a byte.
constexpr byte ContinuationByteMask = 0xC0;
// Value of the two masked bits for a continuation byte (10xx xxxx).
constexpr byte ContinuationBytePrefix = 0x80;
// Selects the most significant bit of a byte.
constexpr byte MostSignificantBitMask = 0x80;
// Routine Description:
// - Builds a parser that starts out in the Ready state with no stored
//   partial sequence and no converted output.
// Arguments:
// - codePage - The code page that input is initially interpreted with.
// Return Value:
// - A new instance of the parser.
Utf8ToWideCharParser::Utf8ToWideCharParser(const unsigned int codePage) :
    _currentCodePage(codePage),
    _bytesStored(0),
    _currentState(_State::Ready),
    _convertedWideChars(nullptr)
{
    // zero out the storage used for partial sequences
    std::fill(_utf8CodePointPieces, _utf8CodePointPieces + _UTF8_BYTE_SEQUENCE_MAX, byte{ 0 });
}
// Routine Description:
// - Changes the code page that input sequences are interpreted with.
//   Switching to a different code page discards any stored partial
//   multi-byte sequence and returns the parser to the Ready state.
//   Setting the code page that is already active does nothing.
// Arguments:
// - codePage - the code page to switch to.
// Return Value:
// - <none>
void Utf8ToWideCharParser::SetCodePage(const unsigned int codePage)
{
    if (codePage == _currentCodePage)
    {
        // same code page, any stored partial sequence is still meaningful
        return;
    }

    // the stored bytes were collected under the old code page, so we
    // can't make any assumptions about them anymore
    _bytesStored = 0;
    _currentState = _State::Ready;
    _currentCodePage = codePage;
}
// Routine Description:
// - Parses the input multi-byte sequence into wide chars. A partial
//   multi-byte sequence at the end of the input is stored and combined
//   with the input of the next call.
// Arguments:
// - pBytes - The byte sequence to parse.
// - cchBuffer - The amount of bytes in pBytes.
// - cchConsumed - Receives the amount of bytes of pBytes that were
//   consumed. This is cchBuffer when parsing succeeds (bytes that were
//   stored for the next call count as consumed) and 0 if an error
//   occurs or if cchBuffer is 0.
// - converted - a valid unique_ptr to store the parsed wide chars
//   in. On error this will contain nullptr instead of an array. It
//   will also contain nullptr if the input only held a partial
//   sequence and nothing could be converted yet. It is left untouched
//   if cchBuffer is 0.
// - cchConverted - Receives the number of wide chars stored in
//   converted, or 0 if an error occurs or if cchBuffer is 0.
// Return Value:
// - S_OK if the input was parsed (or if cchBuffer is 0).
// - E_FAIL if the current code page isn't UTF8 or the conversion
//   failed.
// - The HRESULT of the caught exception if one was thrown.
[[nodiscard]] HRESULT Utf8ToWideCharParser::Parse(_In_reads_(cchBuffer) const byte* const pBytes,
                                                  _In_ unsigned int const cchBuffer,
                                                  _Out_ unsigned int& cchConsumed,
                                                  _Inout_ std::unique_ptr<wchar_t[]>& converted,
                                                  _Out_ unsigned int& cchConverted)
{
    cchConsumed = 0;
    cchConverted = 0;

    // we can't parse anything if we weren't given any data to parse
    if (cchBuffer == 0)
    {
        return S_OK;
    }

    // we shouldn't be parsing if the current codepage isn't UTF8
    if (_currentCodePage != CP_UTF8)
    {
        _currentState = _State::Error;
    }

    HRESULT hr = S_OK;
    try
    {
        bool loop = true;
        unsigned int wideCharCount = 0;
        _convertedWideChars.reset(nullptr);

        // Run the state machine until it reaches one of the states that
        // ends this call (Error, Finished or AwaitingMoreBytes).
        while (loop)
        {
            switch (_currentState)
            {
            case _State::Ready:
                wideCharCount = _ParseFullRange(pBytes, cchBuffer);
                break;
            case _State::BeginPartialParse:
                wideCharCount = _InvolvedParse(pBytes, cchBuffer);
                break;
            case _State::Error:
                hr = E_FAIL;
                _Reset();
                wideCharCount = 0;
                loop = false;
                break;
            case _State::Finished:
                _currentState = _State::Ready;
                cchConsumed = cchBuffer;
                loop = false;
                break;
            case _State::AwaitingMoreBytes:
                // the next call has to take the stored bytes into account
                _currentState = _State::BeginPartialParse;
                cchConsumed = cchBuffer;
                loop = false;
                break;
            default:
                // unknown state, let the Error case clean up on the next pass
                _currentState = _State::Error;
                break;
            }
        }

        // Hand the result to the caller. Moving (instead of swapping)
        // releases whatever the caller passed in right away rather than
        // keeping it alive inside the parser until the next call.
        converted = std::move(_convertedWideChars);
        cchConverted = wideCharCount;
    }
    catch (...)
    {
        _Reset();
        hr = wil::ResultFromCaughtException();

        // honor the error contract: no output buffer and no counts
        converted.reset();
        cchConsumed = 0;
        cchConverted = 0;
    }
    return hr;
}
// Routine Description:
// - Determines if ch is a UTF8 lead byte, i.e. the first byte of a
//   multi-byte sequence. See _Utf8SequenceSize() for a description of
//   how a lead byte is specified.
// Arguments:
// - ch - The byte to test.
// Return Value:
// - True if ch is neither an ASCII byte nor a continuation byte and
//   announces a sequence of more than one but at most
//   _UTF8_BYTE_SEQUENCE_MAX bytes, false otherwise.
bool Utf8ToWideCharParser::_IsLeadByte(_In_ byte ch)
{
    if (_IsAsciiByte(ch) || _IsContinuationByte(ch))
    {
        return false;
    }

    const unsigned int announcedSize = _Utf8SequenceSize(ch);
    return announcedSize > 1 && announcedSize <= _UTF8_BYTE_SEQUENCE_MAX;
}
// Routine Description:
// - Determines if ch is a UTF8 continuation byte. A continuation byte
//   takes the form 10xx xxxx, so the two most significant bits have to
//   be a 1 followed by a 0.
// Arguments:
// - ch - The byte to test
// Return Value:
// - True if ch is a continuation byte, false otherwise.
bool Utf8ToWideCharParser::_IsContinuationByte(_In_ byte ch)
{
    // isolate the two most significant bits, then compare them to 10
    const auto topTwoBits = ch & ContinuationByteMask;
    return topTwoBits == ContinuationBytePrefix;
}
// Routine Description:
// - Determines if ch is an ASCII compatible UTF8 byte. A byte is ASCII
//   compatible if its most significant bit is a 0.
// Arguments:
// - ch - The byte to test.
// Return Value:
// - True if ch is an ASCII compatible byte, false otherwise.
bool Utf8ToWideCharParser::_IsAsciiByte(_In_ byte ch)
{
    const bool hasNonAsciiBit = IsBitSet(ch, NonAsciiBytePrefix);
    return !hasNonAsciiBit;
}
// Routine Description:
// - Determines if the bytes starting at pLeadByte begin with a complete
//   UTF8 multi-byte sequence. Note that a single ASCII byte does not
//   count as a valid MULTI-byte sequence. Only the bit patterns of the
//   lead byte and the continuation bytes are checked.
// Arguments:
// - pLeadByte - The start of a possible sequence.
// - cb - The amount of remaining chars in the array that
//   pLeadByte points to.
// Return Value:
// - true if pLeadByte points to a lead byte, the sequence it announces
//   fits within cb bytes and every byte of the sequence after the lead
//   byte is a continuation byte. false otherwise, including when cb
//   is 0.
bool Utf8ToWideCharParser::_IsValidMultiByteSequence(_In_reads_(cb) const byte* const pLeadByte, const unsigned int cb)
{
    // an empty range has no lead byte, so make sure we never read from it
    if (cb == 0 || !_IsLeadByte(*pLeadByte))
    {
        return false;
    }

    const unsigned int sequenceSize = _Utf8SequenceSize(*pLeadByte);
    if (sequenceSize > cb)
    {
        // the sequence runs past the end of the available bytes
        return false;
    }

    // i starts at 1 so that we skip the lead byte
    for (unsigned int i = 1; i < sequenceSize; ++i)
    {
        if (!_IsContinuationByte(pLeadByte[i]))
        {
            return false;
        }
    }
    return true;
}
// Routine Description:
// - Checks if the bytes starting at pLeadByte are the beginning of a
//   single multi-byte sequence that is cut off by the end of the
//   range. A new sequence must not be started within the range
//   provided in order for it to be considered a valid partial
//   sequence.
// Arguments:
// - pLeadByte - The start of the possible partial sequence.
// - cb - The amount of remaining chars in the array that
//   pLeadByte points to.
// Return Value:
// - true if pLeadByte points to a lead byte that announces more than
//   cb bytes and all of the remaining bytes in the range are
//   continuation bytes. false otherwise, including when cb is 0.
bool Utf8ToWideCharParser::_IsPartialMultiByteSequence(_In_reads_(cb) const byte* const pLeadByte, const unsigned int cb)
{
    // an empty range has no lead byte, so make sure we never read from it
    if (cb == 0 || !_IsLeadByte(*pLeadByte))
    {
        return false;
    }

    const unsigned int sequenceSize = _Utf8SequenceSize(*pLeadByte);
    if (sequenceSize <= cb)
    {
        // the whole sequence is available, so it isn't partial
        return false;
    }

    // i starts at 1 so that we skip the lead byte
    for (unsigned int i = 1; i < cb; ++i)
    {
        if (!_IsContinuationByte(pLeadByte[i]))
        {
            return false;
        }
    }
    return true;
}
// Routine Description:
// - Determines the number of bytes in the UTF8 multi-byte sequence that
//   ch claims to start. A lead byte indicates how many bytes are in a
//   sequence by repeating a 1 for each byte in the sequence, starting
//   with the most significant bit, then a 0 directly after. Ex:
//   - 110x xxxx = a two byte sequence
//   - 1110 xxxx = a three byte sequence
//
//   No verification is done that ch is a valid lead byte. The function
//   only counts the consecutive 1 bits starting with the most
//   significant bit, which means that:
//   - an ASCII byte (0xxx xxxx) is reported as 0.
//   - a continuation byte (10xx xxxx) is reported as 1.
//   - a malformed byte (ex. 1111 110x) is reported with a number larger
//     than the maximum utf8 byte sequence of four bytes.
//   It is the responsibility of the calling function to check for these
//   cases.
// Arguments:
// - ch - the lead byte of a UTF8 multi-byte sequence.
// Return Value:
// - The number of bytes (including the lead byte) that ch indicates
//   are in the sequence.
unsigned int Utf8ToWideCharParser::_Utf8SequenceSize(_In_ byte ch)
{
    unsigned int leadingOnes = 0;

    // move a single bit probe from the most significant bit downwards
    // and stop at the first bit of ch that isn't set
    for (byte probe = MostSignificantBitMask; probe != 0; probe >>= 1)
    {
        if ((ch & probe) == 0)
        {
            break;
        }
        ++leadingOnes;
    }
    return leadingOnes;
}
// Routine Description:
// - Attempts to convert pInputChars to wide chars on its own, without
//   using any saved partial byte sequences. The input is first measured
//   with MB_ERR_INVALID_CHARS so that invalid input is detected, then
//   converted into a freshly allocated buffer.
//   - On success, _convertedWideChars will contain the converted wide
//     char sequence and _currentState will be set to _State::Finished.
//   - If measuring fails with ERROR_NO_UNICODE_TRANSLATION,
//     _currentState will be set to _State::BeginPartialParse.
//   - On any other failure, _currentState will be set to _State::Error.
// Arguments:
// - pInputChars - The byte sequence to convert to wide chars.
// - cb - The amount of bytes in pInputChars.
// Return Value:
// - The amount of wide chars that are stored in _convertedWideChars,
//   or 0 if pInputChars cannot be successfully converted.
unsigned int Utf8ToWideCharParser::_ParseFullRange(_In_reads_(cb) const byte* const pInputChars, const unsigned int cb)
{
    const auto pSource = reinterpret_cast<LPCCH>(pInputChars);

    // first pass: validate the input and find out how much space we need
    const int cchNeeded = MultiByteToWideChar(_currentCodePage,
                                              MB_ERR_INVALID_CHARS,
                                              pSource,
                                              cb,
                                              nullptr,
                                              0);
    if (cchNeeded == 0)
    {
        const DWORD lastError = GetLastError();
        LOG_WIN32(lastError);
        // input that can't be translated as is gets a second chance in
        // the involved parse, everything else is a hard error
        _currentState = (lastError == ERROR_NO_UNICODE_TRANSLATION) ? _State::BeginPartialParse : _State::Error;
        return 0;
    }

    // second pass: do the actual conversion
    _convertedWideChars = std::make_unique<wchar_t[]>(cchNeeded);
    const int cchWritten = MultiByteToWideChar(_currentCodePage,
                                               0,
                                               pSource,
                                               cb,
                                               _convertedWideChars.get(),
                                               cchNeeded);
    if (cchWritten == 0)
    {
        LOG_LAST_ERROR();
        _currentState = _State::Error;
        return 0;
    }

    _currentState = _State::Finished;
    return cchWritten;
}
// Routine Description:
// - Attempts to parse pInputChars in a more complex manner. Any
//   previously saved partial byte sequence is put in front of the
//   input, invalid byte sequences are removed and a partial byte
//   sequence at the end is saved for the next call if necessary.
//   - If the remaining bytes can be converted and nothing was saved,
//     _currentState will be set to _State::Finished.
//   - If a partial sequence was saved (whether or not anything else
//     was converted), _currentState will be set to
//     _State::AwaitingMoreBytes.
//   - Otherwise, _currentState will be set to _State::Error.
// Arguments:
// - pInputChars - The byte sequence to convert to wide chars.
// - cb - The amount of bytes in pInputChars.
// Return Value:
// - The amount of wide chars that are stored in _convertedWideChars,
//   or 0 if pInputChars cannot be successfully converted or if the
//   parser requires additional bytes before returning a valid wide
//   char.
unsigned int Utf8ToWideCharParser::_InvolvedParse(_In_reads_(cb) const byte* const pInputChars, const unsigned int cb)
{
    // Do safe math to add up the stored and the new bytes and error if
    // the total won't fit.
    unsigned int cbCombined = 0;
    const HRESULT hrAdd = UIntAdd(cb, _bytesStored, &cbCombined);
    if (FAILED(hrAdd))
    {
        LOG_HR(hrAdd);
        _currentState = _State::Error;
        return 0;
    }

    // Build one buffer: the stored bytes first, the new input right after.
    auto combined = std::make_unique<byte[]>(cbCombined);
    byte* const pAfterStored = std::copy(_utf8CodePointPieces, _utf8CodePointPieces + _bytesStored, combined.get());
    std::copy(pInputChars, pInputChars + cb, pAfterStored);

    // The stored bytes are used up now. This has to happen before the
    // filtering below because that may store a new partial sequence.
    _bytesStored = 0;

    auto filtered = _RemoveInvalidSequences(combined.get(), cbCombined);
    const auto pSource = reinterpret_cast<LPCCH>(filtered.first.get());
    const unsigned int cbSource = filtered.second;

    // the input may have only been a partial sequence so we need to
    // check that there are actually any bytes that we can convert
    // right now
    if (cbSource == 0 && _bytesStored > 0)
    {
        _currentState = _State::AwaitingMoreBytes;
        return 0;
    }
    // NOTE(review): if every byte was discarded and nothing was stored,
    // the conversion below is asked to convert zero bytes - confirm that
    // ending up in the error state is the intended outcome for that case.

    // By this point, all obviously invalid sequences have been removed.
    // But non-minimal forms of sequences might still exist.
    // MB2WC will fail non-minimal forms with MB_ERR_INVALID_CHARS at this point.
    // So we call with flags = 0 such that non-minimal forms get the U+FFFD
    // replacement character treatment.
    // This issue and related concerns are fully captured in future work item GH#3378
    // for future cleanup and reconciliation.
    // The original issue introducing this was GH#3320.
    const int cchNeeded = MultiByteToWideChar(_currentCodePage,
                                              0,
                                              pSource,
                                              cbSource,
                                              nullptr,
                                              0);
    if (cchNeeded == 0)
    {
        LOG_LAST_ERROR();
        _currentState = _State::Error;
        return 0;
    }

    _convertedWideChars = std::make_unique<wchar_t[]>(cchNeeded);
    const int cchWritten = MultiByteToWideChar(_currentCodePage,
                                               0,
                                               pSource,
                                               cbSource,
                                               _convertedWideChars.get(),
                                               cchNeeded);
    if (cchWritten == 0)
    {
        LOG_LAST_ERROR();
        _currentState = _State::Error;
        return 0;
    }

    // a stored partial sequence means the next call has to continue it
    _currentState = (_bytesStored > 0) ? _State::AwaitingMoreBytes : _State::Finished;
    return cchWritten;
}
// Routine Description:
// - Reads pInputChars byte by byte and copies only ASCII bytes and
//   complete multi-byte sequences into a new buffer. Stray
//   continuation bytes, broken sequences and bytes that are neither
//   ASCII, continuation nor lead bytes are dropped. If a partial
//   multi-byte sequence is found it is saved with
//   _StorePartialSequence() and reading stops there.
// Arguments:
// - pInputChars - The byte sequence to fix.
// - cb - The amount of bytes in pInputChars.
// Return Value:
// - A std::pair containing the corrected byte sequence and the number
//   of bytes in the sequence.
std::pair<std::unique_ptr<byte[]>, unsigned int> Utf8ToWideCharParser::_RemoveInvalidSequences(_In_reads_(cb) const byte* const pInputChars, const unsigned int cb)
{
    auto filtered = std::make_unique<byte[]>(cb);
    unsigned int writePos = 0; // index into filtered
    unsigned int readPos = 0; // index into pInputChars

    // moves readPos past a run of continuation bytes
    const auto skipContinuationBytes = [&]() {
        while (readPos < cb && _IsContinuationByte(pInputChars[readPos]))
        {
            ++readPos;
        }
    };

    while (readPos < cb)
    {
        const byte current = pInputChars[readPos];

        if (_IsAsciiByte(current))
        {
            filtered[writePos] = current;
            ++writePos;
            ++readPos;
            continue;
        }

        if (_IsContinuationByte(current))
        {
            // continuation bytes without a lead byte in front of them
            skipContinuationBytes();
            continue;
        }

        if (!_IsLeadByte(current))
        {
            // invalid byte, skip it.
            ++readPos;
            continue;
        }

        const byte* const pLead = &pInputChars[readPos];
        const unsigned int cbRemaining = cb - readPos;

        if (_IsValidMultiByteSequence(pLead, cbRemaining))
        {
            // min is to guard against static analysis possible buffer overflow
            const unsigned int cbSequence = std::min(_Utf8SequenceSize(current), cbRemaining);
            for (unsigned int copied = 0; copied < cbSequence; ++copied)
            {
                filtered[writePos] = pInputChars[readPos];
                ++writePos;
                ++readPos;
            }
        }
        else if (_IsPartialMultiByteSequence(pLead, cbRemaining))
        {
            // the rest of the input is the start of a sequence, keep it
            // around for the next call and stop reading
            _StorePartialSequence(pLead, cbRemaining);
            break;
        }
        else
        {
            // broken sequence, drop the lead byte and the continuation
            // bytes that follow it
            ++readPos;
            skipContinuationBytes();
        }
    }

    return std::make_pair(std::move(filtered), writePos);
}
// Routine Description:
// - Saves a partial byte sequence for later use, replacing any
//   previously saved sequence. At most
//   Utf8ToWideCharParser::_UTF8_BYTE_SEQUENCE_MAX bytes are saved, any
//   bytes beyond that are ignored.
// Arguments:
// - pLeadByte - The beginning of the sequence to save.
// - cb - The amount of bytes to save.
// Return Value:
// - <none>
void Utf8ToWideCharParser::_StorePartialSequence(_In_reads_(cb) const byte* const pLeadByte, const unsigned int cb)
{
    // never save more than the storage can hold
    _bytesStored = std::min(cb, _UTF8_BYTE_SEQUENCE_MAX);
    std::copy(pLeadByte, pLeadByte + _bytesStored, _utf8CodePointPieces);
}
// Routine Description:
// - Puts the parser back into the Ready state, forgets any stored
//   partial sequence and releases the converted wide chars.
//   _currentCodePage is not affected.
// Arguments:
// - <none>
// Return Value:
// - <none>
void Utf8ToWideCharParser::_Reset()
{
    _convertedWideChars.reset();
    _bytesStored = 0;
    _currentState = _State::Ready;
}