-
Notifications
You must be signed in to change notification settings - Fork 1.7k
/
Copy pathTextDecoder.ts
346 lines (325 loc) · 10.2 KB
/
TextDecoder.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
/**
* Copyright (c) 2019 The xterm.js authors. All rights reserved.
* @license MIT
*/
/**
 * Polyfill - turn a single UTF32 codepoint into a JS (UTF16) string.
 *
 * This is a lean replacement for the built-in `String.fromCodePoint`, which is
 * noted to be much slower because of its additional sanity checks. No
 * validation is done here; callers are expected to pass legal UTF32 values
 * (as produced by the input decoders).
 *
 * @param codePoint UTF32 codepoint to convert.
 * @returns A string of one UTF16 code unit for values up to 0xFFFF,
 * or a surrogate pair (two code units) for values above.
 */
export function stringFromCodePoint(codePoint: number): string {
  // BMP values map 1:1 onto a single UTF16 code unit
  if (!(codePoint > 0xFFFF)) {
    return String.fromCharCode(codePoint);
  }
  // everything above needs a surrogate pair built from the 20 bit offset:
  // high 10 bits go onto 0xD800, low 10 bits go onto 0xDC00
  const offset = codePoint - 0x10000;
  return String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset % 0x400));
}
/**
 * Convert a run of UTF32 codepoints into a JS (UTF16) string.
 *
 * Does the same conversion as `stringFromCodePoint`, but inlined into a loop
 * over many codepoints, which is a lot faster than calling it per codepoint.
 *
 * @param data Buffer holding the UTF32 codepoints.
 * @param start Index of the first codepoint to convert (defaults to 0).
 * @param end Index one past the last codepoint to convert (defaults to `data.length`).
 * @returns The concatenated string for `data[start]` .. `data[end - 1]`.
 */
export function utf32ToString(data: Uint32Array, start: number = 0, end: number = data.length): string {
  let out = '';
  let idx = start;
  while (idx < end) {
    const value = data[idx];
    idx++;
    if (value > 0xFFFF) {
      // Non BMP codepoint: JS strings are UTF16, so emit a surrogate pair.
      //  - take the 20 bit offset above 0x10000
      //  - upper 10 bits + 0xD800 --> high surrogate
      //  - lower 10 bits + 0xDC00 --> low surrogate
      const offset = value - 0x10000;
      out += String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset % 0x400));
      continue;
    }
    out += String.fromCharCode(value);
  }
  return out;
}
/**
 * StringToUtf32 - decodes UTF16 sequences into UTF32 codepoints.
 * To keep the decoder in line with JS strings it handles single (unpaired)
 * surrogates as UCS2, i.e. they are passed through as their own codepoint.
 */
export class StringToUtf32 {
  /** High surrogate left over from the previous chunk, 0 if nothing is pending. */
  private _interim: number = 0;

  /**
   * Clears interim and resets decoder to clean state.
   */
  public clear(): void {
    this._interim = 0;
  }

  /**
   * Decode JS string to UTF32 codepoints.
   * The method assumes stream input: a high surrogate at the very end of
   * `input` is held back and combined with the start of the next chunk.
   * BOM code units (0xFEFF) are skipped.
   *
   * A high surrogate that is not followed by a low surrogate is emitted as is
   * (UCS2 handling). The code unit after it is NOT consumed with it, but
   * decoded on its own, so a valid surrogate pair directly behind a stray high
   * surrogate still yields one codepoint.
   *
   * Note: The method does no bound checks for target. It writes at most one
   * codepoint per input code unit, plus one for a surrogate pending from the
   * previous call, therefore make sure `target` is big enough for that.
   *
   * @param input String chunk to decode.
   * @param target Buffer receiving the codepoints, filled from index 0.
   * @returns The number of written codepoints in `target`.
   */
  public decode(input: string, target: Uint32Array): number {
    const length = input.length;
    if (!length) {
      // an empty chunk keeps a pending surrogate for the next call
      return 0;
    }

    let size = 0;
    let startPos = 0;

    // handle leftover surrogate high
    if (this._interim) {
      const second = input.charCodeAt(0);
      if (0xDC00 <= second && second <= 0xDFFF) {
        target[size++] = (this._interim - 0xD800) * 0x400 + second - 0xDC00 + 0x10000;
        startPos = 1;
      } else {
        // unpaired high surrogate (UCS2 handling) - emit it alone and leave
        // `second` to the loop below, it might start a pair of its own
        target[size++] = this._interim;
      }
      this._interim = 0;
    }

    for (let i = startPos; i < length; ++i) {
      const code = input.charCodeAt(i);

      // surrogate pair first
      if (0xD800 <= code && code <= 0xDBFF) {
        if (i + 1 >= length) {
          // pair is cut by the chunk border, finish it with the next chunk
          this._interim = code;
          return size;
        }
        const second = input.charCodeAt(i + 1);
        if (0xDC00 <= second && second <= 0xDFFF) {
          target[size++] = (code - 0xD800) * 0x400 + second - 0xDC00 + 0x10000;
          ++i;
        } else {
          // unpaired high surrogate (UCS2 handling), `second` gets decoded
          // by the next iteration
          target[size++] = code;
        }
        continue;
      }

      if (code === 0xFEFF) {
        // BOM
        continue;
      }

      target[size++] = code;
    }
    return size;
  }
}
/**
 * Utf8Decoder - decodes UTF8 byte sequences into UTF32 codepoints.
 *
 * The decoder is stream aware: a multi byte sequence cut off by the end of a
 * chunk is kept in `interim` and finished with the bytes of the next chunk.
 */
export class Utf8ToUtf32 {
  /**
   * Bytes of an unfinished multi byte sequence (start byte first).
   * Unused slots are 0. A stored continuation byte is never 0, since it
   * always has bit 7 set.
   */
  public interim: Uint8Array = new Uint8Array(3);

  /**
   * Clears interim bytes and resets decoder to clean state.
   */
  public clear(): void {
    this.interim.fill(0);
  }

  /**
   * Decodes UTF8 byte sequences in `input` to UTF32 codepoints in `target`.
   * The method assumes stream input and will store partly transmitted bytes
   * and decode them with the next data chunk.
   * Malformed input (wrong continuation bytes, overlong encodings, surrogate
   * codepoints, values above 0x10FFFF, illegal start bytes) is dropped
   * silently. A BOM (U+FEFF) is dropped as well, whether or not it is split
   * across chunks.
   * Note: The method does no bound checks for target, therefore make sure
   * the provided data chunk does not exceed the size of `target`.
   *
   * @param input Chunk of UTF8 bytes.
   * @param target Buffer receiving the codepoints, filled from index 0.
   * @returns The number of written codepoints in `target`.
   */
  public decode(input: Uint8Array, target: Uint32Array): number {
    const length = input.length;
    if (!length) {
      return 0;
    }

    let size = 0;
    let byte1: number;
    let byte2: number;
    let byte3: number;
    let byte4: number;
    let codepoint = 0;
    let startPos = 0;

    // handle leftover bytes
    if (this.interim[0]) {
      let discardInterim = false;
      const lead = this.interim[0];
      // total sequence length as announced by the start byte
      const type = ((lead & 0xE0) === 0xC0) ? 2 : ((lead & 0xF0) === 0xE0) ? 3 : 4;
      // payload bits of the start byte
      let cp = lead & (type === 2 ? 0x1F : type === 3 ? 0x0F : 0x07);

      // Re-read the continuation bytes stored so far.
      // Stored bytes are detected by being non zero and not by their payload
      // bits - a legal continuation byte of 0x80 carries an all zero payload
      // and must still be counted.
      let pos = 1;
      while (pos < 3 && this.interim[pos]) {
        cp = (cp << 6) | (this.interim[pos] & 0x3F);
        pos++;
      }

      // missing bytes - read ahead from input
      const missing = type - pos;
      let tmp: number;
      while (startPos < missing) {
        if (startPos >= length) {
          // still incomplete, interim keeps what we got so far
          return 0;
        }
        tmp = input[startPos++];
        if ((tmp & 0xC0) !== 0x80) {
          // wrong continuation, discard interim bytes completely
          // and let the main loop decode this byte again
          startPos--;
          discardInterim = true;
          break;
        }
        // need to save so we can continue short inputs in next call
        this.interim[pos++] = tmp;
        cp <<= 6;
        cp |= tmp & 0x3F;
      }

      if (!discardInterim) {
        // final test is type dependent
        if (type === 2) {
          if (cp < 0x80) {
            // wrong starter byte
            startPos--;
          } else {
            target[size++] = cp;
          }
        } else if (type === 3) {
          // skip illegal codepoints (overlong, surrogates) and BOM,
          // same as the 3 byte branch of the main loop does
          if (!(cp < 0x0800 || (cp >= 0xD800 && cp <= 0xDFFF) || cp === 0xFEFF)) {
            target[size++] = cp;
          }
        } else {
          // skip illegal codepoints (overlong, beyond unicode range)
          if (!(cp < 0x010000 || cp > 0x10FFFF)) {
            target[size++] = cp;
          }
        }
      }
      this.interim.fill(0);
    }

    // loop through input
    const fourStop = length - 4;
    let i = startPos;
    while (i < length) {
      /**
       * ASCII shortcut with loop unrolled to 4 consecutive ASCII chars.
       * This is a compromise between speed gain for ASCII
       * and penalty for non ASCII:
       * For best ASCII performance the char should be stored directly into target,
       * but even a single attempt to write to target and compare afterwards
       * penalizes non ASCII really bad (-50%), thus we load the char into byteX first,
       * which reduces ASCII performance by ~15%.
       * This trial for ASCII reduces non ASCII performance by ~10% which seems acceptable
       * compared to the gains.
       * Note that this optimization only takes place for 4 consecutive ASCII chars,
       * for any shorter it bails out. Worst case - all 4 bytes being read but
       * thrown away due to the last being a non ASCII char (-10% performance).
       */
      while (i < fourStop
        && !((byte1 = input[i]) & 0x80)
        && !((byte2 = input[i + 1]) & 0x80)
        && !((byte3 = input[i + 2]) & 0x80)
        && !((byte4 = input[i + 3]) & 0x80))
      {
        target[size++] = byte1;
        target[size++] = byte2;
        target[size++] = byte3;
        target[size++] = byte4;
        i += 4;
      }

      // reread byte1
      byte1 = input[i++];

      // 1 byte
      if (byte1 < 0x80) {
        target[size++] = byte1;

      // 2 bytes
      } else if ((byte1 & 0xE0) === 0xC0) {
        if (i >= length) {
          // chunk ends within the sequence, continue with next chunk
          this.interim[0] = byte1;
          return size;
        }
        byte2 = input[i++];
        if ((byte2 & 0xC0) !== 0x80) {
          // wrong continuation, decode that byte again as a start byte
          i--;
          continue;
        }
        codepoint = (byte1 & 0x1F) << 6 | (byte2 & 0x3F);
        if (codepoint < 0x80) {
          // wrong starter byte
          i--;
          continue;
        }
        target[size++] = codepoint;

      // 3 bytes
      } else if ((byte1 & 0xF0) === 0xE0) {
        if (i >= length) {
          this.interim[0] = byte1;
          return size;
        }
        byte2 = input[i++];
        if ((byte2 & 0xC0) !== 0x80) {
          // wrong continuation
          i--;
          continue;
        }
        if (i >= length) {
          this.interim[0] = byte1;
          this.interim[1] = byte2;
          return size;
        }
        byte3 = input[i++];
        if ((byte3 & 0xC0) !== 0x80) {
          // wrong continuation
          i--;
          continue;
        }
        codepoint = (byte1 & 0x0F) << 12 | (byte2 & 0x3F) << 6 | (byte3 & 0x3F);
        if (codepoint < 0x0800 || (codepoint >= 0xD800 && codepoint <= 0xDFFF) || codepoint === 0xFEFF) {
          // illegal codepoint or BOM, no i-- here
          continue;
        }
        target[size++] = codepoint;

      // 4 bytes
      } else if ((byte1 & 0xF8) === 0xF0) {
        if (i >= length) {
          this.interim[0] = byte1;
          return size;
        }
        byte2 = input[i++];
        if ((byte2 & 0xC0) !== 0x80) {
          // wrong continuation
          i--;
          continue;
        }
        if (i >= length) {
          this.interim[0] = byte1;
          this.interim[1] = byte2;
          return size;
        }
        byte3 = input[i++];
        if ((byte3 & 0xC0) !== 0x80) {
          // wrong continuation
          i--;
          continue;
        }
        if (i >= length) {
          this.interim[0] = byte1;
          this.interim[1] = byte2;
          this.interim[2] = byte3;
          return size;
        }
        byte4 = input[i++];
        if ((byte4 & 0xC0) !== 0x80) {
          // wrong continuation
          i--;
          continue;
        }
        codepoint = (byte1 & 0x07) << 18 | (byte2 & 0x3F) << 12 | (byte3 & 0x3F) << 6 | (byte4 & 0x3F);
        if (codepoint < 0x010000 || codepoint > 0x10FFFF) {
          // illegal codepoint, no i-- here
          continue;
        }
        target[size++] = codepoint;
      } else {
        // illegal byte, just skip
      }
    }
    return size;
  }
}