Skip to content

Commit

Permalink
[encoding] Add tests for GBK and gb18030 encoding
Browse files Browse the repository at this point in the history
These tests come from https://bugs.webkit.org/show_bug.cgi?id=218380
All tests pass in Firefox and in WebKit after that change.
Chrome has the following deviations from the standard:
1. The URL GBK encoding of the Yen sign (U+00A5) is %A3%A4 instead of %26%23165%3B
2. Decoding 0x80 with gb18030 results in a replacement character instead of the Euro sign (U+20A0)
3. The GBK decoder does not use the gb18030 decoder, which makes it less permissive.
  • Loading branch information
achristensen07 committed Nov 3, 2020
1 parent 1f01b02 commit 6d7cdfe
Show file tree
Hide file tree
Showing 5 changed files with 322 additions and 21 deletions.
21 changes: 0 additions & 21 deletions encoding/gb18030-encoder.html

This file was deleted.

57 changes: 57 additions & 0 deletions encoding/legacy-mb-schinese/gb18030/gb18030-decoder.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
<!doctype html>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script src=resources/ranges.js></script>
<div id=log></div>
<script>
decode = (input, output, desc) => {
test(function() {
for (encoding of ["gb18030", "gbk"])
assert_equals(new TextDecoder(encoding).decode(new Uint8Array(input)), output)
}, "gb18030 decoder: " + desc)
}

decode([115], "s", "ASCII")
decode([0x80], "\u20AC", "euro")
decode([0xFF], "\uFFFD", "initial byte out of accepted ranges")
decode([0x81], "\uFFFD", "end of queue, gb18030 first not 0")
decode([0x81, 0x28], "\ufffd(", "two bytes 0x81 0x28")
decode([0x81, 0x40], "\u4E02", "two bytes 0x81 0x40")
decode([0x81, 0x7E], "\u4E8A", "two bytes 0x81 0x7e")
decode([0x81, 0x7F], "\ufffd\u007f", "two bytes 0x81 0x7f")
decode([0x81, 0x80], "\u4E90", "two bytes 0x81 0x80")
decode([0x81, 0xFE], "\u4FA2", "two bytes 0x81 0xFE")
decode([0x81, 0xFF], "\ufffd", "two bytes 0x81 0xFF")
decode([0xFE, 0x40], "\uFA0C", "two bytes 0xFE 0x40")
decode([0xFE, 0xFE], "\uE4C5", "two bytes 0xFE 0xFE")
decode([0xFE, 0xFF], "\ufffd", "two bytes 0xFE 0xFF")
decode([0x81, 0x30], "\ufffd", "two bytes 0x81 0x30")
decode([0x81, 0x30, 0xFE], "\ufffd", "three bytes 0x81 0x30 0xFE")
decode([0x81, 0x30, 0xFF], "\ufffd0\ufffd", "three bytes 0x81 0x30 0xFF")
decode([0x81, 0x30, 0xFE, 0x29], "\ufffd0\ufffd)", "four bytes 0x81 0x30 0xFE 0x29")
decode([0xFE, 0x39, 0xFE, 0x39], "\ufffd", "four bytes 0xFE 0x39 0xFE 0x39")
decode([0x81, 0x35, 0xF4, 0x36], "\u1E3E", "pointer 7458")
decode([0x81, 0x35, 0xF4, 0x37], "\ue7c7", "pointer 7457")
decode([0x81, 0x35, 0xF4, 0x38], "\u1E40", "pointer 7459")
decode([0x84, 0x31, 0xA4, 0x39], "\uffff", "pointer 39419")
decode([0x84, 0x31, 0xA5, 0x30], "\ufffd", "pointer 39420")
decode([0x8F, 0x39, 0xFE, 0x39], "\ufffd", "pointer 189999")
decode([0x90, 0x30, 0x81, 0x30], "\u{10000}", "pointer 189000")
decode([0xE3, 0x32, 0x9A, 0x35], "\u{10FFFF}", "pointer 1237575")
decode([0xE3, 0x32, 0x9A, 0x36], "\ufffd", "pointer 1237576")
decode([0x83, 0x36, 0xC8, 0x30], "\uE7C8", "legacy ICU special case 1");
decode([0xA1, 0xAD], "\u2026", "legacy ICU special case 2");
decode([0xA1, 0xAB], "\uFF5E", "legacy ICU special case 3");


i = 0;
for (range of ranges) {
pointer = range[0];
decode([
Math.floor(pointer / 12600) + 0x81,
Math.floor((pointer % 12600) / 1260) + 0x30,
Math.floor((pointer % 1260) / 10) + 0x81,
pointer % 10 + 0x30
], range[1], "range " + i++)
}
</script>
48 changes: 48 additions & 0 deletions encoding/legacy-mb-schinese/gb18030/gb18030-encoder.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
<!doctype html>
<meta charset=gb18030>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script src=resources/ranges.js></script>
<div id=log></div>
<script>
encode = (input, output, desc) => {
test(function() {
var a = document.createElement("a") // <a> uses document encoding for URL's query
a.href = "https://example.com/?" + input
assert_equals(a.search.substr(1), output) // remove leading "?"
}, "gb18030 encoder: " + desc)
}

encode("s", "s", "very basic")
encode("\u20AC", "%A2%E3", "Euro")
encode("\u4E02", "%81@", "character")
encode("\uE4C6", "%A1@", "PUA")
encode("\uE4C5", "%FE%FE", "PUA #2")
encode("\uE5E5", "%26%2358853%3B", "PUA #3")
encode("\ud83d\udca9", "%949%DA3", "poo")
encode("\uE7C7", "%815%F47", "Ranges pointer special case")
encode("\uE7C8", "%836%C80", "legacy ICU special case 1");
encode("\u2026", "%A1%AD", "legacy ICU special case 2");
encode("\uFF5E", "%A1%AB", "legacy ICU special case 3");

upperCaseNibble = x => {
return Math.floor(x).toString(16).toUpperCase();
}

encodePointer = pointer => {
firstByte = Math.floor(pointer / 12600) + 0x81;
thirdByte = Math.floor((pointer % 1260) / 10) + 0x81;
return "%"
+ upperCaseNibble(firstByte / 16)
+ upperCaseNibble(firstByte % 16)
+ String.fromCharCode(Math.floor((pointer % 12600) / 1260) + 0x30)
+ "%"
+ upperCaseNibble(thirdByte / 16)
+ upperCaseNibble(thirdByte % 16)
+ String.fromCharCode(pointer % 10 + 0x30);
}

i = 0;
for (range of ranges)
encode(range[1], encodePointer(range[0]), "range " + i++)
</script>
210 changes: 210 additions & 0 deletions encoding/legacy-mb-schinese/gb18030/resources/ranges.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
// Based on https://encoding.spec.whatwg.org/index-gb18030-ranges.txt
var ranges = [
[0, "\u0080"],
[36, "\u00A5"],
[38, "\u00A9"],
[45, "\u00B2"],
[50, "\u00B8"],
[81, "\u00D8"],
[89, "\u00E2"],
[95, "\u00EB"],
[96, "\u00EE"],
[100, "\u00F4"],
[103, "\u00F8"],
[104, "\u00FB"],
[105, "\u00FD"],
[109, "\u0102"],
[126, "\u0114"],
[133, "\u011C"],
[148, "\u012C"],
[172, "\u0145"],
[175, "\u0149"],
[179, "\u014E"],
[208, "\u016C"],
[306, "\u01CF"],
[307, "\u01D1"],
[308, "\u01D3"],
[309, "\u01D5"],
[310, "\u01D7"],
[311, "\u01D9"],
[312, "\u01DB"],
[313, "\u01DD"],
[341, "\u01FA"],
[428, "\u0252"],
[443, "\u0262"],
[544, "\u02C8"],
[545, "\u02CC"],
[558, "\u02DA"],
[741, "\u03A2"],
[742, "\u03AA"],
[749, "\u03C2"],
[750, "\u03CA"],
[805, "\u0402"],
[819, "\u0450"],
[820, "\u0452"],
[7922, "\u2011"],
[7924, "\u2017"],
[7925, "\u201A"],
[7927, "\u201E"],
[7934, "\u2027"],
[7943, "\u2031"],
[7944, "\u2034"],
[7945, "\u2036"],
[7950, "\u203C"],
[8062, "\u20AD"],
[8148, "\u2104"],
[8149, "\u2106"],
[8152, "\u210A"],
[8164, "\u2117"],
[8174, "\u2122"],
[8236, "\u216C"],
[8240, "\u217A"],
[8262, "\u2194"],
[8264, "\u219A"],
[8374, "\u2209"],
[8380, "\u2210"],
[8381, "\u2212"],
[8384, "\u2216"],
[8388, "\u221B"],
[8390, "\u2221"],
[8392, "\u2224"],
[8393, "\u2226"],
[8394, "\u222C"],
[8396, "\u222F"],
[8401, "\u2238"],
[8406, "\u223E"],
[8416, "\u2249"],
[8419, "\u224D"],
[8424, "\u2253"],
[8437, "\u2262"],
[8439, "\u2268"],
[8445, "\u2270"],
[8482, "\u2296"],
[8485, "\u229A"],
[8496, "\u22A6"],
[8521, "\u22C0"],
[8603, "\u2313"],
[8936, "\u246A"],
[8946, "\u249C"],
[9046, "\u254C"],
[9050, "\u2574"],
[9063, "\u2590"],
[9066, "\u2596"],
[9076, "\u25A2"],
[9092, "\u25B4"],
[9100, "\u25BE"],
[9108, "\u25C8"],
[9111, "\u25CC"],
[9113, "\u25D0"],
[9131, "\u25E6"],
[9162, "\u2607"],
[9164, "\u260A"],
[9218, "\u2641"],
[9219, "\u2643"],
[11329, "\u2E82"],
[11331, "\u2E85"],
[11334, "\u2E89"],
[11336, "\u2E8D"],
[11346, "\u2E98"],
[11361, "\u2EA8"],
[11363, "\u2EAB"],
[11366, "\u2EAF"],
[11370, "\u2EB4"],
[11372, "\u2EB8"],
[11375, "\u2EBC"],
[11389, "\u2ECB"],
[11682, "\u2FFC"],
[11686, "\u3004"],
[11687, "\u3018"],
[11692, "\u301F"],
[11694, "\u302A"],
[11714, "\u303F"],
[11716, "\u3094"],
[11723, "\u309F"],
[11725, "\u30F7"],
[11730, "\u30FF"],
[11736, "\u312A"],
[11982, "\u322A"],
[11989, "\u3232"],
[12102, "\u32A4"],
[12336, "\u3390"],
[12348, "\u339F"],
[12350, "\u33A2"],
[12384, "\u33C5"],
[12393, "\u33CF"],
[12395, "\u33D3"],
[12397, "\u33D6"],
[12510, "\u3448"],
[12553, "\u3474"],
[12851, "\u359F"],
[12962, "\u360F"],
[12973, "\u361B"],
[13738, "\u3919"],
[13823, "\u396F"],
[13919, "\u39D1"],
[13933, "\u39E0"],
[14080, "\u3A74"],
[14298, "\u3B4F"],
[14585, "\u3C6F"],
[14698, "\u3CE1"],
[15583, "\u4057"],
[15847, "\u4160"],
[16318, "\u4338"],
[16434, "\u43AD"],
[16438, "\u43B2"],
[16481, "\u43DE"],
[16729, "\u44D7"],
[17102, "\u464D"],
[17122, "\u4662"],
[17315, "\u4724"],
[17320, "\u472A"],
[17402, "\u477D"],
[17418, "\u478E"],
[17859, "\u4948"],
[17909, "\u497B"],
[17911, "\u497E"],
[17915, "\u4984"],
[17916, "\u4987"],
[17936, "\u499C"],
[17939, "\u49A0"],
[17961, "\u49B8"],
[18664, "\u4C78"],
[18703, "\u4CA4"],
[18814, "\u4D1A"],
[18962, "\u4DAF"],
[19043, "\u9FA6"],
[33469, "\uE76C"],
[33470, "\uE7C8"],
[33471, "\uE7E7"],
[33484, "\uE815"],
[33485, "\uE819"],
[33490, "\uE81F"],
[33497, "\uE827"],
[33501, "\uE82D"],
[33505, "\uE833"],
[33513, "\uE83C"],
[33520, "\uE844"],
[33536, "\uE856"],
[33550, "\uE865"],
[37845, "\uF92D"],
[37921, "\uF97A"],
[37948, "\uF996"],
[38029, "\uF9E8"],
[38038, "\uF9F2"],
[38064, "\uFA10"],
[38065, "\uFA12"],
[38066, "\uFA15"],
[38069, "\uFA19"],
[38075, "\uFA22"],
[38076, "\uFA25"],
[38078, "\uFA2A"],
[39108, "\uFE32"],
[39109, "\uFE45"],
[39113, "\uFE53"],
[39114, "\uFE58"],
[39115, "\uFE67"],
[39116, "\uFE6C"],
[39265, "\uFF5F"],
[39394, "\uFFE6"],
[189000, "\u{10000}"]
];
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,11 @@
encode("\uE4C6", "%A1@", "PUA")
encode("\uE4C5", "%FE%FE", "PUA #2")
encode("\ud83d\udca9", "%26%23128169%3B", "poo")
encode("\uE7C8", "%26%2359336%3B", "legacy ICU special case 1");
encode("\u2026", "%A1%AD", "legacy ICU special case 2");
encode("\uFF5E", "%A1%AB", "legacy ICU special case 3");
encode("\u00A5", "%26%23165%3B", "legacy WebKit case 1");
encode("\u22EF", "%26%238943%3B", "legacy WebKit case 2");
encode("\u301C", "%26%2312316%3B", "legacy WebKit case 3");

</script>

0 comments on commit 6d7cdfe

Please sign in to comment.