diff --git a/icu4c/source/data/mappings/gb18030-2022.ucm b/icu4c/source/data/mappings/gb18030-2022.ucm index c85538940a55..7f02d06856bf 100644 --- a/icu4c/source/data/mappings/gb18030-2022.ucm +++ b/icu4c/source/data/mappings/gb18030-2022.ucm @@ -5,6 +5,12 @@ # ICU codepage data for GB 18030-2022 +# This data file was originally generated from the mapping tables +# published with the original (year 2000) GB18030 standard. +# It has been updated for the 2005 version of GB18030 (ICU-8274 & ICU-8427) +# and for the 2022 version (ICU-22357). +# ICU-22420 then made minor mapping changes for GBK and web data/WHATWG compatibility. + <code_set_name> "gb18030-2022" <char_name_mask> "AXXXX" <mb_cur_max> 4 @@ -23,7 +29,8 @@ # The second line is commented out (and does not count) # because the state table is hand-optimized and does not use what would be # the natural path for the encoding scheme. -<icu:state> 0-7f, 81:6, 82:7, 83:8, 84:9, 85-fe:3 +# ICU-22420 makes 0x80 valid for the GBK encoding of the Euro sign. +<icu:state> 0-80, 81:6, 82:7, 83:8, 84:9, 85-fe:3 # <icu:state> 30-39:2, 40-7e, 80-fe <icu:state> 81-fe:2 <icu:state> 30-39 @@ -56,6 +63,18 @@ CHARMAP +# ICU-22420 reverse fallbacks for compatibility with GBK and other web data as in WHATWG. +# U+20AC = EURO SIGN (normally \xA2\xE3) +# U+3000 = IDEOGRAPHIC SPACE (normally \xA1\xA1) +# +# PUA U+E5E5 used to round-trip to \xA3\xA0, as specified in GB18030. +# Now that \xA3\xA0 maps to U+3000 (“reverse fallback” mapping), +# we use a “good one-way” mapping from U+E5E5 to \xA3\xA0 +# for maximum compatibility with previous behavior.
+<U20AC> \x80 |3 +<U3000> \xA3\xA0 |3 +<UE5E5> \xA3\xA0 |4 + <U0000> \x00 |0 <U0001> \x01 |0 <U0002> \x02 |0 @@ -29602,7 +29621,7 @@ CHARMAP <UE5E2> \xA3\x9D |0 <UE5E3> \xA3\x9E |0 <UE5E4> \xA3\x9F |0 -<UE5E5> \xA3\xA0 |0 +# <UE5E5> \xA3\xA0 |0 <UE5E6> \xA4\x40 |0 <UE5E7> \xA4\x41 |0 <UE5E8> \xA4\x42 |0 diff --git a/icu4c/source/test/testdata/conversion.txt b/icu4c/source/test/testdata/conversion.txt index 290925ef31fa..8b430a53f89a 100644 --- a/icu4c/source/test/testdata/conversion.txt +++ b/icu4c/source/test/testdata/conversion.txt @@ -115,6 +115,14 @@ conversion:table(nofallback) { :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17,18,20 }, :int{1}, :int{0}, "", "&C", :bin{""} } + // GB18030: ICU-22420 adds two reverse fallbacks + { + "gb18030", + :bin{ 80a1a1a2e3a3a0 }, + "\u20AC\u3000\u20AC\u3000", + :intvector{ 0,1,3,5 }, + :int{1}, :int{0}, "", "&C", :bin{""} + } { "UTF-8", :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a }, diff --git a/icu4j/main/charset/src/main/resources/com/ibm/icu/impl/data/icudt74b/gb18030-2022.cnv b/icu4j/main/charset/src/main/resources/com/ibm/icu/impl/data/icudt74b/gb18030-2022.cnv index 4731af8d1f1b..fdc914760004 100644 Binary files a/icu4j/main/charset/src/main/resources/com/ibm/icu/impl/data/icudt74b/gb18030-2022.cnv and b/icu4j/main/charset/src/main/resources/com/ibm/icu/impl/data/icudt74b/gb18030-2022.cnv differ diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/data/testdata/conversion.res b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/data/testdata/conversion.res index a61b4b8d676c..8d66e5616880 100644 Binary files a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/data/testdata/conversion.res and b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/data/testdata/conversion.res differ