From f062f52c123f436eb1142115ba2e4e7b65a4ac8f Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Fri, 6 Sep 2024 13:47:14 -0700 Subject: [PATCH] ICU-22294 UTS46 transitional=deprecated, change DEFAULT --- icu4c/source/common/unicode/idna.h | 3 + icu4c/source/common/unicode/uidna.h | 22 +- icu4c/source/test/intltest/uts46test.cpp | 23 ++ .../src/main/java/com/ibm/icu/text/IDNA.java | 289 +++++++++--------- .../icu/dev/test/normalizer/UTS46Test.java | 18 ++ 5 files changed, 217 insertions(+), 138 deletions(-) diff --git a/icu4c/source/common/unicode/idna.h b/icu4c/source/common/unicode/idna.h index 1c57205bae2e..1e36fa771f06 100644 --- a/icu4c/source/common/unicode/idna.h +++ b/icu4c/source/common/unicode/idna.h @@ -70,6 +70,7 @@ class U_COMMON_API IDNA : public UObject { * The worker functions use transitional processing, including deviation mappings, * unless UIDNA_NONTRANSITIONAL_TO_ASCII or UIDNA_NONTRANSITIONAL_TO_UNICODE * is used in which case the deviation characters are passed through without change. + * Unicode 15.1 UTS #46 deprecated transitional processing. * * Disallowed characters are mapped to U+FFFD. * @@ -82,6 +83,8 @@ class U_COMMON_API IDNA : public UObject { * letters, digits, hyphen (LDH) and dot/full stop are disallowed and mapped to U+FFFD. * * @param options Bit set to modify the processing and error checking. + * These should include UIDNA_DEFAULT, or + * UIDNA_NONTRANSITIONAL_TO_ASCII | UIDNA_NONTRANSITIONAL_TO_UNICODE. * See option bit set values in uidna.h. * @param errorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns diff --git a/icu4c/source/common/unicode/uidna.h b/icu4c/source/common/unicode/uidna.h index 24a81ceaddf5..362a2dcbe65a 100644 --- a/icu4c/source/common/unicode/uidna.h +++ b/icu4c/source/common/unicode/uidna.h @@ -49,11 +49,19 @@ */ enum { /** - * Default options value: None of the other options are set. 
+ * Default options value: UTS #46 nontransitional processing. * For use in static worker and factory methods. + * + * Since ICU 76, this is the same as + * UIDNA_NONTRANSITIONAL_TO_ASCII | UIDNA_NONTRANSITIONAL_TO_UNICODE, + * corresponding to Unicode 15.1 UTS #46 deprecating transitional processing. + * (These options are ignored by the IDNA2003 implementation.) + * + * Before ICU 76, this constant did not set any of the options. + * * @stable ICU 2.6 */ - UIDNA_DEFAULT=0, + UIDNA_DEFAULT=0x30, #ifndef U_HIDE_DEPRECATED_API /** * Option to allow unassigned code points in domain names and labels. @@ -91,19 +99,27 @@ enum { /** * IDNA option for nontransitional processing in ToASCII(). * For use in static worker and factory methods. + * *

By default, ToASCII() uses transitional processing. + * Unicode 15.1 UTS #46 deprecated transitional processing. + * *

This option is ignored by the IDNA2003 implementation. * (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.) * @stable ICU 4.6 + * @see UIDNA_DEFAULT */ UIDNA_NONTRANSITIONAL_TO_ASCII=0x10, /** * IDNA option for nontransitional processing in ToUnicode(). * For use in static worker and factory methods. + * *

By default, ToUnicode() uses transitional processing. + * Unicode 15.1 UTS #46 deprecated transitional processing. + * *

This option is ignored by the IDNA2003 implementation. * (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.) * @stable ICU 4.6 + * @see UIDNA_DEFAULT */ UIDNA_NONTRANSITIONAL_TO_UNICODE=0x20, /** @@ -134,6 +150,8 @@ typedef struct UIDNA UIDNA; /**< C typedef for struct UIDNA. @stable ICU 4.6 */ * For details about the UTS #46 implementation see the IDNA C++ class in idna.h. * * @param options Bit set to modify the processing and error checking. + * These should include UIDNA_DEFAULT, or + * UIDNA_NONTRANSITIONAL_TO_ASCII | UIDNA_NONTRANSITIONAL_TO_UNICODE. * See option bit set values in uidna.h. * @param pErrorCode Standard ICU error code. Its input value must * pass the U_SUCCESS() test, or else the function returns diff --git a/icu4c/source/test/intltest/uts46test.cpp b/icu4c/source/test/intltest/uts46test.cpp index 73ff225eb3ca..8defd1c1935f 100644 --- a/icu4c/source/test/intltest/uts46test.cpp +++ b/icu4c/source/test/intltest/uts46test.cpp @@ -42,6 +42,7 @@ class UTS46Test : public IntlTest { void TestNotSTD3(); void TestInvalidPunycodeDigits(); void TestACELabelEdgeCases(); + void TestDefaultNontransitional(); void TestTooLong(); void TestSomeCases(); void IdnaTest(); @@ -88,6 +89,7 @@ void UTS46Test::runIndexedTest(int32_t index, UBool exec, const char *&name, cha TESTCASE_AUTO(TestNotSTD3); TESTCASE_AUTO(TestInvalidPunycodeDigits); TESTCASE_AUTO(TestACELabelEdgeCases); + TESTCASE_AUTO(TestDefaultNontransitional); TESTCASE_AUTO(TestTooLong); TESTCASE_AUTO(TestSomeCases); TESTCASE_AUTO(IdnaTest); @@ -354,6 +356,27 @@ void UTS46Test::TestACELabelEdgeCases() { } } +void UTS46Test::TestDefaultNontransitional() { + IcuTestErrorCode errorCode(*this, "TestDefaultNontransitional()"); + // Unicode 15.1 UTS #46 deprecated transitional processing. + // ICU 76 changed UIDNA_DEFAULT to set the nontransitional options. 
+ LocalPointer<IDNA> forZero(IDNA::createUTS46Instance(0, errorCode)); + LocalPointer<IDNA> forDefault(IDNA::createUTS46Instance(UIDNA_DEFAULT, errorCode)); + if(errorCode.isFailure()) { + return; + } + UnicodeString result; + IDNAInfo info; + forZero->labelToUnicode(u"Fⓤßẞ", result, info, errorCode); + assertEquals("forZero.toUnicode(Fⓤßẞ)", u"fussss", result); + forZero->labelToASCII(u"Fⓤßẞ", result, info, errorCode); + assertEquals("forZero.toASCII(Fⓤßẞ)", u"fussss", result); + forDefault->labelToUnicode(u"Fⓤßẞ", result, info, errorCode); + assertEquals("forDefault.toUnicode(Fⓤßẞ)", u"fußß", result); + forDefault->labelToASCII(u"Fⓤßẞ", result, info, errorCode); + assertEquals("forDefault.toASCII(Fⓤßẞ)", u"xn--fu-hiaa", result); +} + void UTS46Test::TestTooLong() { // ICU-13727: Limit input length for n^2 algorithm // where well-formed strings are at most 59 characters long. diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/IDNA.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/IDNA.java index 03da296fa709..7b66797370a5 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/IDNA.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/IDNA.java @@ -24,7 +24,7 @@ * The IDNA class is not intended for public subclassing. *

* The non-static methods implement UTS #46 and IDNA2008. - * IDNA2008 is implemented according to UTS #46, see getUTS46Instance(). + * IDNA2008 is implemented according to UTS #46, see {@link #getUTS46Instance(int)}. *

* IDNA2003 is obsolete. The static methods implement IDNA2003. They are all deprecated. *

@@ -32,35 +32,43 @@ *

* The static IDNA API methods implement the IDNA protocol as defined in the * IDNA RFC. - * The draft defines 2 operations: ToASCII and ToUnicode. Domain labels + * The draft defines 2 operations: ToASCII and ToUnicode. Domain labels * containing non-ASCII code points are required to be processed by * ToASCII operation before passing it to resolver libraries. Domain names * that are obtained from resolver libraries are required to be processed by * ToUnicode operation before displaying the domain name to the user. - * IDNA requires that implementations process input strings with - * Nameprep, - * which is a profile of Stringprep , - * and then with Punycode. - * Implementations of IDNA MUST fully implement Nameprep and Punycode; + * IDNA requires that implementations process input strings with + * Nameprep, + * which is a profile of Stringprep , + * and then with Punycode. + * Implementations of IDNA MUST fully implement Nameprep and Punycode; * neither Nameprep nor Punycode are optional. - * The input and output of ToASCII and ToUnicode operations are Unicode + * The input and output of ToASCII and ToUnicode operations are Unicode * and are designed to be chainable, i.e., applying ToASCII or ToUnicode operations * multiple times to an input string will yield the same result as applying the operation * once. - * ToUnicode(ToUnicode(ToUnicode...(ToUnicode(string)))) == ToUnicode(string) + * ToUnicode(ToUnicode(ToUnicode...(ToUnicode(string)))) == ToUnicode(string) * ToASCII(ToASCII(ToASCII...(ToASCII(string))) == ToASCII(string). - * + * * @author Ram Viswanadha, Markus Scherer * @stable ICU 2.8 */ public abstract class IDNA { - /** - * Default options value: None of the other options are set. + /** + * Default options value: UTS #46 nontransitional processing. * For use in static worker and factory methods. + * + *

Since ICU 76, this is the same as + * {@link #NONTRANSITIONAL_TO_ASCII} | {@link #NONTRANSITIONAL_TO_UNICODE}, + * corresponding to Unicode 15.1 UTS #46 deprecating transitional processing. + * (These options are ignored by the IDNA2003 implementation.) + * + *

Before ICU 76, this constant did not set any of the options. + * * @stable ICU 2.8 */ - public static final int DEFAULT = 0; - /** + public static final int DEFAULT = 0x30; + /** * Option to allow unassigned code points in domain names and labels. * For use in static worker and factory methods. *

This option is ignored by the UTS46 implementation. @@ -69,7 +77,7 @@ public abstract class IDNA { */ @Deprecated public static final int ALLOW_UNASSIGNED = 1; - /** + /** * Option to check whether the input conforms to the STD3 ASCII rules, * for example the restriction of labels to LDH characters * (ASCII Letters, Digits and Hyphen-Minus). @@ -96,7 +104,10 @@ public abstract class IDNA { /** * IDNA option for nontransitional processing in ToASCII(). * For use in static worker and factory methods. + * *

By default, ToASCII() uses transitional processing. + * Unicode 15.1 UTS #46 deprecated transitional processing. + * *

This option is ignored by the IDNA2003 implementation. * (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.) * @stable ICU 4.6 @@ -105,7 +116,10 @@ public abstract class IDNA { /** * IDNA option for nontransitional processing in ToUnicode(). * For use in static worker and factory methods. + * *

By default, ToUnicode() uses transitional processing. + * Unicode 15.1 UTS #46 deprecated transitional processing. + * *

This option is ignored by the IDNA2003 implementation. * (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.) * @stable ICU 4.6 @@ -133,8 +147,9 @@ public abstract class IDNA { * IDNA2003 and IDNA2008. *

* The worker functions use transitional processing, including deviation mappings, - * unless NONTRANSITIONAL_TO_ASCII or NONTRANSITIONAL_TO_UNICODE + * unless {@link #NONTRANSITIONAL_TO_ASCII} or {@link #NONTRANSITIONAL_TO_UNICODE} * is used in which case the deviation characters are passed through without change. + * Unicode 15.1 UTS #46 deprecated transitional processing. *

* Disallowed characters are mapped to U+FFFD. *

@@ -146,6 +161,8 @@ public abstract class IDNA { * letters, digits, hyphen (LDH) and dot/full stop are disallowed and mapped to U+FFFD. * * @param options Bit set to modify the processing and error checking. + * These should include {@link IDNA#DEFAULT}, or + * {@link IDNA#NONTRANSITIONAL_TO_ASCII} | {@link IDNA#NONTRANSITIONAL_TO_UNICODE}. * @return the UTS #46 IDNA instance, if successful * @stable ICU 4.6 */ @@ -474,22 +491,22 @@ protected IDNA() { * IDNA2003: This function implements the ToASCII operation as defined in the IDNA RFC. * This operation is done on single labels before sending it to something that expects * ASCII names. A label is an individual part of a domain name. Labels are usually - * separated by dots; e.g." "www.example.com" is composed of 3 labels + * separated by dots; e.g." "www.example.com" is composed of 3 labels * "www","example", and "com". * * @param src The input string to be processed * @param options A bit set of options: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points * and do not use STD3 ASCII rules - * If unassigned code points are found the operation fails with + * If unassigned code points are found the operation fails with * StringPrepParseException. * * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations - * If this option is set, the unassigned code points are in the input + * If this option is set, the unassigned code points are in the input * are treated as normal Unicode code points. - * + * * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions - * If this option is set and the input does not satisfy STD3 rules, + * If this option is set and the input does not satisfy STD3 rules, * the operation will fail with ParseException * @return StringBuffer the converted String * @throws StringPrepParseException When an error occurs for parsing a string. 
@@ -501,27 +518,27 @@ public static StringBuffer convertToASCII(String src, int options) UCharacterIterator iter = UCharacterIterator.getInstance(src); return convertToASCII(iter,options); } - + /** * IDNA2003: This function implements the ToASCII operation as defined in the IDNA RFC. * This operation is done on single labels before sending it to something that expects * ASCII names. A label is an individual part of a domain name. Labels are usually - * separated by dots; e.g." "www.example.com" is composed of 3 labels + * separated by dots; e.g." "www.example.com" is composed of 3 labels * "www","example", and "com". * * @param src The input string as StringBuffer to be processed * @param options A bit set of options: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points * and do not use STD3 ASCII rules - * If unassigned code points are found the operation fails with + * If unassigned code points are found the operation fails with * ParseException. * * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations - * If this option is set, the unassigned code points are in the input + * If this option is set, the unassigned code points are in the input * are treated as normal Unicode code points. - * + * * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions - * If this option is set and the input does not satisfy STD3 rules, + * If this option is set and the input does not satisfy STD3 rules, * the operation will fail with ParseException * @return StringBuffer the converted String * @deprecated ICU 55 Use UTS 46 instead via {@link #getUTS46Instance(int)}. @@ -532,27 +549,27 @@ public static StringBuffer convertToASCII(StringBuffer src, int options) UCharacterIterator iter = UCharacterIterator.getInstance(src); return convertToASCII(iter,options); } - + /** * IDNA2003: This function implements the ToASCII operation as defined in the IDNA RFC. 
* This operation is done on single labels before sending it to something that expects * ASCII names. A label is an individual part of a domain name. Labels are usually - * separated by dots; e.g." "www.example.com" is composed of 3 labels + * separated by dots; e.g." "www.example.com" is composed of 3 labels * "www","example", and "com". * * @param src The input string as UCharacterIterator to be processed * @param options A bit set of options: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points * and do not use STD3 ASCII rules - * If unassigned code points are found the operation fails with + * If unassigned code points are found the operation fails with * ParseException. * * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations - * If this option is set, the unassigned code points are in the input + * If this option is set, the unassigned code points are in the input * are treated as normal Unicode code points. - * + * * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions - * If this option is set and the input does not satisfy STD3 rules, + * If this option is set and the input does not satisfy STD3 rules, * the operation will fail with ParseException * @return StringBuffer the converted String * @deprecated ICU 55 Use UTS 46 instead via {@link #getUTS46Instance(int)}. @@ -565,29 +582,29 @@ public static StringBuffer convertToASCII(UCharacterIterator src, int options) /** * IDNA2003: Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC. - * This operation is done on complete domain names, e.g: "www.example.com". - * It is important to note that this operation can fail. If it fails, then the input + * This operation is done on complete domain names, e.g: "www.example.com". + * It is important to note that this operation can fail. 
If it fails, then the input * domain name cannot be used as an Internationalized Domain Name and the application * should have methods defined to deal with the failure. - * + * * Note: IDNA RFC specifies that a conformant application should divide a domain name - * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, - * and then convert. This function does not offer that level of granularity. The options once + * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, + * and then convert. This function does not offer that level of granularity. The options once * set will apply to all labels in the domain name * * @param src The input string as UCharacterIterator to be processed * @param options A bit set of options: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points * and do not use STD3 ASCII rules - * If unassigned code points are found the operation fails with + * If unassigned code points are found the operation fails with * ParseException. * * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations - * If this option is set, the unassigned code points are in the input + * If this option is set, the unassigned code points are in the input * are treated as normal Unicode code points. - * + * * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions - * If this option is set and the input does not satisfy STD3 rules, + * If this option is set and the input does not satisfy STD3 rules, * the operation will fail with ParseException * @return StringBuffer the converted String * @deprecated ICU 55 Use UTS 46 instead via {@link #getUTS46Instance(int)}. 
@@ -595,34 +612,34 @@ public static StringBuffer convertToASCII(UCharacterIterator src, int options) @Deprecated public static StringBuffer convertIDNToASCII(UCharacterIterator src, int options) throws StringPrepParseException{ - return convertIDNToASCII(src.getText(), options); + return convertIDNToASCII(src.getText(), options); } - + /** * IDNA2003: Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC. - * This operation is done on complete domain names, e.g: "www.example.com". - * It is important to note that this operation can fail. If it fails, then the input + * This operation is done on complete domain names, e.g: "www.example.com". + * It is important to note that this operation can fail. If it fails, then the input * domain name cannot be used as an Internationalized Domain Name and the application * should have methods defined to deal with the failure. - * + * * Note: IDNA RFC specifies that a conformant application should divide a domain name - * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, - * and then convert. This function does not offer that level of granularity. The options once + * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, + * and then convert. This function does not offer that level of granularity. The options once * set will apply to all labels in the domain name * * @param src The input string as a StringBuffer to be processed * @param options A bit set of options: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points * and do not use STD3 ASCII rules - * If unassigned code points are found the operation fails with + * If unassigned code points are found the operation fails with * ParseException. 
* * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations - * If this option is set, the unassigned code points are in the input + * If this option is set, the unassigned code points are in the input * are treated as normal Unicode code points. - * + * * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions - * If this option is set and the input does not satisfy STD3 rules, + * If this option is set and the input does not satisfy STD3 rules, * the operation will fail with ParseException * @return StringBuffer the converted String * @deprecated ICU 55 Use UTS 46 instead via {@link #getUTS46Instance(int)}. @@ -630,34 +647,34 @@ public static StringBuffer convertIDNToASCII(UCharacterIterator src, int options @Deprecated public static StringBuffer convertIDNToASCII(StringBuffer src, int options) throws StringPrepParseException{ - return convertIDNToASCII(src.toString(), options); + return convertIDNToASCII(src.toString(), options); } - + /** * IDNA2003: Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC. - * This operation is done on complete domain names, e.g: "www.example.com". - * It is important to note that this operation can fail. If it fails, then the input + * This operation is done on complete domain names, e.g: "www.example.com". + * It is important to note that this operation can fail. If it fails, then the input * domain name cannot be used as an Internationalized Domain Name and the application * should have methods defined to deal with the failure. - * + * * Note: IDNA RFC specifies that a conformant application should divide a domain name - * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, - * and then convert. This function does not offer that level of granularity. The options once + * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, + * and then convert. 
This function does not offer that level of granularity. The options once * set will apply to all labels in the domain name * * @param src The input string to be processed * @param options A bit set of options: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points * and do not use STD3 ASCII rules - * If unassigned code points are found the operation fails with + * If unassigned code points are found the operation fails with * ParseException. * * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations - * If this option is set, the unassigned code points are in the input + * If this option is set, the unassigned code points are in the input * are treated as normal Unicode code points. - * + * * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions - * If this option is set and the input does not satisfy STD3 rules, + * If this option is set and the input does not satisfy STD3 rules, * the operation will fail with ParseException * @return StringBuffer the converted String * @deprecated ICU 55 Use UTS 46 instead via {@link #getUTS46Instance(int)}. @@ -668,27 +685,27 @@ public static StringBuffer convertIDNToASCII(String src,int options) return IDNA2003.convertIDNToASCII(src, options); } - + /** * IDNA2003: This function implements the ToUnicode operation as defined in the IDNA RFC. * This operation is done on single labels before sending it to something that expects * Unicode names. A label is an individual part of a domain name. Labels are usually - * separated by dots; for e.g." "www.example.com" is composed of 3 labels + * separated by dots; for e.g." "www.example.com" is composed of 3 labels * "www","example", and "com". 
- * + * * @param src The input string to be processed * @param options A bit set of options: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points * and do not use STD3 ASCII rules - * If unassigned code points are found the operation fails with + * If unassigned code points are found the operation fails with * ParseException. * * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations - * If this option is set, the unassigned code points are in the input + * If this option is set, the unassigned code points are in the input * are treated as normal Unicode code points. - * + * * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions - * If this option is set and the input does not satisfy STD3 rules, + * If this option is set and the input does not satisfy STD3 rules, * the operation will fail with ParseException * @return StringBuffer the converted String * @deprecated ICU 55 Use UTS 46 instead via {@link #getUTS46Instance(int)}. @@ -699,27 +716,27 @@ public static StringBuffer convertToUnicode(String src, int options) UCharacterIterator iter = UCharacterIterator.getInstance(src); return convertToUnicode(iter,options); } - + /** * IDNA2003: This function implements the ToUnicode operation as defined in the IDNA RFC. * This operation is done on single labels before sending it to something that expects * Unicode names. A label is an individual part of a domain name. Labels are usually - * separated by dots; for e.g." "www.example.com" is composed of 3 labels + * separated by dots; for e.g." "www.example.com" is composed of 3 labels * "www","example", and "com". 
- * + * * @param src The input string as StringBuffer to be processed * @param options A bit set of options: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points * and do not use STD3 ASCII rules - * If unassigned code points are found the operation fails with + * If unassigned code points are found the operation fails with * ParseException. * * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations - * If this option is set, the unassigned code points are in the input + * If this option is set, the unassigned code points are in the input * are treated as normal Unicode code points. - * + * * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions - * If this option is set and the input does not satisfy STD3 rules, + * If this option is set and the input does not satisfy STD3 rules, * the operation will fail with ParseException * @return StringBuffer the converted String * @deprecated ICU 55 Use UTS 46 instead via {@link #getUTS46Instance(int)}. @@ -730,27 +747,27 @@ public static StringBuffer convertToUnicode(StringBuffer src, int options) UCharacterIterator iter = UCharacterIterator.getInstance(src); return convertToUnicode(iter,options); } - + /** * IDNA2003: Function that implements the ToUnicode operation as defined in the IDNA RFC. * This operation is done on single labels before sending it to something that expects * Unicode names. A label is an individual part of a domain name. Labels are usually - * separated by dots; for e.g." "www.example.com" is composed of 3 labels + * separated by dots; for e.g." "www.example.com" is composed of 3 labels * "www","example", and "com". 
- * + * * @param src The input string as UCharacterIterator to be processed * @param options A bit set of options: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points * and do not use STD3 ASCII rules - * If unassigned code points are found the operation fails with + * If unassigned code points are found the operation fails with * ParseException. * * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations - * If this option is set, the unassigned code points are in the input + * If this option is set, the unassigned code points are in the input * are treated as normal Unicode code points. - * + * * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions - * If this option is set and the input does not satisfy STD3 rules, + * If this option is set and the input does not satisfy STD3 rules, * the operation will fail with ParseException * @return StringBuffer the converted String * @deprecated ICU 55 Use UTS 46 instead via {@link #getUTS46Instance(int)}. @@ -760,29 +777,29 @@ public static StringBuffer convertToUnicode(UCharacterIterator src, int options) throws StringPrepParseException{ return IDNA2003.convertToUnicode(src, options); } - + /** * IDNA2003: Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC. - * This operation is done on complete domain names, e.g: "www.example.com". + * This operation is done on complete domain names, e.g: "www.example.com". * * Note: IDNA RFC specifies that a conformant application should divide a domain name - * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, - * and then convert. This function does not offer that level of granularity. The options once + * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, + * and then convert. This function does not offer that level of granularity. 
The options once * set will apply to all labels in the domain name * * @param src The input string as UCharacterIterator to be processed * @param options A bit set of options: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points * and do not use STD3 ASCII rules - * If unassigned code points are found the operation fails with + * If unassigned code points are found the operation fails with * ParseException. * * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations - * If this option is set, the unassigned code points are in the input + * If this option is set, the unassigned code points are in the input * are treated as normal Unicode code points. - * + * * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions - * If this option is set and the input does not satisfy STD3 rules, + * If this option is set and the input does not satisfy STD3 rules, * the operation will fail with ParseException * @return StringBuffer the converted String * @deprecated ICU 55 Use UTS 46 instead via {@link #getUTS46Instance(int)}. @@ -792,29 +809,29 @@ public static StringBuffer convertIDNToUnicode(UCharacterIterator src, int optio throws StringPrepParseException{ return convertIDNToUnicode(src.getText(), options); } - + /** * IDNA2003: Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC. - * This operation is done on complete domain names, e.g: "www.example.com". + * This operation is done on complete domain names, e.g: "www.example.com". * * Note: IDNA RFC specifies that a conformant application should divide a domain name - * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, - * and then convert. This function does not offer that level of granularity. The options once + * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, + * and then convert. 
This function does not offer that level of granularity. The options once * set will apply to all labels in the domain name * * @param src The input string as StringBuffer to be processed * @param options A bit set of options: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points * and do not use STD3 ASCII rules - * If unassigned code points are found the operation fails with + * If unassigned code points are found the operation fails with * ParseException. * * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations - * If this option is set, the unassigned code points are in the input + * If this option is set, the unassigned code points are in the input * are treated as normal Unicode code points. - * + * * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions - * If this option is set and the input does not satisfy STD3 rules, + * If this option is set and the input does not satisfy STD3 rules, * the operation will fail with ParseException * @return StringBuffer the converted String * @deprecated ICU 55 Use UTS 46 instead via {@link #getUTS46Instance(int)}. @@ -824,29 +841,29 @@ public static StringBuffer convertIDNToUnicode(StringBuffer src, int options) throws StringPrepParseException{ return convertIDNToUnicode(src.toString(), options); } - + /** * IDNA2003: Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC. - * This operation is done on complete domain names, e.g: "www.example.com". + * This operation is done on complete domain names, e.g: "www.example.com". * * Note: IDNA RFC specifies that a conformant application should divide a domain name - * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, - * and then convert. This function does not offer that level of granularity. 
The options once + * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, + * and then convert. This function does not offer that level of granularity. The options once * set will apply to all labels in the domain name * * @param src The input string to be processed * @param options A bit set of options: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points * and do not use STD3 ASCII rules - * If unassigned code points are found the operation fails with + * If unassigned code points are found the operation fails with * ParseException. * * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations - * If this option is set, the unassigned code points are in the input + * If this option is set, the unassigned code points are in the input * are treated as normal Unicode code points. - * + * * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions - * If this option is set and the input does not satisfy STD3 rules, + * If this option is set and the input does not satisfy STD3 rules, * the operation will fail with ParseException * @return StringBuffer the converted String * @deprecated ICU 55 Use UTS 46 instead via {@link #getUTS46Instance(int)}. @@ -856,30 +873,30 @@ public static StringBuffer convertIDNToUnicode(String src, int options) throws StringPrepParseException{ return IDNA2003.convertIDNToUnicode(src, options); } - + /** * IDNA2003: Compare two IDN strings for equivalence. * This function splits the domain names into labels and compares them. - * According to IDN RFC, whenever two labels are compared, they are - * considered equal if and only if their ASCII forms (obtained by + * According to IDN RFC, whenever two labels are compared, they are + * considered equal if and only if their ASCII forms (obtained by * applying toASCII) match using an case-insensitive ASCII comparison. 
- * Two domain names are considered a match if and only if all labels + * Two domain names are considered a match if and only if all labels * match regardless of whether label separators match. - * + * * @param s1 First IDN string as StringBuffer * @param s2 Second IDN string as StringBuffer * @param options A bit set of options: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points * and do not use STD3 ASCII rules - * If unassigned code points are found the operation fails with + * If unassigned code points are found the operation fails with * ParseException. * * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations - * If this option is set, the unassigned code points are in the input + * If this option is set, the unassigned code points are in the input * are treated as normal Unicode code points. - * + * * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions - * If this option is set and the input does not satisfy STD3 rules, + * If this option is set and the input does not satisfy STD3 rules, * the operation will fail with ParseException * @return 0 if the strings are equal, > 0 if s1 > s2 and < 0 if s1 < s2 * @deprecated ICU 55 Use UTS 46 instead via {@link #getUTS46Instance(int)}. @@ -892,30 +909,30 @@ public static int compare(StringBuffer s1, StringBuffer s2, int options) } return IDNA2003.compare(s1.toString(), s2.toString(), options); } - + /** * IDNA2003: Compare two IDN strings for equivalence. * This function splits the domain names into labels and compares them. - * According to IDN RFC, whenever two labels are compared, they are - * considered equal if and only if their ASCII forms (obtained by + * According to IDN RFC, whenever two labels are compared, they are + * considered equal if and only if their ASCII forms (obtained by * applying toASCII) match using an case-insensitive ASCII comparison. 
- * Two domain names are considered a match if and only if all labels + * Two domain names are considered a match if and only if all labels * match regardless of whether label separators match. - * - * @param s1 First IDN string + * + * @param s1 First IDN string * @param s2 Second IDN string * @param options A bit set of options: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points * and do not use STD3 ASCII rules - * If unassigned code points are found the operation fails with + * If unassigned code points are found the operation fails with * ParseException. * * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations - * If this option is set, the unassigned code points are in the input + * If this option is set, the unassigned code points are in the input * are treated as normal Unicode code points. - * + * * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions - * If this option is set and the input does not satisfy STD3 rules, + * If this option is set and the input does not satisfy STD3 rules, * the operation will fail with ParseException * @return 0 if the strings are equal, > 0 if s1 > s2 and < 0 if s1 < s2 * @deprecated ICU 55 Use UTS 46 instead via {@link #getUTS46Instance(int)}. @@ -930,26 +947,26 @@ public static int compare(String s1, String s2, int options) throws StringPrepPa /** * IDNA2003: Compare two IDN strings for equivalence. * This function splits the domain names into labels and compares them. - * According to IDN RFC, whenever two labels are compared, they are - * considered equal if and only if their ASCII forms (obtained by + * According to IDN RFC, whenever two labels are compared, they are + * considered equal if and only if their ASCII forms (obtained by * applying toASCII) match using an case-insensitive ASCII comparison. 
- * Two domain names are considered a match if and only if all labels + * Two domain names are considered a match if and only if all labels * match regardless of whether label separators match. - * + * * @param s1 First IDN string as UCharacterIterator * @param s2 Second IDN string as UCharacterIterator * @param options A bit set of options: * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points * and do not use STD3 ASCII rules - * If unassigned code points are found the operation fails with + * If unassigned code points are found the operation fails with * ParseException. * * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations - * If this option is set, the unassigned code points are in the input + * If this option is set, the unassigned code points are in the input * are treated as normal Unicode code points. - * + * * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions - * If this option is set and the input does not satisfy STD3 rules, + * If this option is set and the input does not satisfy STD3 rules, * the operation will fail with ParseException * @return 0 if the strings are equal, > 0 if i1 > i2 and < 0 if i1 < i2 * @deprecated ICU 55 Use UTS 46 instead via {@link #getUTS46Instance(int)}. diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/normalizer/UTS46Test.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/normalizer/UTS46Test.java index db56e877697f..8824e02df8f2 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/normalizer/UTS46Test.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/normalizer/UTS46Test.java @@ -179,6 +179,24 @@ public void TestACELabelEdgeCases() { info.getErrors().contains(IDNA.Error.INVALID_ACE_LABEL)); } + @Test + public void TestDefaultNontransitional() { + // Unicode 15.1 UTS #46 deprecated transitional processing. + // ICU 76 changed IDNA.DEFAULT to set the nontransitional options. 
+ IDNA forZero = IDNA.getUTS46Instance(0); + IDNA forDefault = IDNA.getUTS46Instance(IDNA.DEFAULT); + StringBuilder result = new StringBuilder(); + IDNA.Info info = new IDNA.Info(); + forZero.labelToUnicode("Fⓤßẞ", result, info); + assertEquals("forZero.toUnicode(Fⓤßẞ)", "fussss", result.toString()); + forZero.labelToASCII("Fⓤßẞ", result, info); + assertEquals("forZero.toASCII(Fⓤßẞ)", "fussss", result.toString()); + forDefault.labelToUnicode("Fⓤßẞ", result, info); + assertEquals("forDefault.toUnicode(Fⓤßẞ)", "fußß", result.toString()); + forDefault.labelToASCII("Fⓤßẞ", result, info); + assertEquals("forDefault.toASCII(Fⓤßẞ)", "xn--fu-hiaa", result.toString()); + } + @Test public void TestTooLong() { // ICU-13727: Limit input length for n^2 algorithm