diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c index a9eb1d57d74c7..99edfd96bcf1f 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c @@ -59,12 +59,13 @@ struct SortHandle SearchIteratorNode searchIteratorList[CompareOptionsMask + 1]; }; -typedef struct { UChar* items; size_t size; } UCharList; - // Hiragana character range static const UChar hiraganaStart = 0x3041; static const UChar hiraganaEnd = 0x309e; static const UChar hiraganaToKatakanaOffset = 0x30a1 - 0x3041; +// Length of the fullwidth characters from 'A' to 'Z' +// We'll use it to map the casing of the full width 'A' to 'Z' characters +static const int32_t FullWidthAlphabetRangeLength = 0xFF3A - 0xFF21 + 1; // Mapping between half- and fullwidth characters. // LowerChars are the characters that should sort lower than HigherChars @@ -142,99 +143,101 @@ static int IsHalfFullHigherSymbol(UChar character) } /* -Gets a string of custom collation rules, if necessary. +Fill custom collation rules for ignoreKana cases. Since the CompareOptions flags don't map 1:1 with ICU default functionality, we need to fall back to using custom rules in order to support IgnoreKanaType and IgnoreWidth CompareOptions correctly. */ -static UCharList* GetCustomRules(int32_t options, UColAttributeValue strength, int isIgnoreSymbols) +static void FillIgnoreKanaRules(UChar* completeRules, int32_t* fillIndex, int32_t completeRulesLength, int32_t isIgnoreKanaType) { - int isIgnoreKanaType = (options & CompareOptionsIgnoreKanaType) == CompareOptionsIgnoreKanaType; - int isIgnoreWidth = (options & CompareOptionsIgnoreWidth) == CompareOptionsIgnoreWidth; - - // kana differs at the tertiary level - int needsIgnoreKanaTypeCustomRule = isIgnoreKanaType && strength >= UCOL_TERTIARY; - int needsNotIgnoreKanaTypeCustomRule = !isIgnoreKanaType && strength < UCOL_TERTIARY; - - // character width differs at the tertiary level - int needsIgnoreWidthCustomRule = isIgnoreWidth && strength >= UCOL_TERTIARY; - int needsNotIgnoreWidthCustomRule = !isIgnoreWidth && strength < UCOL_TERTIARY; + UChar compareChar = isIgnoreKanaType ? '=' : '<'; - if (!(needsIgnoreKanaTypeCustomRule || needsNotIgnoreKanaTypeCustomRule || needsIgnoreWidthCustomRule || needsNotIgnoreWidthCustomRule)) - return NULL; + assert((*fillIndex) + (4 * (hiraganaEnd - hiraganaStart + 1)) <= completeRulesLength); + if ((*fillIndex) + (4 * (hiraganaEnd - hiraganaStart + 1)) > completeRulesLength) // check the allocated the size + { + return; + } - UCharList* customRules = (UCharList*)malloc(sizeof(UCharList)); - if (customRules == NULL) + for (UChar hiraganaChar = hiraganaStart; hiraganaChar <= hiraganaEnd; hiraganaChar++) { - return NULL; + // Hiragana is the range 3041 to 3096 & 309D & 309E + if (hiraganaChar <= 0x3096 || hiraganaChar >= 0x309D) // characters between 3096 and 309D are not mapped to katakana + { + completeRules[*fillIndex] = '&'; + completeRules[(*fillIndex) + 1] = hiraganaChar; + completeRules[(*fillIndex) + 2] = compareChar; + completeRules[(*fillIndex) + 3] = hiraganaChar + hiraganaToKatakanaOffset; + (*fillIndex) += 4; + } } +} - // If we need to create customRules, the KanaType custom rule will be 88 kana characters * 4 = 352 chars long - // and the Width custom rule will be at most 212 halfwidth characters * 5 = 1060 chars long. - int capacity = - ((needsIgnoreKanaTypeCustomRule || needsNotIgnoreKanaTypeCustomRule) ? 4 * (hiraganaEnd - hiraganaStart + 1) : 0) + - ((needsIgnoreWidthCustomRule || needsNotIgnoreWidthCustomRule) ? 5 * g_HalfFullCharsLength : 0); +/* +Fill custom collation rules for ignoreWidth cases. - UChar* items; - customRules->items = items = (UChar*)malloc((size_t)capacity * sizeof(UChar)); - if (customRules->items == NULL) +Since the CompareOptions flags don't map 1:1 with ICU default functionality, we need to fall back to using +custom rules in order to support IgnoreKanaType and IgnoreWidth CompareOptions correctly. +*/ +static void FillIgnoreWidthRules(UChar* completeRules, int32_t* fillIndex, int32_t completeRulesLength, int32_t isIgnoreWidth, int32_t isIgnoreCase, int32_t isIgnoreSymbols) +{ + UChar compareChar = isIgnoreWidth ? '=' : '<'; + + UChar lowerChar; + UChar higherChar; + int needsEscape; + + assert((*fillIndex) + (5 * g_HalfFullCharsLength) <= completeRulesLength); + if ((*fillIndex) + (5 * g_HalfFullCharsLength) > completeRulesLength) { - free(customRules); - return NULL; + return; } - if (needsIgnoreKanaTypeCustomRule || needsNotIgnoreKanaTypeCustomRule) + for (int i = 0; i < g_HalfFullCharsLength; i++) { - UChar compareChar = needsIgnoreKanaTypeCustomRule ? '=' : '<'; - - for (UChar hiraganaChar = hiraganaStart; hiraganaChar <= hiraganaEnd; hiraganaChar++) + lowerChar = g_HalfFullLowerChars[i]; + higherChar = g_HalfFullHigherChars[i]; + // the lower chars need to be checked for escaping since they contain ASCII punctuation + needsEscape = NeedsEscape(lowerChar); + + // when isIgnoreSymbols is true and we are not ignoring width, check to see if + // this character is a symbol, and if so skip it + if (!(isIgnoreSymbols && (!isIgnoreWidth) && (needsEscape || IsHalfFullHigherSymbol(higherChar)))) { - // Hiragana is the range 3041 to 3096 & 309D & 309E - if (hiraganaChar <= 0x3096 || hiraganaChar >= 0x309D) // characters between 3096 and 309D are not mapped to katakana + completeRules[*fillIndex] = '&'; + (*fillIndex)++; + + if (needsEscape) { - assert(items - customRules->items <= capacity - 4); - *(items++) = '&'; - *(items++) = hiraganaChar; - *(items++) = compareChar; - *(items++) = hiraganaChar + hiraganaToKatakanaOffset; + completeRules[*fillIndex] = '\\'; + (*fillIndex)++; } + + completeRules[*fillIndex] = lowerChar; + completeRules[(*fillIndex) + 1] = compareChar; + completeRules[(*fillIndex) + 2] = higherChar; + (*fillIndex) += 3; } } - if (needsIgnoreWidthCustomRule || needsNotIgnoreWidthCustomRule) + // When we have isIgnoreWidth is false, we sort the normal width latin alphabet characters before the full width latin alphabet characters + // e.g. `a` < `a` (\uFF41) + // This break the casing of the full width latin alphabet characters. + // e.g. `a` (\uFF41) == `A` (\uFF21). + // we are fixing back this case mapping here. + if (isIgnoreCase && (!isIgnoreWidth)) { - UChar compareChar = needsIgnoreWidthCustomRule ? '=' : '<'; + assert((*fillIndex) + (FullWidthAlphabetRangeLength * 4) <= completeRulesLength); + const int UpperCaseToLowerCaseOffset = 0xFF41 - 0xFF21; - UChar lowerChar; - UChar higherChar; - int needsEscape; - for (int i = 0; i < g_HalfFullCharsLength; i++) + for (UChar ch = 0xFF21; ch <= 0xFF3A; ch++) { - lowerChar = g_HalfFullLowerChars[i]; - higherChar = g_HalfFullHigherChars[i]; - // the lower chars need to be checked for escaping since they contain ASCII punctuation - needsEscape = NeedsEscape(lowerChar); - - // when isIgnoreSymbols is true and we are not ignoring width, check to see if - // this character is a symbol, and if so skip it - if (!(isIgnoreSymbols && needsNotIgnoreWidthCustomRule && (needsEscape || IsHalfFullHigherSymbol(higherChar)))) - { - assert(items - customRules->items <= capacity - 5); - *(items++) = '&'; - if (needsEscape) - { - *(items++) = '\\'; - } - *(items++) = lowerChar; - *(items++) = compareChar; - *(items++) = higherChar; - } + completeRules[*fillIndex] = '&'; + completeRules[(*fillIndex) + 1] = ch + UpperCaseToLowerCaseOffset; + completeRules[(*fillIndex) + 2] = '='; + completeRules[(*fillIndex) + 3] = ch; + (*fillIndex) += 4; } } - - customRules->size = (size_t)(items - customRules->items); - - return customRules; } /* @@ -247,9 +250,11 @@ static UCollator* CloneCollatorWithOptions(const UCollator* pCollator, int32_t o { UColAttributeValue strength = ucol_getStrength(pCollator); - int isIgnoreCase = (options & CompareOptionsIgnoreCase) == CompareOptionsIgnoreCase; - int isIgnoreNonSpace = (options & CompareOptionsIgnoreNonSpace) == CompareOptionsIgnoreNonSpace; - int isIgnoreSymbols = (options & CompareOptionsIgnoreSymbols) == CompareOptionsIgnoreSymbols; + int32_t isIgnoreCase = (options & CompareOptionsIgnoreCase) == CompareOptionsIgnoreCase; + int32_t isIgnoreNonSpace = (options & CompareOptionsIgnoreNonSpace) == CompareOptionsIgnoreNonSpace; + int32_t isIgnoreSymbols = (options & CompareOptionsIgnoreSymbols) == CompareOptionsIgnoreSymbols; + int32_t isIgnoreKanaType = (options & CompareOptionsIgnoreKanaType) == CompareOptionsIgnoreKanaType; + int32_t isIgnoreWidth = (options & CompareOptionsIgnoreWidth) == CompareOptionsIgnoreWidth; if (isIgnoreCase) { @@ -262,34 +267,74 @@ static UCollator* CloneCollatorWithOptions(const UCollator* pCollator, int32_t o } UCollator* pClonedCollator; - UCharList* customRules = GetCustomRules(options, strength, isIgnoreSymbols); - if (customRules == NULL || customRules->size == 0) + + // IgnoreWidth - it would be easy to IgnoreWidth by just setting Strength <= Secondary. + // For any strength under that, the width of the characters will be ignored. + // For strength above that, the width of the characters will be used in differentiation. + // a. However, this doesn’t play nice with IgnoreCase, since these Strength levels are overloaded. + // b. So the plan to support IgnoreWidth is to use customized rules. + // i. Since the character width is differentiated at “Tertiary” strength, we only need to use custom rules in specific cases. + // ii. If (IgnoreWidth == true && Strength > “Secondary”) + // 1. Build up a custom rule set for each half-width character and say that it is equal to the corresponding full-width character. + // a. ex: “0x30F2 = 0xFF66 & 0x30F3 = 0xFF9D & …” + // iii. If (IgnoreWidth == false && Strength <= “Secondary”) + // 1. Build up a custom rule set saying that the half-width and full-width characters have a primary level difference (which will cause it always to be unequal) + // a. Ex. “0x30F2 < 0xFF66 & 0x30F3 < 0xFF9D & …” + // IgnoreKanaType – this works the same way as IgnoreWidth, it uses the set of Hiragana and Katakana characters instead of half-width vs full-width characters to build the rules. + int32_t applyIgnoreKanaTypeCustomRule = isIgnoreKanaType ^ (strength < UCOL_TERTIARY); // kana differs at the tertiary level + int32_t applyIgnoreWidthTypeCustomRule = isIgnoreWidth ^ (strength < UCOL_TERTIARY); // character width differs at the tertiary level + + int32_t customRuleLength = 0; + if (applyIgnoreKanaTypeCustomRule || applyIgnoreWidthTypeCustomRule) + { + // If we need to create customRules, the KanaType custom rule will be 88 kana characters * 4 = 352 chars long + // and the Width custom rule will be at most 212 halfwidth characters * 5 = 1060 chars long. + customRuleLength = (applyIgnoreKanaTypeCustomRule ? 4 * (hiraganaEnd - hiraganaStart + 1) : 0) + + (applyIgnoreWidthTypeCustomRule ? ((5 * g_HalfFullCharsLength) + (isIgnoreCase ? 4 * FullWidthAlphabetRangeLength : 0)) : 0) + + 1; // Adding extra terminator rule at the end to force ICU apply last actual entered rule, otherwise last actual rule get ignored. + } + + if (customRuleLength == 0) { pClonedCollator = ucol_safeClone(pCollator, NULL, NULL, pErr); } else { - int32_t customRuleLength = (int32_t)customRules->size; - - int32_t localeRulesLength; - const UChar* localeRules = ucol_getRules(pCollator, &localeRulesLength); - int32_t completeRulesLength = localeRulesLength + customRuleLength + 1; + int32_t rulesLength; + const UChar* localeRules = ucol_getRules(pCollator, &rulesLength); + int32_t completeRulesLength = rulesLength + customRuleLength + 1; UChar* completeRules = (UChar*)calloc((size_t)completeRulesLength, sizeof(UChar)); - for (int i = 0; i < localeRulesLength; i++) + for (int i = 0; i < rulesLength; i++) { completeRules[i] = localeRules[i]; } - for (int i = 0; i < customRuleLength; i++) + + if (applyIgnoreKanaTypeCustomRule) { - completeRules[localeRulesLength + i] = customRules->items[i]; + FillIgnoreKanaRules(completeRules, &rulesLength, completeRulesLength, isIgnoreKanaType); } - pClonedCollator = ucol_openRules(completeRules, completeRulesLength, UCOL_DEFAULT, strength, NULL, pErr); + assert(rulesLength <= completeRulesLength); + + if (applyIgnoreWidthTypeCustomRule) + { + FillIgnoreWidthRules(completeRules, &rulesLength, completeRulesLength, isIgnoreWidth, isIgnoreCase, isIgnoreSymbols); + } + + assert(rulesLength + 4 <= completeRulesLength); + + // Adding extra terminator rule at the end to force ICU apply last actual entered rule, otherwise last actual rule get ignored. + completeRules[rulesLength] = '&'; + completeRules[rulesLength + 1] = 'a'; + completeRules[rulesLength + 2] = '='; + completeRules[rulesLength + 3] = 'a'; + rulesLength += 4; + + pClonedCollator = ucol_openRules(completeRules, rulesLength, UCOL_DEFAULT, strength, NULL, pErr); free(completeRules); } - free(customRules); if (isIgnoreSymbols) { diff --git a/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.Compare.cs b/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.Compare.cs index 57704146c8b68..98890c0c4fdbf 100644 --- a/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.Compare.cs +++ b/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.Compare.cs @@ -455,5 +455,31 @@ public void Compare_Invalid() AssertExtensions.Throws("string2", () => s_invariantCompare.Compare("Test", 0, 2, "Test", 2, 3)); AssertExtensions.Throws("string2", () => s_invariantCompare.Compare("Test", 0, 2, "Test", 2, 3, CompareOptions.None)); } + + [Fact] + public void TestIgnoreKanaAndWidthCases() + { + for (char c = '\uFF41'; c <= '\uFF5A'; c++) // Full width 'a' to `z` + { + Assert.False(string.Equals(new string(c, 1), new string((char) (c - 0x20), 1), StringComparison.InvariantCulture), $"Expected '{(int)c:x4}' != '{c - 0x20:x4}'"); + Assert.True(string.Equals(new string(c, 1), new string((char) (c - 0x20), 1), StringComparison.InvariantCultureIgnoreCase), $"Expected '{(int)c:x4}' == '{c - 0x20:x4}'"); + } + + // Edge case of the Ignore Width. + Assert.False(string.Compare("\u3162\u3163", "\uFFDB\uFFDC", CultureInfo.InvariantCulture, CompareOptions.None) == 0, $"Expect '0x3162 0x3163' != '0xFFDB 0xFFDC'"); + Assert.True(string.Compare("\u3162\u3163", "\uFFDB\uFFDC", CultureInfo.InvariantCulture, CompareOptions.IgnoreWidth) == 0, "Expect '0x3162 0x3163' == '0xFFDB 0xFFDC'"); + + const char hiraganaStart = '\u3041'; + const char hiraganaEnd = '\u3096'; + const int hiraganaToKatakanaOffset = 0x30a1 - 0x3041; + + for (Char hiraganaChar = hiraganaStart; hiraganaChar <= hiraganaEnd; hiraganaChar++) + { + Assert.False(string.Compare(new string(hiraganaChar, 1), new string((char)(hiraganaChar + hiraganaToKatakanaOffset), 1), CultureInfo.InvariantCulture, CompareOptions.IgnoreCase) == 0, + $"Expect '{(int)hiraganaChar:x4}' != {(int)hiraganaChar + hiraganaToKatakanaOffset:x4} with CompareOptions.IgnoreCase"); + Assert.True(string.Compare(new string(hiraganaChar, 1), new string((char)(hiraganaChar + hiraganaToKatakanaOffset), 1), CultureInfo.InvariantCulture, CompareOptions.IgnoreKanaType) == 0, + $"Expect '{(int)hiraganaChar:x4}' == {(int)hiraganaChar + hiraganaToKatakanaOffset:x4} with CompareOptions.IgnoreKanaType"); + } + } } }