Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Full Width Chars Casing #45079

Merged
merged 6 commits into from
Nov 24, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
213 changes: 129 additions & 84 deletions src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,13 @@ struct SortHandle
SearchIteratorNode searchIteratorList[CompareOptionsMask + 1];
};

typedef struct { UChar* items; size_t size; } UCharList;

// Hiragana character range
static const UChar hiraganaStart = 0x3041;
static const UChar hiraganaEnd = 0x309e;
static const UChar hiraganaToKatakanaOffset = 0x30a1 - 0x3041;
// Length of the fullwidth characters from 'A' to 'Z'
// We'll use it to map the casing of the full width 'A' to 'Z' characters
static const int32_t FullWidthAlphabetRangeLength = 0xFF3A - 0xFF21 + 1;

// Mapping between half- and fullwidth characters.
// LowerChars are the characters that should sort lower than HigherChars
Expand Down Expand Up @@ -142,99 +143,101 @@ static int IsHalfFullHigherSymbol(UChar character)
}

/*
Gets a string of custom collation rules, if necessary.
Fill custom collation rules for ignoreKana cases.

Since the CompareOptions flags don't map 1:1 with ICU default functionality, we need to fall back to using
custom rules in order to support IgnoreKanaType and IgnoreWidth CompareOptions correctly.
*/
static UCharList* GetCustomRules(int32_t options, UColAttributeValue strength, int isIgnoreSymbols)
static void FillIgnoreKanaRules(UChar* completeRules, int32_t* fillIndex, int32_t completeRulesLength, int32_t isIgnoreKanaType)
{
int isIgnoreKanaType = (options & CompareOptionsIgnoreKanaType) == CompareOptionsIgnoreKanaType;
int isIgnoreWidth = (options & CompareOptionsIgnoreWidth) == CompareOptionsIgnoreWidth;

// kana differs at the tertiary level
int needsIgnoreKanaTypeCustomRule = isIgnoreKanaType && strength >= UCOL_TERTIARY;
int needsNotIgnoreKanaTypeCustomRule = !isIgnoreKanaType && strength < UCOL_TERTIARY;

// character width differs at the tertiary level
int needsIgnoreWidthCustomRule = isIgnoreWidth && strength >= UCOL_TERTIARY;
int needsNotIgnoreWidthCustomRule = !isIgnoreWidth && strength < UCOL_TERTIARY;
UChar compareChar = isIgnoreKanaType ? '=' : '<';

if (!(needsIgnoreKanaTypeCustomRule || needsNotIgnoreKanaTypeCustomRule || needsIgnoreWidthCustomRule || needsNotIgnoreWidthCustomRule))
return NULL;
assert((*fillIndex) + (4 * (hiraganaEnd - hiraganaStart + 1)) <= completeRulesLength);
if ((*fillIndex) + (4 * (hiraganaEnd - hiraganaStart + 1)) > completeRulesLength) // check the allocated the size
{
return;
}

UCharList* customRules = (UCharList*)malloc(sizeof(UCharList));
if (customRules == NULL)
for (UChar hiraganaChar = hiraganaStart; hiraganaChar <= hiraganaEnd; hiraganaChar++)
{
return NULL;
// Hiragana is the range 3041 to 3096 & 309D & 309E
if (hiraganaChar <= 0x3096 || hiraganaChar >= 0x309D) // characters between 3096 and 309D are not mapped to katakana
{
completeRules[*fillIndex] = '&';
completeRules[(*fillIndex) + 1] = hiraganaChar;
completeRules[(*fillIndex) + 2] = compareChar;
completeRules[(*fillIndex) + 3] = hiraganaChar + hiraganaToKatakanaOffset;
(*fillIndex) += 4;
}
}
}

// If we need to create customRules, the KanaType custom rule will be 88 kana characters * 4 = 352 chars long
// and the Width custom rule will be at most 212 halfwidth characters * 5 = 1060 chars long.
int capacity =
((needsIgnoreKanaTypeCustomRule || needsNotIgnoreKanaTypeCustomRule) ? 4 * (hiraganaEnd - hiraganaStart + 1) : 0) +
((needsIgnoreWidthCustomRule || needsNotIgnoreWidthCustomRule) ? 5 * g_HalfFullCharsLength : 0);
/*
Fill custom collation rules for ignoreWidth cases.

UChar* items;
customRules->items = items = (UChar*)malloc((size_t)capacity * sizeof(UChar));
if (customRules->items == NULL)
Since the CompareOptions flags don't map 1:1 with ICU default functionality, we need to fall back to using
custom rules in order to support IgnoreKanaType and IgnoreWidth CompareOptions correctly.
*/
static void FillIgnoreWidthRules(UChar* completeRules, int32_t* fillIndex, int32_t completeRulesLength, int32_t isIgnoreWidth, int32_t isIgnoreCase, int32_t isIgnoreSymbols)
{
UChar compareChar = isIgnoreWidth ? '=' : '<';

UChar lowerChar;
UChar higherChar;
int needsEscape;

assert((*fillIndex) + (5 * g_HalfFullCharsLength) <= completeRulesLength);
if ((*fillIndex) + (5 * g_HalfFullCharsLength) > completeRulesLength)
{
free(customRules);
return NULL;
return;
}

if (needsIgnoreKanaTypeCustomRule || needsNotIgnoreKanaTypeCustomRule)
for (int i = 0; i < g_HalfFullCharsLength; i++)
{
UChar compareChar = needsIgnoreKanaTypeCustomRule ? '=' : '<';

for (UChar hiraganaChar = hiraganaStart; hiraganaChar <= hiraganaEnd; hiraganaChar++)
lowerChar = g_HalfFullLowerChars[i];
higherChar = g_HalfFullHigherChars[i];
// the lower chars need to be checked for escaping since they contain ASCII punctuation
needsEscape = NeedsEscape(lowerChar);

// when isIgnoreSymbols is true and we are not ignoring width, check to see if
// this character is a symbol, and if so skip it
if (!(isIgnoreSymbols && (!isIgnoreWidth) && (needsEscape || IsHalfFullHigherSymbol(higherChar))))
{
// Hiragana is the range 3041 to 3096 & 309D & 309E
if (hiraganaChar <= 0x3096 || hiraganaChar >= 0x309D) // characters between 3096 and 309D are not mapped to katakana
completeRules[*fillIndex] = '&';
(*fillIndex)++;

if (needsEscape)
{
assert(items - customRules->items <= capacity - 4);
*(items++) = '&';
*(items++) = hiraganaChar;
*(items++) = compareChar;
*(items++) = hiraganaChar + hiraganaToKatakanaOffset;
completeRules[*fillIndex] = '\\';
(*fillIndex)++;
}

completeRules[*fillIndex] = lowerChar;
completeRules[(*fillIndex) + 1] = compareChar;
completeRules[(*fillIndex) + 2] = higherChar;
(*fillIndex) += 3;
}
}

if (needsIgnoreWidthCustomRule || needsNotIgnoreWidthCustomRule)
// When we have isIgnoreWidth is false, we sort the normal width latin alphabet characters before the full width latin alphabet characters
// e.g. `a` < `a` (\uFF41)
// This break the casing of the full width latin alphabet characters.
// e.g. `a` (\uFF41) == `A` (\uFF21).
// we are fixing back this case mapping here.
if (isIgnoreCase && (!isIgnoreWidth))
{
UChar compareChar = needsIgnoreWidthCustomRule ? '=' : '<';
assert((*fillIndex) + (FullWidthAlphabetRangeLength * 4) <= completeRulesLength);
const int UpperCaseToLowerCaseOffset = 0xFF41 - 0xFF21;
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any concern with current definition?


UChar lowerChar;
UChar higherChar;
int needsEscape;
for (int i = 0; i < g_HalfFullCharsLength; i++)
for (UChar ch = 0xFF21; ch <= 0xFF3A; ch++)
{
lowerChar = g_HalfFullLowerChars[i];
higherChar = g_HalfFullHigherChars[i];
// the lower chars need to be checked for escaping since they contain ASCII punctuation
needsEscape = NeedsEscape(lowerChar);

// when isIgnoreSymbols is true and we are not ignoring width, check to see if
// this character is a symbol, and if so skip it
if (!(isIgnoreSymbols && needsNotIgnoreWidthCustomRule && (needsEscape || IsHalfFullHigherSymbol(higherChar))))
{
assert(items - customRules->items <= capacity - 5);
*(items++) = '&';
if (needsEscape)
{
*(items++) = '\\';
}
*(items++) = lowerChar;
*(items++) = compareChar;
*(items++) = higherChar;
}
completeRules[*fillIndex] = '&';
completeRules[(*fillIndex) + 1] = ch + UpperCaseToLowerCaseOffset;
completeRules[(*fillIndex) + 2] = '=';
completeRules[(*fillIndex) + 3] = ch;
(*fillIndex) += 4;
}
}

customRules->size = (size_t)(items - customRules->items);

return customRules;
}

/*
Expand All @@ -247,9 +250,11 @@ static UCollator* CloneCollatorWithOptions(const UCollator* pCollator, int32_t o
{
UColAttributeValue strength = ucol_getStrength(pCollator);

int isIgnoreCase = (options & CompareOptionsIgnoreCase) == CompareOptionsIgnoreCase;
int isIgnoreNonSpace = (options & CompareOptionsIgnoreNonSpace) == CompareOptionsIgnoreNonSpace;
int isIgnoreSymbols = (options & CompareOptionsIgnoreSymbols) == CompareOptionsIgnoreSymbols;
int32_t isIgnoreCase = (options & CompareOptionsIgnoreCase) == CompareOptionsIgnoreCase;
int32_t isIgnoreNonSpace = (options & CompareOptionsIgnoreNonSpace) == CompareOptionsIgnoreNonSpace;
int32_t isIgnoreSymbols = (options & CompareOptionsIgnoreSymbols) == CompareOptionsIgnoreSymbols;
int32_t isIgnoreKanaType = (options & CompareOptionsIgnoreKanaType) == CompareOptionsIgnoreKanaType;
int32_t isIgnoreWidth = (options & CompareOptionsIgnoreWidth) == CompareOptionsIgnoreWidth;

if (isIgnoreCase)
{
Expand All @@ -262,34 +267,74 @@ static UCollator* CloneCollatorWithOptions(const UCollator* pCollator, int32_t o
}

UCollator* pClonedCollator;
UCharList* customRules = GetCustomRules(options, strength, isIgnoreSymbols);
if (customRules == NULL || customRules->size == 0)

// IgnoreWidth - it would be easy to IgnoreWidth by just setting Strength <= Secondary.
// For any strength under that, the width of the characters will be ignored.
// For strength above that, the width of the characters will be used in differentiation.
// a. However, this doesn’t play nice with IgnoreCase, since these Strength levels are overloaded.
// b. So the plan to support IgnoreWidth is to use customized rules.
// i. Since the character width is differentiated at “Tertiary” strength, we only need to use custom rules in specific cases.
// ii. If (IgnoreWidth == true && Strength > “Secondary”)
// 1. Build up a custom rule set for each half-width character and say that it is equal to the corresponding full-width character.
// a. ex: “0x30F2 = 0xFF66 & 0x30F3 = 0xFF9D & …”
// iii. If (IgnoreWidth == false && Strength <= “Secondary”)
// 1. Build up a custom rule set saying that the half-width and full-width characters have a primary level difference (which will cause it always to be unequal)
// a. Ex. “0x30F2 < 0xFF66 & 0x30F3 < 0xFF9D & …”
// IgnoreKanaType – this works the same way as IgnoreWidth, it uses the set of Hiragana and Katakana characters instead of half-width vs full-width characters to build the rules.
int32_t applyIgnoreKanaTypeCustomRule = isIgnoreKanaType ^ (strength < UCOL_TERTIARY); // kana differs at the tertiary level
int32_t applyIgnoreWidthTypeCustomRule = isIgnoreWidth ^ (strength < UCOL_TERTIARY); // character width differs at the tertiary level

int32_t customRuleLength = 0;
if (applyIgnoreKanaTypeCustomRule || applyIgnoreWidthTypeCustomRule)
{
// If we need to create customRules, the KanaType custom rule will be 88 kana characters * 4 = 352 chars long
// and the Width custom rule will be at most 212 halfwidth characters * 5 = 1060 chars long.
customRuleLength = (applyIgnoreKanaTypeCustomRule ? 4 * (hiraganaEnd - hiraganaStart + 1) : 0) +
(applyIgnoreWidthTypeCustomRule ? ((5 * g_HalfFullCharsLength) + (isIgnoreCase ? 4 * FullWidthAlphabetRangeLength : 0)) : 0) +
1; // Adding extra terminator rule at the end to force ICU apply last actual entered rule, otherwise last actual rule get ignored.
}

if (customRuleLength == 0)
{
pClonedCollator = ucol_safeClone(pCollator, NULL, NULL, pErr);
}
else
{
int32_t customRuleLength = (int32_t)customRules->size;

int32_t localeRulesLength;
const UChar* localeRules = ucol_getRules(pCollator, &localeRulesLength);
int32_t completeRulesLength = localeRulesLength + customRuleLength + 1;
int32_t rulesLength;
safern marked this conversation as resolved.
Show resolved Hide resolved
const UChar* localeRules = ucol_getRules(pCollator, &rulesLength);
int32_t completeRulesLength = rulesLength + customRuleLength + 1;

UChar* completeRules = (UChar*)calloc((size_t)completeRulesLength, sizeof(UChar));

for (int i = 0; i < localeRulesLength; i++)
for (int i = 0; i < rulesLength; i++)
{
completeRules[i] = localeRules[i];
}
for (int i = 0; i < customRuleLength; i++)

if (applyIgnoreKanaTypeCustomRule)
{
completeRules[localeRulesLength + i] = customRules->items[i];
FillIgnoreKanaRules(completeRules, &rulesLength, completeRulesLength, isIgnoreKanaType);
}

pClonedCollator = ucol_openRules(completeRules, completeRulesLength, UCOL_DEFAULT, strength, NULL, pErr);
assert(rulesLength <= completeRulesLength);

if (applyIgnoreWidthTypeCustomRule)
{
FillIgnoreWidthRules(completeRules, &rulesLength, completeRulesLength, isIgnoreWidth, isIgnoreCase, isIgnoreSymbols);
}

assert(rulesLength + 4 <= completeRulesLength);

// Adding extra terminator rule at the end to force ICU apply last actual entered rule, otherwise last actual rule get ignored.
completeRules[rulesLength] = '&';
completeRules[rulesLength + 1] = 'a';
completeRules[rulesLength + 2] = '=';
completeRules[rulesLength + 3] = 'a';
rulesLength += 4;

pClonedCollator = ucol_openRules(completeRules, rulesLength, UCOL_DEFAULT, strength, NULL, pErr);
free(completeRules);
}
free(customRules);

if (isIgnoreSymbols)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -455,5 +455,31 @@ public void Compare_Invalid()
AssertExtensions.Throws<ArgumentOutOfRangeException>("string2", () => s_invariantCompare.Compare("Test", 0, 2, "Test", 2, 3));
AssertExtensions.Throws<ArgumentOutOfRangeException>("string2", () => s_invariantCompare.Compare("Test", 0, 2, "Test", 2, 3, CompareOptions.None));
}

[Fact]
public void TestIgnoreKanaAndWidthCases()
{
for (char c = '\uFF41'; c <= '\uFF5A'; c++) // Full width 'a' to `z`
{
Assert.False(string.Equals(new string(c, 1), new string((char) (c - 0x20), 1), StringComparison.InvariantCulture), $"Expected '{(int)c:x4}' != '{c - 0x20:x4}'");
Assert.True(string.Equals(new string(c, 1), new string((char) (c - 0x20), 1), StringComparison.InvariantCultureIgnoreCase), $"Expected '{(int)c:x4}' == '{c - 0x20:x4}'");
}

// Edge case of the Ignore Width.
Assert.False(string.Compare("\u3162\u3163", "\uFFDB\uFFDC", CultureInfo.InvariantCulture, CompareOptions.None) == 0, $"Expect '0x3162 0x3163' != '0xFFDB 0xFFDC'");
Assert.True(string.Compare("\u3162\u3163", "\uFFDB\uFFDC", CultureInfo.InvariantCulture, CompareOptions.IgnoreWidth) == 0, "Expect '0x3162 0x3163' == '0xFFDB 0xFFDC'");

const char hiraganaStart = '\u3041';
const char hiraganaEnd = '\u3096';
const int hiraganaToKatakanaOffset = 0x30a1 - 0x3041;

for (Char hiraganaChar = hiraganaStart; hiraganaChar <= hiraganaEnd; hiraganaChar++)
{
Assert.False(string.Compare(new string(hiraganaChar, 1), new string((char)(hiraganaChar + hiraganaToKatakanaOffset), 1), CultureInfo.InvariantCulture, CompareOptions.IgnoreCase) == 0,
$"Expect '{(int)hiraganaChar:x4}' != {(int)hiraganaChar + hiraganaToKatakanaOffset:x4} with CompareOptions.IgnoreCase");
Assert.True(string.Compare(new string(hiraganaChar, 1), new string((char)(hiraganaChar + hiraganaToKatakanaOffset), 1), CultureInfo.InvariantCulture, CompareOptions.IgnoreKanaType) == 0,
$"Expect '{(int)hiraganaChar:x4}' == {(int)hiraganaChar + hiraganaToKatakanaOffset:x4} with CompareOptions.IgnoreKanaType");
}
}
}
}