Skip to content

Commit

Permalink
Work around a UnicodeSet bug (unicode-org#908)
Browse files Browse the repository at this point in the history
Co-authored-by: Markus Scherer <[email protected]>
  • Loading branch information
eggrobin and markusicu authored Aug 9, 2024
1 parent cccbe93 commit 5845839
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 7 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.unicode.text.UCD;

import com.google.common.base.Strings;
import com.ibm.icu.dev.tool.UOption;
import com.ibm.icu.dev.util.UnicodeMap;
import com.ibm.icu.lang.UCharacter;
Expand All @@ -26,6 +27,7 @@
import java.util.Stack;
import java.util.TreeMap;
import java.util.function.Function;
import java.util.regex.MatchResult;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.unicode.cldr.draft.FileUtilities;
Expand Down Expand Up @@ -2007,17 +2009,63 @@ public BackwardParseException(String s, int errorOffset) {
}
}

private static Pattern nameEscape = Pattern.compile("\\\\N\\{[^}]*\\}");

public static UnicodeSet parseUnicodeSet(String source, ParsePosition pp)
throws ParseException {
final int initialPosition = pp.getIndex();
UnicodeSet icuSet;
try {
final var result = new UnicodeSet(source, pp, symbolTable);
return result;
// Let ICU figure out where the UnicodeSet expression ends.
icuSet = new UnicodeSet(source, pp, symbolTable);
} catch (IllegalArgumentException e) {
// ICU produces unhelpful messages when parsing UnicodeSet deep into
// a large string in a string that contains line terminators, as the
// whole string is escaped and printed.
final String message = e.getMessage().split(" at \"", 2)[0];
throw new BackwardParseException(message, pp.getIndex());
}
String unicodeSetExpression = source.substring(initialPosition, pp.getIndex());
// ICU incorrectly treats \N{X} as a synonym for \p{Name=X}, returning a
// set rather than a character, so that it can be empty, and so that
// \N{X}-\N{Y} is a set difference (equal to \N{X}) rather than the range \N{X}-\N{Y}.
// This should likely be fixed in ICU, but in the meantime we need to work around it in
// the invariant before someone gets hurt.
var matcher = nameEscape.matcher(unicodeSetExpression);
if (!matcher.find()) {
return icuSet;
}
// Simplest way for the lambda function to report errors.
// It cannot throw a ParseException, and it cannot modify local variables.
// It _can_ modify what this local variable points to.
// Below, we will throw a ParseException for the first bad position.
final List<Integer> badEscapePositions = new ArrayList<>();
unicodeSetExpression =
matcher.replaceAll(
(MatchResult match) -> {
UnicodeSet character =
new UnicodeSet(
match.group(), new ParsePosition(0), symbolTable);
if (character.isEmpty()) {
badEscapePositions.add(match.start());
return "";
}
return Strings.padStart(
"\\\\x{" + Integer.toHexString(character.charAt(0)) + "}",
match.group().length() + 1,
' ');
});
for (int p : badEscapePositions) {
// Simplest way to throw an exception for only the first list element.
throw new ParseException("No character matching \\N escape", initialPosition + p);
}
var patchedParsePosition = new ParsePosition(0);
try {
return new UnicodeSet(unicodeSetExpression, patchedParsePosition, symbolTable);
} catch (IllegalArgumentException e) {
final String message = e.getMessage().split(" at \"", 2)[0];
throw new BackwardParseException(
message, patchedParsePosition.getIndex() + initialPosition);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,7 @@ In \P{U-1:gc=Cn}, R-1:NFKC_Simple_Casefold = NFKC_Simple_Casefold
# Red Flag: cased and case_ignorable should be disjoint,
# except for modifier letters and ◌ͅ.

\p{Cased} ∥ [\p{Case_Ignorable} - \p{gc=Lm} - \N{COMBINING GREEK YPOGEGRAMMENI} ]
\p{Cased} ∥ [\p{Case_Ignorable} - \p{gc=Lm} - [\N{COMBINING GREEK YPOGEGRAMMENI}] ]

##########################
# Property Stability Policies
Expand Down Expand Up @@ -837,8 +837,8 @@ Let $QUInclusions := [\u275F-\u2760 \U0001F676-\U0001F678 \u0022 \u0027 \u275B-\
# 7.0 added 275F..2760 1F676..1F678
\p{LB=QU} = [\p{GC=Pf} \p{GC=Pi} $QUInclusions]
\p{LB=SG} = \p{GC=Cs}
\p{LB=SP} = \N{SPACE}
\p{LB=SY} = \N{SOLIDUS}
\p{LB=SP} = [\N{SPACE}]
\p{LB=SY} = [\N{SOLIDUS}]

# The classes WJ and ZW are reserved for characters meant as explicit overrides
# to the line breaking algorithms. For historical reasons, WJ has two such
Expand All @@ -848,7 +848,7 @@ Let $QUInclusions := [\u275F-\u2760 \U0001F676-\U0001F678 \u0022 \u0027 \u275B-\
# behaviour requires the prohibition of line breaks on either side should have
# class GL.
\p{LB=WJ} = [\N{WORD JOINER} \N{ZERO WIDTH NO-BREAK SPACE}]
\p{LB=ZW} = \N{ZERO WIDTH SPACE}
\p{LB=ZW} = [\N{ZERO WIDTH SPACE}]

\p{LB=ZWJ} = [\u200D]
\p{LB=RI} = \p{RI=Y}
Expand Down Expand Up @@ -1249,7 +1249,7 @@ Let $cjkStrokes := \p{Name=/^CJK STROKE /}
Let $kangxiRadicals := \p{Name=/^KANGXI RADICAL /}
Let $cjkRadicals := \p{Name=/^CJK RADICAL /}
Let $strokesAndRadicals := [ $cjkStrokes $kangxiRadicals $cjkRadicals ]
Let $nonIdeographicRadicals := \N{CJK RADICAL REPEAT}
Let $nonIdeographicRadicals := [\N{CJK RADICAL REPEAT}]
# The following set may expand over time, if strokes are added.
# It can also shrink, if single-stroke ideographs are encoded.
Let $nonIdeographicStrokes := \p{Name=/^CJK STROKE (T|WG|XG|BXG|SW|HZZ|HP|HZWG|SZWG|HZT|HZZP|HPWG|HZW|HZZZ|PG|Q|HXG|SZP)$/}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,13 @@
import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;

import java.io.IOException;
import java.text.ParseException;
import java.text.ParsePosition;
import org.junit.jupiter.api.Test;
import org.unicode.text.UCD.TestUnicodeInvariants.BackwardParseException;
import org.unicode.text.utility.Settings;

public class TestTestUnicodeInvariants {
Expand Down Expand Up @@ -49,4 +53,31 @@ void testSecurityInvariants() throws IOException {
TestUnicodeInvariants.testInvariants("SecurityInvariantTest.txt", "security", true);
assertEquals(0, rc, "TestUnicodeInvariants.testInvariants(security) failed");
}

@Test
void testUnicodeSetParsing() throws ParseException {
assertEquals(
26,
TestUnicodeInvariants.parseUnicodeSet(
"TEST [\\N{LATIN SMALL LETTER A}-\\N{LATIN SMALL LETTER Z}]",
new ParsePosition(5))
.size());
ParseException thrown =
assertThrows(
ParseException.class,
() ->
TestUnicodeInvariants.parseUnicodeSet(
"TEST [\\N{MEOW}]", new ParsePosition(5)));
assertEquals("No character matching \\N escape", thrown.getMessage());
assertEquals("TEST [".length(), thrown.getErrorOffset());
thrown =
assertThrows(
BackwardParseException.class,
() ->
TestUnicodeInvariants.parseUnicodeSet(
"TEST [[a-z]-\\N{LATIN SMALL LETTER Z}]",
new ParsePosition(5)));
assertEquals("Error: Set expected after operator", thrown.getMessage());
assertEquals("TEST [[a-z]-.N{LATIN SMALL LETTER Z}".length(), thrown.getErrorOffset());
}
}

0 comments on commit 5845839

Please sign in to comment.