Skip to content

Commit

Permalink
ICU-21071 Fix lenient parse rules
Browse files Browse the repository at this point in the history
- Check non-lenient rules before call lenint parsing
- Remove logKnownIssue 9503 from test code
- Adjust TestAllLocales test on ICU4C
- Add lenient checks on ICU4J
  • Loading branch information
robertgmelo committed Apr 24, 2020
1 parent 5944e18 commit 440cef6
Show file tree
Hide file tree
Showing 6 changed files with 156 additions and 59 deletions.
16 changes: 13 additions & 3 deletions icu4c/source/i18n/nfrule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1297,6 +1297,10 @@ NFRule::prefixLength(const UnicodeString& str, const UnicodeString& prefix, UErr
#if !UCONFIG_NO_COLLATION
// go through all this grief if we're in lenient-parse mode
if (formatter->isLenient()) {
// Check if non-lenient rule finds the text before call lenient parsing
if (str.startsWith(prefix)) {
return prefix.length();
}
// get the formatter's collator and use it to create two
// collation element iterators, one over the target string
// and another over the prefix (right now, we'll throw an
Expand Down Expand Up @@ -1505,9 +1509,15 @@ NFRule::findText(const UnicodeString& str,
return str.indexOf(key, startingAt);
}
else {
// but if lenient parsing is turned ON, we've got some work
// ahead of us
return findTextLenient(str, key, startingAt, length);
// Check if non-lenient rule finds the text before call lenient parsing
*length = key.length();
int32_t pos = str.indexOf(key, startingAt);
if(pos >= 0) {
return pos;
} else {
// but if lenient parsing is turned ON, we've got some work ahead of us
return findTextLenient(str, key, startingAt, length);
}
}
}

Expand Down
12 changes: 9 additions & 3 deletions icu4c/source/i18n/plurfmt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -549,9 +549,15 @@ void PluralFormat::parseType(const UnicodeString& source, const NFRule *rbnfLeni

UnicodeString currArg = pattern.tempSubString(partStart->getLimit(), partLimit->getIndex() - partStart->getLimit());
if (rbnfLenientScanner != NULL) {
// If lenient parsing is turned ON, we've got some time consuming parsing ahead of us.
int32_t length = -1;
currMatchIndex = rbnfLenientScanner->findTextLenient(source, currArg, startingAt, &length);
// Check if non-lenient rule finds the text before call lenient parsing
int32_t tempIndex = source.indexOf(currArg, startingAt);
if (tempIndex >= 0) {
currMatchIndex = tempIndex;
} else {
// If lenient parsing is turned ON, we've got some time consuming parsing ahead of us.
int32_t length = -1;
currMatchIndex = rbnfLenientScanner->findTextLenient(source, currArg, startingAt, &length);
}
}
else {
currMatchIndex = source.indexOf(currArg, startingAt);
Expand Down
51 changes: 26 additions & 25 deletions icu4c/source/test/intltest/itrbnf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1889,16 +1889,19 @@ IntlTestRBNF::TestAllLocales()
UErrorCode status = U_ZERO_ERROR;
RuleBasedNumberFormat* f = new RuleBasedNumberFormat((URBNFRuleSetTag)j, *loc, status);

if (status == U_USING_DEFAULT_WARNING || status == U_USING_FALLBACK_WARNING) {
// Skip it.
delete f;
break;
}
if (U_FAILURE(status)) {
errln(UnicodeString(loc->getName()) + names[j]
+ "ERROR could not instantiate -> " + u_errorName(status));
continue;
}

Locale actualLocale = f->getLocale(ULOC_ACTUAL_LOCALE, status);
if (actualLocale != *loc) {
// Skip the redundancy
delete f;
break;
}

#if !UCONFIG_NO_COLLATION
for (unsigned int numidx = 0; numidx < UPRV_LENGTHOF(numbers); numidx++) {
double n = numbers[numidx];
Expand Down Expand Up @@ -1936,28 +1939,26 @@ IntlTestRBNF::TestAllLocales()
+ UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getDouble());
}
}
if (!quick && !logKnownIssue("9503") ) {
// lenient parse
status = U_ZERO_ERROR;
f->setLenient(TRUE);
f->parse(str, num, status);
if (U_FAILURE(status)) {
// lenient parse
status = U_ZERO_ERROR;
f->setLenient(TRUE);
f->parse(str, num, status);
if (U_FAILURE(status)) {
errln(UnicodeString(loc->getName()) + names[j]
+ "ERROR could not parse(lenient) '" + str + "' -> " + u_errorName(status));
}
// We only check the spellout. The behavior is undefined for numbers < 1 and fractional numbers.
if (j == 0) {
if (num.getType() == Formattable::kLong && num.getLong() != n) {
errln(UnicodeString(loc->getName()) + names[j]
+ "ERROR could not parse(lenient) '" + str + "' -> " + u_errorName(status));
+ UnicodeString("ERROR could not roundtrip ") + n
+ UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getLong());
}
// We only check the spellout. The behavior is undefined for numbers < 1 and fractional numbers.
if (j == 0) {
if (num.getType() == Formattable::kLong && num.getLong() != n) {
errln(UnicodeString(loc->getName()) + names[j]
+ UnicodeString("ERROR could not roundtrip ") + n
+ UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getLong());
}
else if (num.getType() == Formattable::kDouble && (int64_t)(num.getDouble() * 1000) != (int64_t)(n*1000)) {
// The epsilon difference is too high.
errln(UnicodeString(loc->getName()) + names[j]
+ UnicodeString("ERROR could not roundtrip ") + n
+ UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getDouble());
}
else if (num.getType() == Formattable::kDouble && (int64_t)(num.getDouble() * 1000) != (int64_t)(n*1000)) {
// The epsilon difference is too high.
errln(UnicodeString(loc->getName()) + names[j]
+ UnicodeString("ERROR could not roundtrip ") + n
+ UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getDouble());
}
}
}
Expand Down
15 changes: 12 additions & 3 deletions icu4j/main/classes/core/src/com/ibm/icu/text/NFRule.java
Original file line number Diff line number Diff line change
Expand Up @@ -1241,6 +1241,10 @@ private int prefixLength(String str, String prefix) {

RbnfLenientScanner scanner = formatter.getLenientScanner();
if (scanner != null) {
// Check if non-lenient rule finds the text before call lenient parsing
if (str.startsWith(prefix)) {
return prefix.length();
}
return scanner.prefixLength(str, prefix);
}

Expand Down Expand Up @@ -1290,9 +1294,14 @@ private int[] findText(String str, String key, PluralFormat pluralFormatKey, int
}

if (scanner != null) {
// if lenient parsing is turned ON, we've got some work
// ahead of us
return scanner.findText(str, key, startingAt);
// Check if non-lenient rule finds the text before call lenient parsing
int pos[] = new int[] { str.indexOf(key, startingAt), key.length() };
if (pos[0] >= 0) {
return pos;
} else {
// if lenient parsing is turned ON, we've got some work ahead of us
return scanner.findText(str, key, startingAt);
}
}
// if lenient parsing is turned off, this is easy. Just call
// String.indexOf() and we're done
Expand Down
12 changes: 9 additions & 3 deletions icu4j/main/classes/core/src/com/ibm/icu/text/PluralFormat.java
Original file line number Diff line number Diff line change
Expand Up @@ -760,9 +760,15 @@ public Object parseObject(String source, ParsePosition pos) {

String currArg = pattern.substring(partStart.getLimit(), partLimit.getIndex());
if (scanner != null) {
// If lenient parsing is turned ON, we've got some time consuming parsing ahead of us.
int[] scannerMatchResult = scanner.findText(source, currArg, startingAt);
currMatchIndex = scannerMatchResult[0];
// Check if non-lenient rule finds the text before call lenient parsing
int tempPos = source.indexOf(currArg, startingAt);
if (tempPos >= 0) {
currMatchIndex = tempPos;
} else {
// If lenient parsing is turned ON, we've got some time consuming parsing ahead of us.
int[] scannerMatchResult = scanner.findText(source, currArg, startingAt);
currMatchIndex = scannerMatchResult[0];
}
}
else {
currMatchIndex = source.indexOf(currArg, startingAt);
Expand Down
109 changes: 87 additions & 22 deletions icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/RbnfTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,17 @@ public void TestEnglishSpellout() {
};

doTest(formatter, testData, true);

String[][] testDataLenient = {
{ "fifty-7", "57" },
{ " fifty-7", "57" },
{ " fifty-7", "57" },
{ "2 thousand six HUNDRED fifty-7", "2,657" },
{ "fifteen hundred and zero", "1,500" },
{ "FOurhundred thiRTY six", "436" }
};

doParsingTest(formatter, testDataLenient, true);
}

/**
Expand Down Expand Up @@ -350,6 +361,12 @@ public void TestDurations() {
};

doTest(formatter, testData, true);

String[][] testDataLenient = {
{ "2-51-33", "10,293" },
};

doParsingTest(formatter, testDataLenient, true);
}

/**
Expand Down Expand Up @@ -425,6 +442,13 @@ public void TestFrenchSpellout() {
};

doTest(formatter, testData, true);

String[][] testDataLenient = {
{ "trente-et-un", "31" },
{ "un cent quatre vingt dix huit", "198" },
};

doParsingTest(formatter, testDataLenient, true);
}

/**
Expand Down Expand Up @@ -529,6 +553,12 @@ public void TestGermanSpellout() {
};

doTest(formatter, testData, true);

String[][] testDataLenient = {
{ "ein Tausend sechs Hundert fuenfunddreissig", "1,635" },
};

doParsingTest(formatter, testDataLenient, true);
}

/**
Expand Down Expand Up @@ -1117,6 +1147,10 @@ public void TestAllLocales() {
" (ordinal) "
//" (duration) " // English only
};
boolean[] lenientMode = {
false, // non-lenient mode
true // lenient mode
};
double[] numbers = {45.678, 1, 2, 10, 11, 100, 110, 200, 1000, 1111, -1111};
int count = numbers.length;
Random r = (count <= numbers.length ? null : createRandom());
Expand All @@ -1142,25 +1176,25 @@ public void TestAllLocales() {
logln(loc.getName() + names[j] + "success format: " + n + " -> " + s);
}

try {
// RBNF parse is extremely slow when lenient option is enabled.
// non-lenient parse
fmt.setLenientParseMode(false);
Number num = fmt.parse(s);
if (isVerbose()) {
logln(loc.getName() + names[j] + "success parse: " + s + " -> " + num);
}
if (j != 0) {
// TODO: Fix the ordinal rules.
continue;
}
if (n != num.doubleValue()) {
errors.append("\n" + loc + names[j] + "got " + num + " expected " + n);
for (int k = 0; k < lenientMode.length; k++) {
try {
fmt.setLenientParseMode(lenientMode[k]);
Number num = fmt.parse(s);
if (isVerbose()) {
logln(loc.getName() + names[j] + "success parse: " + s + " -> " + num);
}
if (j != 0) {
// TODO: Fix the ordinal rules.
continue;
}
if (n != num.doubleValue()) {
errors.append("\n" + loc + names[j] + "got " + num + " expected " + n);
}
} catch (ParseException pe) {
String msg = loc.getName() + names[j] + "ERROR:" + pe.getMessage();
logln(msg);
errors.append("\n" + msg);
}
} catch (ParseException pe) {
String msg = loc.getName() + names[j] + "ERROR:" + pe.getMessage();
logln(msg);
errors.append("\n" + msg);
}
}
}
Expand All @@ -1170,10 +1204,12 @@ public void TestAllLocales() {
}
}

void doTest(RuleBasedNumberFormat formatter, String[][] testData,
boolean testParsing) {
// NumberFormat decFmt = NumberFormat.getInstance(Locale.US);
NumberFormat decFmt = new DecimalFormat("#,###.################");
NumberFormat createDecimalFormatter() {
return new DecimalFormat("#,###.################");
}

void doTest(RuleBasedNumberFormat formatter, String[][] testData, boolean testParsing) {
NumberFormat decFmt = createDecimalFormatter();
try {
for (int i = 0; i < testData.length; i++) {
String number = testData[i][0];
Expand Down Expand Up @@ -1207,6 +1243,35 @@ else if (testParsing) {
}
}

void doParsingTest(RuleBasedNumberFormat formatter, String[][] testData, boolean lenient) {
NumberFormat decFmt = createDecimalFormatter();

if (lenient) {
formatter.setLenientParseMode(true);
}
for (int i = 0; i < testData.length; i++) {
try {
String s = testData[i][0];
Number expectedNumber = decFmt.parse(testData[i][1]);
if (isVerbose()) {
logln("test[" + i + "] spellout value: (" + s + ") target: " + expectedNumber);
}

Number num = formatter.parse(s);
if (isVerbose()) {
logln("success parse: (" + s + ") -> " + num);
}

if (expectedNumber.doubleValue() != num.doubleValue()) {
errln("\nParsing (" + s + ") failed: got " + num + " expected " + expectedNumber);
}
} catch (Throwable e) {
e.printStackTrace();
errln("Test failed with exception: " + e.toString());
}
}
}

/* Tests the method
* public boolean equals(Object that)
*/
Expand Down

0 comments on commit 440cef6

Please sign in to comment.