ICU-21071 Fix lenient parse rules

- Check non-lenient rules before call lenint parsing - Remove logKnownIssue 9503 from test code - Adjust TestAllLocales test on ICU4C - Add lenient checks on ICU4J
unicode-org · Apr 24, 2020 · 440cef6 · 440cef6
1 parent 5944e18
commit 440cef6
Show file tree

Hide file tree

Showing 6 changed files with 156 additions and 59 deletions.
diff --git a/icu4c/source/i18n/nfrule.cpp b/icu4c/source/i18n/nfrule.cpp
@@ -1297,6 +1297,10 @@ NFRule::prefixLength(const UnicodeString& str, const UnicodeString& prefix, UErr
 #if !UCONFIG_NO_COLLATION
     // go through all this grief if we're in lenient-parse mode
     if (formatter->isLenient()) {
+        // Check if non-lenient rule finds the text before call lenient parsing
+        if (str.startsWith(prefix)) {
+            return prefix.length();
+        }
         // get the formatter's collator and use it to create two
         // collation element iterators, one over the target string
         // and another over the prefix (right now, we'll throw an
@@ -1505,9 +1509,15 @@ NFRule::findText(const UnicodeString& str,
         return str.indexOf(key, startingAt);
     }
     else {
-        // but if lenient parsing is turned ON, we've got some work
-        // ahead of us
-        return findTextLenient(str, key, startingAt, length);
+        // Check if non-lenient rule finds the text before call lenient parsing
+        *length = key.length();
+        int32_t pos = str.indexOf(key, startingAt);
+        if(pos >= 0) {
+            return pos;
+        } else {
+            // but if lenient parsing is turned ON, we've got some work ahead of us
+            return findTextLenient(str, key, startingAt, length);
+        }
     }
 }
 

diff --git a/icu4c/source/i18n/plurfmt.cpp b/icu4c/source/i18n/plurfmt.cpp
@@ -549,9 +549,15 @@ void PluralFormat::parseType(const UnicodeString& source, const NFRule *rbnfLeni
 
         UnicodeString currArg = pattern.tempSubString(partStart->getLimit(), partLimit->getIndex() - partStart->getLimit());
         if (rbnfLenientScanner != NULL) {
-            // If lenient parsing is turned ON, we've got some time consuming parsing ahead of us.
-            int32_t length = -1;
-            currMatchIndex = rbnfLenientScanner->findTextLenient(source, currArg, startingAt, &length);
+            // Check if non-lenient rule finds the text before call lenient parsing
+            int32_t tempIndex = source.indexOf(currArg, startingAt);
+            if (tempIndex >= 0) {
+                currMatchIndex = tempIndex;
+            } else {
+                // If lenient parsing is turned ON, we've got some time consuming parsing ahead of us.
+                int32_t length = -1;
+                currMatchIndex = rbnfLenientScanner->findTextLenient(source, currArg, startingAt, &length);
+            }
         }
         else {
             currMatchIndex = source.indexOf(currArg, startingAt);

diff --git a/icu4c/source/test/intltest/itrbnf.cpp b/icu4c/source/test/intltest/itrbnf.cpp
@@ -1889,16 +1889,19 @@ IntlTestRBNF::TestAllLocales()
             UErrorCode status = U_ZERO_ERROR;
             RuleBasedNumberFormat* f = new RuleBasedNumberFormat((URBNFRuleSetTag)j, *loc, status);
 
-            if (status == U_USING_DEFAULT_WARNING || status == U_USING_FALLBACK_WARNING) {
-                // Skip it.
-                delete f;
-                break;
-            }
             if (U_FAILURE(status)) {
                 errln(UnicodeString(loc->getName()) + names[j]
                     + "ERROR could not instantiate -> " + u_errorName(status));
                 continue;
             }
+
+            Locale actualLocale = f->getLocale(ULOC_ACTUAL_LOCALE, status);
+            if (actualLocale != *loc) {
+                // Skip the redundancy
+                delete f;
+                break;
+            }
+
 #if !UCONFIG_NO_COLLATION
             for (unsigned int numidx = 0; numidx < UPRV_LENGTHOF(numbers); numidx++) {
                 double n = numbers[numidx];
@@ -1936,28 +1939,26 @@ IntlTestRBNF::TestAllLocales()
                             + UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getDouble());
                     }
                 }
-                if (!quick && !logKnownIssue("9503") ) {
-                    // lenient parse
-                    status = U_ZERO_ERROR;
-                    f->setLenient(TRUE);
-                    f->parse(str, num, status);
-                    if (U_FAILURE(status)) {
+                // lenient parse
+                status = U_ZERO_ERROR;
+                f->setLenient(TRUE);
+                f->parse(str, num, status);
+                if (U_FAILURE(status)) {
+                    errln(UnicodeString(loc->getName()) + names[j]
+                        + "ERROR could not parse(lenient) '" + str + "' -> " + u_errorName(status));
+                }
+                // We only check the spellout. The behavior is undefined for numbers < 1 and fractional numbers.
+                if (j == 0) {
+                    if (num.getType() == Formattable::kLong && num.getLong() != n) {
                         errln(UnicodeString(loc->getName()) + names[j]
-                            + "ERROR could not parse(lenient) '" + str + "' -> " + u_errorName(status));
+                            + UnicodeString("ERROR could not roundtrip ") + n
+                            + UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getLong());
                     }
-                    // We only check the spellout. The behavior is undefined for numbers < 1 and fractional numbers.
-                    if (j == 0) {
-                        if (num.getType() == Formattable::kLong && num.getLong() != n) {
-                            errln(UnicodeString(loc->getName()) + names[j]
-                                + UnicodeString("ERROR could not roundtrip ") + n
-                                + UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getLong());
-                        }
-                        else if (num.getType() == Formattable::kDouble && (int64_t)(num.getDouble() * 1000) != (int64_t)(n*1000)) {
-                            // The epsilon difference is too high.
-                            errln(UnicodeString(loc->getName()) + names[j]
-                                + UnicodeString("ERROR could not roundtrip ") + n
-                                + UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getDouble());
-                        }
+                    else if (num.getType() == Formattable::kDouble && (int64_t)(num.getDouble() * 1000) != (int64_t)(n*1000)) {
+                        // The epsilon difference is too high.
+                        errln(UnicodeString(loc->getName()) + names[j]
+                            + UnicodeString("ERROR could not roundtrip ") + n
+                            + UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getDouble());
                     }
                 }
             }

diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/NFRule.java b/icu4j/main/classes/core/src/com/ibm/icu/text/NFRule.java
@@ -1241,6 +1241,10 @@ private int prefixLength(String str, String prefix) {
 
         RbnfLenientScanner scanner = formatter.getLenientScanner();
         if (scanner != null) {
+            // Check if non-lenient rule finds the text before call lenient parsing
+            if (str.startsWith(prefix)) {
+                return prefix.length();
+            }
             return scanner.prefixLength(str, prefix);
         }
 
@@ -1290,9 +1294,14 @@ private int[] findText(String str, String key, PluralFormat pluralFormatKey, int
         }
 
         if (scanner != null) {
-            // if lenient parsing is turned ON, we've got some work
-            // ahead of us
-            return scanner.findText(str, key, startingAt);
+            // Check if non-lenient rule finds the text before call lenient parsing
+            int pos[] = new int[] { str.indexOf(key, startingAt), key.length() };
+            if (pos[0] >= 0) {
+                return pos;
+            } else {
+                // if lenient parsing is turned ON, we've got some work ahead of us
+                return scanner.findText(str, key, startingAt);
+            }
         }
         // if lenient parsing is turned off, this is easy. Just call
         // String.indexOf() and we're done

diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/PluralFormat.java b/icu4j/main/classes/core/src/com/ibm/icu/text/PluralFormat.java
@@ -760,9 +760,15 @@ public Object parseObject(String source, ParsePosition pos) {
 
             String currArg = pattern.substring(partStart.getLimit(), partLimit.getIndex());
             if (scanner != null) {
-                // If lenient parsing is turned ON, we've got some time consuming parsing ahead of us.
-                int[] scannerMatchResult = scanner.findText(source, currArg, startingAt);
-                currMatchIndex = scannerMatchResult[0];
+                // Check if non-lenient rule finds the text before call lenient parsing
+                int tempPos = source.indexOf(currArg, startingAt);
+                if (tempPos >= 0) {
+                    currMatchIndex = tempPos;
+                } else {
+                    // If lenient parsing is turned ON, we've got some time consuming parsing ahead of us.
+                    int[] scannerMatchResult = scanner.findText(source, currArg, startingAt);
+                    currMatchIndex = scannerMatchResult[0];
+                }
             }
             else {
                 currMatchIndex = source.indexOf(currArg, startingAt);

diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/RbnfTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/RbnfTest.java
@@ -296,6 +296,17 @@ public void TestEnglishSpellout() {
         };
 
         doTest(formatter, testData, true);
+
+        String[][] testDataLenient = {
+                { "fifty-7", "57" },
+                { " fifty-7", "57" },
+                { "  fifty-7", "57" },
+                { "2 thousand six    HUNDRED fifty-7", "2,657" },
+                { "fifteen hundred and zero", "1,500" },
+                { "FOurhundred     thiRTY six", "436" }
+        };
+
+        doParsingTest(formatter, testDataLenient, true);
     }
 
     /**
@@ -350,6 +361,12 @@ public void TestDurations() {
         };
 
         doTest(formatter, testData, true);
+
+        String[][] testDataLenient = {
+                { "2-51-33", "10,293" },
+        };
+
+        doParsingTest(formatter, testDataLenient, true);
     }
 
     /**
@@ -425,6 +442,13 @@ public void TestFrenchSpellout() {
         };
 
         doTest(formatter, testData, true);
+
+        String[][] testDataLenient = {
+                { "trente-et-un", "31" },
+                { "un cent quatre vingt dix huit", "198" },
+        };
+
+        doParsingTest(formatter, testDataLenient, true);
     }
 
     /**
@@ -529,6 +553,12 @@ public void TestGermanSpellout() {
         };
 
         doTest(formatter, testData, true);
+
+        String[][] testDataLenient = {
+                { "ein Tausend sechs Hundert fuenfunddreissig", "1,635" },
+        };
+
+        doParsingTest(formatter, testDataLenient, true);
     }
 
     /**
@@ -1117,6 +1147,10 @@ public void TestAllLocales() {
                 " (ordinal) "
                 //" (duration) " // English only
         };
+        boolean[] lenientMode = {
+                false, // non-lenient mode
+                true // lenient mode
+        };
         double[] numbers = {45.678, 1, 2, 10, 11, 100, 110, 200, 1000, 1111, -1111};
         int count = numbers.length;
         Random r = (count <= numbers.length ? null : createRandom());
@@ -1142,25 +1176,25 @@ public void TestAllLocales() {
                         logln(loc.getName() + names[j] + "success format: " + n + " -> " + s);
                     }
 
-                    try {
-                        // RBNF parse is extremely slow when lenient option is enabled.
-                        // non-lenient parse
-                        fmt.setLenientParseMode(false);
-                        Number num = fmt.parse(s);
-                        if (isVerbose()) {
-                            logln(loc.getName() + names[j] + "success parse: " + s + " -> " + num);
-                        }
-                        if (j != 0) {
-                            // TODO: Fix the ordinal rules.
-                            continue;
-                        }
-                        if (n != num.doubleValue()) {
-                            errors.append("\n" + loc + names[j] + "got " + num + " expected " + n);
+                    for (int k = 0; k < lenientMode.length; k++) {
+                        try {
+                            fmt.setLenientParseMode(lenientMode[k]);
+                            Number num = fmt.parse(s);
+                            if (isVerbose()) {
+                                logln(loc.getName() + names[j] + "success parse: " + s + " -> " + num);
+                            }
+                            if (j != 0) {
+                                // TODO: Fix the ordinal rules.
+                                continue;
+                            }
+                            if (n != num.doubleValue()) {
+                                errors.append("\n" + loc + names[j] + "got " + num + " expected " + n);
+                            }
+                        } catch (ParseException pe) {
+                            String msg = loc.getName() + names[j] + "ERROR:" + pe.getMessage();
+                            logln(msg);
+                            errors.append("\n" + msg);
                         }
-                    } catch (ParseException pe) {
-                        String msg = loc.getName() + names[j] + "ERROR:" + pe.getMessage();
-                        logln(msg);
-                        errors.append("\n" + msg);
                     }
                 }
             }
@@ -1170,10 +1204,12 @@ public void TestAllLocales() {
         }
     }
 
-    void doTest(RuleBasedNumberFormat formatter, String[][] testData,
-                boolean testParsing) {
-        //        NumberFormat decFmt = NumberFormat.getInstance(Locale.US);
-        NumberFormat decFmt = new DecimalFormat("#,###.################");
+    NumberFormat createDecimalFormatter() {
+        return new DecimalFormat("#,###.################");
+    }
+
+    void doTest(RuleBasedNumberFormat formatter, String[][] testData, boolean testParsing) {
+        NumberFormat decFmt = createDecimalFormatter();
         try {
             for (int i = 0; i < testData.length; i++) {
                 String number = testData[i][0];
@@ -1207,6 +1243,35 @@ else if (testParsing) {
         }
     }
 
+    void doParsingTest(RuleBasedNumberFormat formatter, String[][] testData, boolean lenient) {
+        NumberFormat decFmt = createDecimalFormatter();
+
+        if (lenient) {
+            formatter.setLenientParseMode(true);
+        }
+        for (int i = 0; i < testData.length; i++) {
+            try {
+                String s = testData[i][0];
+                Number expectedNumber = decFmt.parse(testData[i][1]);
+                if (isVerbose()) {
+                    logln("test[" + i + "] spellout value: (" + s + ") target: " + expectedNumber);
+                }
+
+                Number num = formatter.parse(s);
+                if (isVerbose()) {
+                    logln("success parse: (" + s + ") -> " + num);
+                }
+
+                if (expectedNumber.doubleValue() != num.doubleValue()) {
+                    errln("\nParsing (" + s + ") failed: got " + num + " expected " + expectedNumber);
+                }
+            } catch (Throwable e) {
+                e.printStackTrace();
+                errln("Test failed with exception: " + e.toString());
+            }
+        }
+    }
+
     /* Tests the method
      *      public boolean equals(Object that)
      */