From c8336085bb7489650c449f05c13be7b6a660fd89 Mon Sep 17 00:00:00 2001 From: Frank Tang Date: Thu, 18 Jan 2024 21:36:07 +0000 Subject: [PATCH] ICU-22637 Rewrite custom timezone parser See #2792 --- icu4c/source/i18n/timezone.cpp | 189 ++++++++---------- icu4c/source/test/intltest/tztest.cpp | 70 ++++--- .../main/java/com/ibm/icu/impl/ZoneMeta.java | 116 +++++------ .../icu/dev/test/timezone/TimeZoneTest.java | 20 ++ 4 files changed, 203 insertions(+), 192 deletions(-) diff --git a/icu4c/source/i18n/timezone.cpp b/icu4c/source/i18n/timezone.cpp index 02cbc78ceffb..0dd860b4ac67 100644 --- a/icu4c/source/i18n/timezone.cpp +++ b/icu4c/source/i18n/timezone.cpp @@ -42,7 +42,9 @@ #include "unicode/utypes.h" #include "unicode/ustring.h" #include "uassert.h" +#include "uinvchar.h" #include "ustr_imp.h" +#include "util.h" #ifdef U_DEBUG_TZ # include @@ -75,7 +77,6 @@ static char gStrBuf[256]; #include "unicode/gregocal.h" #include "unicode/ures.h" #include "unicode/tzfmt.h" -#include "unicode/numfmt.h" #include "gregoimp.h" #include "uresimp.h" // struct UResourceBundle #include "olsontz.h" @@ -1369,123 +1370,99 @@ TimeZone::getCustomID(const UnicodeString& id, UnicodeString& normalized, UError UBool TimeZone::parseCustomID(const UnicodeString& id, int32_t& sign, int32_t& hour, int32_t& min, int32_t& sec) { - static const int32_t kParseFailed = -99999; - - NumberFormat* numberFormat = 0; - UnicodeString idUppercase = id; - idUppercase.toUpper(""); - - if (id.length() > GMT_ID_LENGTH && - idUppercase.startsWith(GMT_ID, GMT_ID_LENGTH)) - { - ParsePosition pos(GMT_ID_LENGTH); - sign = 1; - hour = 0; - min = 0; - sec = 0; - - if (id[pos.getIndex()] == MINUS /*'-'*/) { - sign = -1; - } else if (id[pos.getIndex()] != PLUS /*'+'*/) { + if (id.length() < GMT_ID_LENGTH) { + return false; + } + if (0 != u_strncasecmp(id.getBuffer(), GMT_ID, GMT_ID_LENGTH, 0)) { + return false; + } + // ICU_Utility::parseNumber also accept non ASCII digits so we need to first + // check we only have ASCII chars. + if (!uprv_isInvariantUString(id.getBuffer(), id.length())) { + return false; + } + sign = 1; + hour = 0; + min = 0; + sec = 0; + + if (id[GMT_ID_LENGTH] == MINUS /*'-'*/) { + sign = -1; + } else if (id[GMT_ID_LENGTH] != PLUS /*'+'*/) { + return false; + } + + int32_t start = GMT_ID_LENGTH + 1; + int32_t pos = start; + hour = ICU_Utility::parseNumber(id, pos, 10); + if (pos == id.length()) { + // Handle the following cases + // HHmmss + // Hmmss + // HHmm + // Hmm + // HH + // H + + // Get all digits + // Should be 1 to 6 digits. + int32_t length = pos - start; + switch (length) { + case 1: // H + case 2: // HH + // already set to hour + break; + case 3: // Hmm + case 4: // HHmm + min = hour % 100; + hour /= 100; + break; + case 5: // Hmmss + case 6: // HHmmss + sec = hour % 100; + min = (hour/100) % 100; + hour /= 10000; + break; + default: + // invalid range + return false; + } + } else { + // Handle the following cases + // HH:mm:ss + // H:mm:ss + // HH:mm + // H:mm + if (pos - start < 1 || pos - start > 2 || id[pos] != COLON) { return false; } - pos.setIndex(pos.getIndex() + 1); - - UErrorCode success = U_ZERO_ERROR; - numberFormat = NumberFormat::createInstance(success); - if(U_FAILURE(success)){ + pos++; // skip : after H or HH + if (id.length() == pos) { return false; } - numberFormat->setParseIntegerOnly(true); - //numberFormat->setLenient(true); // TODO: May need to set this, depends on latest timezone parsing - - // Look for either hh:mm, hhmm, or hh - int32_t start = pos.getIndex(); - Formattable n(kParseFailed); - numberFormat->parse(id, n, pos); - if (pos.getIndex() == start) { - delete numberFormat; + start = pos; + min = ICU_Utility::parseNumber(id, pos, 10); + if (pos - start != 2) { return false; } - hour = n.getLong(); - - if (pos.getIndex() < id.length()) { - if (pos.getIndex() - start > 2 - || id[pos.getIndex()] != COLON) { - delete numberFormat; + if (id.length() > pos) { + if (id[pos] != COLON) { return false; } - // hh:mm - pos.setIndex(pos.getIndex() + 1); - int32_t oldPos = pos.getIndex(); - n.setLong(kParseFailed); - numberFormat->parse(id, n, pos); - if ((pos.getIndex() - oldPos) != 2) { - // must be 2 digits - delete numberFormat; + pos++; // skip : after mm + start = pos; + sec = ICU_Utility::parseNumber(id, pos, 10); + if (pos - start != 2 || id.length() > pos) { return false; } - min = n.getLong(); - if (pos.getIndex() < id.length()) { - if (id[pos.getIndex()] != COLON) { - delete numberFormat; - return false; - } - // [:ss] - pos.setIndex(pos.getIndex() + 1); - oldPos = pos.getIndex(); - n.setLong(kParseFailed); - numberFormat->parse(id, n, pos); - if (pos.getIndex() != id.length() - || (pos.getIndex() - oldPos) != 2) { - delete numberFormat; - return false; - } - sec = n.getLong(); - } - } else { - // Supported formats are below - - // - // HHmmss - // Hmmss - // HHmm - // Hmm - // HH - // H - - int32_t length = pos.getIndex() - start; - if (length <= 0 || 6 < length) { - // invalid length - delete numberFormat; - return false; - } - switch (length) { - case 1: - case 2: - // already set to hour - break; - case 3: - case 4: - min = hour % 100; - hour /= 100; - break; - case 5: - case 6: - sec = hour % 100; - min = (hour/100) % 100; - hour /= 10000; - break; - } } - - delete numberFormat; - - if (hour > kMAX_CUSTOM_HOUR || min > kMAX_CUSTOM_MIN || sec > kMAX_CUSTOM_SEC) { - return false; - } - return true; } - return false; + if (hour > kMAX_CUSTOM_HOUR || + min > kMAX_CUSTOM_MIN || + sec > kMAX_CUSTOM_SEC) { + return false; + } + return true; } UnicodeString& diff --git a/icu4c/source/test/intltest/tztest.cpp b/icu4c/source/test/intltest/tztest.cpp index 53e02a9e2288..472326ff3b85 100644 --- a/icu4c/source/test/intltest/tztest.cpp +++ b/icu4c/source/test/intltest/tztest.cpp @@ -1171,36 +1171,56 @@ void TimeZoneTest::TestCustomParse() struct { - const char *customId; + const char16_t *customId; int32_t expectedOffset; } kData[] = { // ID Expected offset in seconds - {"GMT", kUnparseable}, //Isn't custom. [returns normal GMT] - {"GMT-YOUR.AD.HERE", kUnparseable}, - {"GMT0", kUnparseable}, - {"GMT+0", (0)}, - {"GMT+1", (1*60*60)}, - {"GMT-0030", (-30*60)}, - {"GMT+15:99", kUnparseable}, - {"GMT+", kUnparseable}, - {"GMT-", kUnparseable}, - {"GMT+0:", kUnparseable}, - {"GMT-:", kUnparseable}, - {"GMT-YOUR.AD.HERE", kUnparseable}, - {"GMT+0010", (10*60)}, // Interpret this as 00:10 - {"GMT-10", (-10*60*60)}, - {"GMT+30", kUnparseable}, - {"GMT-3:30", (-(3*60+30)*60)}, - {"GMT-230", (-(2*60+30)*60)}, - {"GMT+05:13:05", ((5*60+13)*60+5)}, - {"GMT-71023", (-((7*60+10)*60+23))}, - {"GMT+01:23:45:67", kUnparseable}, - {"GMT+01:234", kUnparseable}, - {"GMT-2:31:123", kUnparseable}, - {"GMT+3:75", kUnparseable}, - {"GMT-01010101", kUnparseable}, + {u"GMT", kUnparseable}, //Isn't custom. [returns normal GMT] + {u"GMT-YOUR.AD.HERE", kUnparseable}, + {u"GMT0", kUnparseable}, + {u"GMT+0", (0)}, + {u"GMT+1", (1*60*60)}, + {u"GMT-0030", (-30*60)}, + {u"GMT+15:99", kUnparseable}, + {u"GMT+", kUnparseable}, + {u"GMT-", kUnparseable}, + {u"GMT+0:", kUnparseable}, + {u"GMT-:", kUnparseable}, + {u"GMT-YOUR.AD.HERE", kUnparseable}, + {u"GMT+0010", (10*60)}, // Interpret this as 00:10 + {u"GMT-10", (-10*60*60)}, + {u"GMT+30", kUnparseable}, + {u"GMT-3:30", (-(3*60+30)*60)}, + {u"GMT-230", (-(2*60+30)*60)}, + {u"GMT+05:13:05", ((5*60+13)*60+5)}, + {u"GMT-71023", (-((7*60+10)*60+23))}, + {u"GMT+01:23:45:67", kUnparseable}, + {u"GMT+01:234", kUnparseable}, + {u"GMT-2:31:123", kUnparseable}, + {u"GMT+3:75", kUnparseable}, + {u"GMT-01010101", kUnparseable}, + {u"GMT-4E58", kUnparseable}, // ICU-22637 + {u"GMT-4e58", kUnparseable}, // ICU-22637 + {u"GMT-1E01", kUnparseable}, // ICU-22637 + {u"GMT-2E01", kUnparseable}, // ICU-22637 + {u"GMT-2e01", kUnparseable}, // ICU-22637 + {u"GMT-9e02", kUnparseable}, // ICU-22637 + {u"GMT-1e03", kUnparseable}, // ICU-22637 + {u"GMT-2e03", kUnparseable}, // ICU-22637 + {u"GMT-500M", kUnparseable}, // ICU-22637 + {u"GMT-500T", kUnparseable}, // ICU-22637 + {u"GMT-9E00", kUnparseable}, // ICU-22637 + {u"GMT-0X0F", kUnparseable}, // ICU-22637 + {u"GMT-0x0F", kUnparseable}, // ICU-22637 + {u"GMT-0x12", kUnparseable}, // ICU-22637 + {u"GMT-B111", kUnparseable}, // ICU-22637 + {u"GMT-b111", kUnparseable}, // ICU-22637 + {u"GMT-0b11", kUnparseable}, // ICU-22637 + {u"GMT-๑๒", kUnparseable}, // ICU-22637 + {u"GMT-๑๒:๓๔", kUnparseable}, // ICU-22637 + {u"GMT+๑๒:๓๔:๕๖", kUnparseable}, // ICU-22637 {0, 0} }; diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/impl/ZoneMeta.java b/icu4j/main/core/src/main/java/com/ibm/icu/impl/ZoneMeta.java index 3699cbbca9fa..06df445c61c9 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/impl/ZoneMeta.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/impl/ZoneMeta.java @@ -20,7 +20,7 @@ import java.util.Set; import java.util.TreeSet; -import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.impl.Utility; import com.ibm.icu.util.Output; import com.ibm.icu.util.SimpleTimeZone; import com.ibm.icu.util.TimeZone; @@ -681,66 +681,30 @@ public static String getCustomID(String id) { * @return Returns true when the given custom id is valid. */ static boolean parseCustomID(String id, int[] fields) { - NumberFormat numberFormat = null; - if (id != null && id.length() > kGMT_ID.length() && - id.toUpperCase(Locale.ENGLISH).startsWith(kGMT_ID)) { - ParsePosition pos = new ParsePosition(kGMT_ID.length()); + id.substring(0, 3).equalsIgnoreCase(kGMT_ID)) { int sign = 1; int hour = 0; int min = 0; int sec = 0; - if (id.charAt(pos.getIndex()) == 0x002D /*'-'*/) { + int[] pos = new int[1]; + pos[0] = kGMT_ID.length(); + if (id.charAt(pos[0]) == 0x002D /*'-'*/) { sign = -1; - } else if (id.charAt(pos.getIndex()) != 0x002B /*'+'*/) { + } else if (id.charAt(pos[0]) != 0x002B /*'+'*/) { return false; } - pos.setIndex(pos.getIndex() + 1); - - numberFormat = NumberFormat.getInstance(); - numberFormat.setParseIntegerOnly(true); - - // Look for either hh:mm, hhmm, or hh - int start = pos.getIndex(); - - Number n = numberFormat.parse(id, pos); - if (pos.getIndex() == start) { + pos[0]++; + int start = pos[0]; + // Utility.parseNumber also accept non ASCII digits so we need to first + // check we only have ASCII chars. + if (!id.chars().allMatch(c -> c < 128)) { return false; } - hour = n.intValue(); - - if (pos.getIndex() < id.length()){ - if (pos.getIndex() - start > 2 - || id.charAt(pos.getIndex()) != 0x003A /*':'*/) { - return false; - } - // hh:mm - pos.setIndex(pos.getIndex() + 1); - int oldPos = pos.getIndex(); - n = numberFormat.parse(id, pos); - if ((pos.getIndex() - oldPos) != 2) { - // must be 2 digits - return false; - } - min = n.intValue(); - if (pos.getIndex() < id.length()) { - if (id.charAt(pos.getIndex()) != 0x003A /*':'*/) { - return false; - } - // [:ss] - pos.setIndex(pos.getIndex() + 1); - oldPos = pos.getIndex(); - n = numberFormat.parse(id, pos); - if (pos.getIndex() != id.length() - || (pos.getIndex() - oldPos) != 2) { - return false; - } - sec = n.intValue(); - } - } else { - // Supported formats are below - - // + hour = Utility.parseNumber(id, pos, 10); + if (pos[0] == id.length()) { + // Handle the following cases // HHmmss // Hmmss // HHmm @@ -748,27 +712,57 @@ static boolean parseCustomID(String id, int[] fields) { // HH // H - int length = pos.getIndex() - start; - if (length <= 0 || 6 < length) { - // invalid length - return false; - } + // Get all digits + // Should be 1 to 6 digits. + int length = pos[0] - start; switch (length) { - case 1: - case 2: + case 1: // H + case 2: // HH // already set to hour break; - case 3: - case 4: + case 3: // Hmm + case 4: // HHmm min = hour % 100; hour /= 100; break; - case 5: - case 6: + case 5: // Hmmss + case 6: // HHmmss sec = hour % 100; min = (hour/100) % 100; hour /= 10000; break; + default: + // invalid range + return false; + } + } else { + // Handle the following cases + // HH:mm:ss + // H:mm:ss + // HH:mm + // H:mm + if (pos[0] - start < 1 || pos[0] - start > 2 || id.charAt(pos[0]) != 0x003A /*':'*/) { + return false; + } + pos[0]++; // skip : after H + if (id.length() == pos[0]) { + return false; + } + start = pos[0]; + min = Utility.parseNumber(id, pos, 10); + if (pos[0] - start != 2) { + return false; + } + if (id.length() > pos[0]) { + if (id.charAt(pos[0]) != 0x003A /*':'*/) { + return false; + } + pos[0]++; // skip : after mm + start = pos[0]; + sec = Utility.parseNumber(id, pos, 10); + if (pos[0] - start != 2 || id.length() > pos[0]) { + return false; + } } } diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/timezone/TimeZoneTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/timezone/TimeZoneTest.java index 2c6a164d2b31..d3db92f67254 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/timezone/TimeZoneTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/timezone/TimeZoneTest.java @@ -298,6 +298,26 @@ public void TestCustomParse() { "GMT-2:31:123", "0", TimeZone.UNKNOWN_ZONE_ID, "GMT+3:75", "0", TimeZone.UNKNOWN_ZONE_ID, "GMT-01010101", "0", TimeZone.UNKNOWN_ZONE_ID, + "GMT-4E58", "0", TimeZone.UNKNOWN_ZONE_ID, // ICU-22637 + "GMT-4e58", "0", TimeZone.UNKNOWN_ZONE_ID, // ICU-22637 + "GMT-1E01", "0", TimeZone.UNKNOWN_ZONE_ID, // ICU-22637 + "GMT-2E01", "0", TimeZone.UNKNOWN_ZONE_ID, // ICU-22637 + "GMT-2e01", "0", TimeZone.UNKNOWN_ZONE_ID, // ICU-22637 + "GMT-9e02", "0", TimeZone.UNKNOWN_ZONE_ID, // ICU-22637 + "GMT-1e03", "0", TimeZone.UNKNOWN_ZONE_ID, // ICU-22637 + "GMT-2e03", "0", TimeZone.UNKNOWN_ZONE_ID, // ICU-22637 + "GMT-500M", "0", TimeZone.UNKNOWN_ZONE_ID, // ICU-22637 + "GMT-500T", "0", TimeZone.UNKNOWN_ZONE_ID, // ICU-22637 + "GMT-9E00", "0", TimeZone.UNKNOWN_ZONE_ID, // ICU-22637 + "GMT-0X0F", "0", TimeZone.UNKNOWN_ZONE_ID, // ICU-22637 + "GMT-0x0F", "0", TimeZone.UNKNOWN_ZONE_ID, // ICU-22637 + "GMT-0x12", "0", TimeZone.UNKNOWN_ZONE_ID, // ICU-22637 + "GMT-B111", "0", TimeZone.UNKNOWN_ZONE_ID, // ICU-22637 + "GMT-b111", "0", TimeZone.UNKNOWN_ZONE_ID, // ICU-22637 + "GMT-0b11", "0", TimeZone.UNKNOWN_ZONE_ID, // ICU-22637 + "GMT-๑๒", "0", TimeZone.UNKNOWN_ZONE_ID, // ICU-22637 + "GMT-๑๒:๓๔", "0", TimeZone.UNKNOWN_ZONE_ID, // ICU-22637 + "GMT+๑๒:๓๔:๕๖", "0", TimeZone.UNKNOWN_ZONE_ID, // ICU-22637 }; for (int i = 0; i < DATA.length; i += 3) { String id = DATA[i];