From b032026b13dd3648d505ef6c6e821e5e0d60f908 Mon Sep 17 00:00:00 2001 From: Frank Tang Date: Tue, 14 Nov 2023 17:36:27 -0800 Subject: [PATCH] ICU-13219 add -u-dx- support to BreakIterator ICU-13219 merge ICU-13219 ICU-13219 optimize --- icu4c/source/common/brkiter.cpp | 31 ++++++-- icu4c/source/common/rbbi.cpp | 46 ++++++++++- icu4c/source/common/rbbi_cache.cpp | 33 +++++--- icu4c/source/common/unicode/brkiter.h | 2 +- icu4c/source/common/unicode/rbbi.h | 17 +++- icu4c/source/test/intltest/rbbitst.cpp | 62 +++++++++++++++ icu4c/source/test/intltest/rbbitst.h | 2 + .../java/com/ibm/icu/text/BreakIterator.java | 1 + .../ibm/icu/text/BreakIteratorFactory.java | 7 +- .../ibm/icu/text/RuleBasedBreakIterator.java | 79 ++++++++++++++++--- .../com/ibm/icu/dev/test/rbbi/RBBITest.java | 29 +++++++ 11 files changed, 271 insertions(+), 38 deletions(-) diff --git a/icu4c/source/common/brkiter.cpp b/icu4c/source/common/brkiter.cpp index b452cf2c0500..b34a4edecc92 100644 --- a/icu4c/source/common/brkiter.cpp +++ b/icu4c/source/common/brkiter.cpp @@ -1,4 +1,5 @@ // © 2016 and later: Unicode, Inc. and others. +// // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* @@ -55,7 +56,7 @@ U_NAMESPACE_BEGIN // ------------------------------------- BreakIterator* -BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &status) +BreakIterator::buildInstance(const Locale& loc, const char *type, bool checkDX, UErrorCode &status) { char fnbuff[256]; char ext[4]={'\0'}; @@ -116,8 +117,22 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &st return nullptr; } - // Create a RuleBasedBreakIterator - result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != nullptr, status); + { + const char* dxs = nullptr; + CharString dxsValue; // keep on the stack till we no longer need dxs. 
+ // If it is word or line instance, try to get the value for dx + if (checkDX) { + UErrorCode dxsStatus = U_ZERO_ERROR; + CharStringByteSink dxsSink(&dxsValue); + loc.getKeywordValue("dx", dxsSink, dxsStatus); + if (U_SUCCESS(dxsStatus) && dxsValue.length() > 0) { + dxs = dxsValue.data(); + } + } + + // Create a RuleBasedBreakIterator + result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != nullptr, dxs, status); + } // If there is a result, set the valid locale and actual locale, and the kind if (U_SUCCESS(status) && result != nullptr) { @@ -421,14 +436,14 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) case UBRK_CHARACTER: { UTRACE_ENTRY(UTRACE_UBRK_CREATE_CHARACTER); - result = BreakIterator::buildInstance(loc, "grapheme", status); + result = BreakIterator::buildInstance(loc, "grapheme", false, status); UTRACE_EXIT_STATUS(status); } break; case UBRK_WORD: { UTRACE_ENTRY(UTRACE_UBRK_CREATE_WORD); - result = BreakIterator::buildInstance(loc, "word", status); + result = BreakIterator::buildInstance(loc, "word", true, status); UTRACE_EXIT_STATUS(status); } break; @@ -454,7 +469,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) uprv_strcat(lb_lw, value.data()); } } - result = BreakIterator::buildInstance(loc, lb_lw, status); + result = BreakIterator::buildInstance(loc, lb_lw, true, status); UTRACE_DATA1(UTRACE_INFO, "lb_lw=%s", lb_lw); UTRACE_EXIT_STATUS(status); @@ -463,7 +478,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) case UBRK_SENTENCE: { UTRACE_ENTRY(UTRACE_UBRK_CREATE_SENTENCE); - result = BreakIterator::buildInstance(loc, "sentence", status); + result = BreakIterator::buildInstance(loc, "sentence", false, status); #if !UCONFIG_NO_FILTERED_BREAK_ITERATION char ssKeyValue[kKeyValueLenMax] = {0}; UErrorCode kvStatus = U_ZERO_ERROR; @@ -482,7 +497,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) 
case UBRK_TITLE: { UTRACE_ENTRY(UTRACE_UBRK_CREATE_TITLE); - result = BreakIterator::buildInstance(loc, "title", status); + result = BreakIterator::buildInstance(loc, "title", false, status); UTRACE_EXIT_STATUS(status); } break; diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp index 599279fb72bb..4cd7a20122fa 100644 --- a/icu4c/source/common/rbbi.cpp +++ b/icu4c/source/common/rbbi.cpp @@ -25,6 +25,7 @@ #include "unicode/uchriter.h" #include "unicode/uclean.h" #include "unicode/udata.h" +#include "unicode/uniset.h" #include "brkeng.h" #include "ucln_cmn.h" @@ -89,9 +90,37 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode // //------------------------------------------------------------------------------- RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseBreaking, + const char* dxs, UErrorCode &status) : RuleBasedBreakIterator(udm, status) { fIsPhraseBreaking = isPhraseBreaking; + if (U_FAILURE(status)) { + return; + } + if (dxs != nullptr) { + size_t length = uprv_strlen(dxs); + // The value should be a list of 4 letter script codes joined by '-'. + if (length % 5 != 4) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + size_t items = 1 + length / 5; + // Change from "thai" to "[:thai:]" or + // "thai-arab" to "[[:thai:][:arab:]]" + UnicodeString udxs; + if (items > 1) { + udxs.append(u'['); + } + for (size_t i = 0; i < items; i++) { + udxs.append(u"[:", -1); + udxs.append(UnicodeString(dxs + i * 5, 4, US_INV)); + udxs.append(u":]", -1); + } + if (items > 1) { + udxs.append(u']'); + } + fDX = new UnicodeSet(udxs, status); + } } // @@ -198,7 +227,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator() * Simple Constructor with an error code. * Handles common initialization for all other constructors. 
*/ -RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) { +RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) : fDX(nullptr) { UErrorCode ec = U_ZERO_ERROR; if (status == nullptr) { status = &ec; @@ -212,6 +241,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) { } fDictionaryCache = lpDictionaryCache.orphan(); fBreakCache = lpBreakCache.orphan(); + fDX = nullptr; #ifdef RBBI_DEBUG static UBool debugInitDone = false; @@ -261,6 +291,9 @@ RuleBasedBreakIterator::~RuleBasedBreakIterator() { delete fDictionaryCache; fDictionaryCache = nullptr; + delete fDX; + fDX = nullptr; + delete fLanguageBreakEngines; fLanguageBreakEngines = nullptr; @@ -333,6 +366,7 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { // the assumption that the current position is on a rule boundary. fBreakCache->reset(fPosition, fRuleStatusIndex); fDictionaryCache->reset(); + fDX = (that.fDX == nullptr) ? nullptr : that.fDX->cloneAsThawed(); return *this; } @@ -381,11 +415,15 @@ RuleBasedBreakIterator::operator==(const BreakIterator& that) const { return false; } + // If only one has fDX or they are not equal + if (!((that2.fDX == nullptr && fDX == nullptr) || *that2.fDX == *fDX)) { + return false; + } if (that2.fData == fData || (fData != nullptr && that2.fData != nullptr && *that2.fData == *fData)) { // The two break iterators are using the same rules. 
return true; - } + } return false; } @@ -1298,6 +1336,10 @@ RuleBasedBreakIterator::getRules() const { } } +bool RuleBasedBreakIterator::excludedFromDictionaryBreak(int32_t c) { + return fDX != nullptr && fDX->contains(c); +} + U_NAMESPACE_END #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ diff --git a/icu4c/source/common/rbbi_cache.cpp b/icu4c/source/common/rbbi_cache.cpp index f7a283f69e45..c08c27854284 100644 --- a/icu4c/source/common/rbbi_cache.cpp +++ b/icu4c/source/common/rbbi_cache.cpp @@ -156,20 +156,27 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo break; } - // We now have a dictionary character. Get the appropriate language object - // to deal with it. - const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine( - c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status)); - - // Ask the language object if there are any breaks. It will add them to the cache and - // leave the text pointer on the other side of its range, ready to search for the next one. - if (lbe != nullptr) { - foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status); + // We now have a dictionary character. + // Handle dx (Dictionary break script exclusions) first if needed + if (fBI->excludedFromDictionaryBreak(c)) { + utext_next32(text); + c = utext_current32(text); + // If we exclude the character, we treat it as AL + category = ucptrie_get(fBI->fData->fTrie, 'A'); + } else { + // Get the appropriate language object to deal with it. + const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine( + c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status)); + + // Ask the language object if there are any breaks. It will add them to the cache and + // leave the text pointer on the other side of its range, ready to search for the next one. 
+ if (lbe != nullptr) { + foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status); + } + // Reload the loop variables for the next go-round + c = utext_current32(text); + category = ucptrie_get(fBI->fData->fTrie, c); } - - // Reload the loop variables for the next go-round - c = utext_current32(text); - category = ucptrie_get(fBI->fData->fTrie, c); } // If we found breaks, ensure that the first and last entries are diff --git a/icu4c/source/common/unicode/brkiter.h b/icu4c/source/common/unicode/brkiter.h index 1b10e6ef1165..4af640bd0da0 100644 --- a/icu4c/source/common/unicode/brkiter.h +++ b/icu4c/source/common/unicode/brkiter.h @@ -623,7 +623,7 @@ class U_COMMON_API BreakIterator : public UObject { virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) = 0; private: - static BreakIterator* buildInstance(const Locale& loc, const char *type, UErrorCode& status); + static BreakIterator* buildInstance(const Locale& loc, const char *type, bool checkDX, UErrorCode& status); static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status); static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status); diff --git a/icu4c/source/common/unicode/rbbi.h b/icu4c/source/common/unicode/rbbi.h index 045238ac5d79..dfe155d44d83 100644 --- a/icu4c/source/common/unicode/rbbi.h +++ b/icu4c/source/common/unicode/rbbi.h @@ -17,6 +17,7 @@ #define RBBI_H #include "unicode/utypes.h" +#include "unicode/uniset.h" #if U_SHOW_CPLUSPLUS_API @@ -42,6 +43,7 @@ struct RBBIDataHeader; class RBBIDataWrapper; class UnhandledEngine; class UStack; +class UnicodeSet; #ifndef U_HIDE_DRAFT_API @@ -221,6 +223,12 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator { */ UBool fIsPhraseBreaking = false; + /** + * A UnicodeSet for Dictionary Break Exclusion. 
+ */ + UnicodeSet* fDX = nullptr; +private: + //======================================================================= // constructors //======================================================================= @@ -246,11 +254,12 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator { * which will be responsible for closing it when it is no longer needed. * @param status Information on any errors encountered. * @param isPhraseBreaking true if phrase based breaking is required, otherwise false. + * @param dxs nullptr or a string to denote "Dictionary break script exclusions". * @see udata_open * @see #getBinaryRules * @internal (private) */ - RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, UErrorCode &status); + RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, const char* dxs, UErrorCode &status); /** @internal */ friend class RBBIRuleBuilder; @@ -766,6 +775,12 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator { * signature) */ + /* + * Check should the character be excluded from dictionary-based text break. 
+ * @internal (private) + */ + bool excludedFromDictionaryBreak(int32_t c); + typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32); template diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 2afe4b3912df..15a7570c2869 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -106,6 +106,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha TESTCASE_AUTO(TestLineBreaks); TESTCASE_AUTO(TestSentBreaks); TESTCASE_AUTO(TestExtended); + TESTCASE_AUTO(TestDXLineBreaks); + TESTCASE_AUTO(TestDXWordBreaks); #endif #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO TESTCASE_AUTO(TestMonkey); @@ -3900,6 +3902,66 @@ void RBBITest::TestLineBreaks() #endif } +void RBBITest::TestDXLineBreaks() +{ +#if !UCONFIG_NO_REGULAR_EXPRESSIONS + UnicodeString text(u"abcde 一二三四五六七八九十อิสราเอลโชว์คลิป"); + std::vector expected{ 0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32 }; + Locale locale("ja-u-dx-hani-thai"); + UErrorCode status = U_ZERO_ERROR; + std::unique_ptr bi(BreakIterator::createLineInstance(locale, status)); + TEST_ASSERT_SUCCESS(status); + if (U_FAILURE(status)) { + return; + } + bi->setText(text); + int32_t c = bi->first(); + std::vector actuals; + do { + actuals.push_back(c); + } while ((c = bi->next()) != BreakIterator::DONE ); + + assertEquals(WHERE, + static_cast(expected.size()), + static_cast(actuals.size())); + if (expected.size() == actuals.size()) { + for (size_t i = 0; i < expected.size(); i++) { + assertEquals(WHERE, expected[i], actuals[i]); + } + } +#endif +} + +void RBBITest::TestDXWordBreaks() +{ +#if !UCONFIG_NO_REGULAR_EXPRESSIONS + UnicodeString text(u"abcde 一二三四五六七八九十อิสราเอลโชว์คลิป"); + Locale locale("ja-u-dx-hani-thai"); + std::vector expected{ 0, 5, 6, 16, 32 }; + UErrorCode status = U_ZERO_ERROR; + std::unique_ptr bi(BreakIterator::createWordInstance(locale, status)); + TEST_ASSERT_SUCCESS(status); + if (U_FAILURE(status)) { 
+ return; + } + bi->setText(text); + int32_t c = bi->first(); + std::vector actuals; + do { + actuals.push_back(c); + } while ((c = bi->next()) != BreakIterator::DONE ); + + assertEquals(WHERE, + static_cast(expected.size()), + static_cast(actuals.size())); + if (expected.size() == actuals.size()) { + for (size_t i = 0; i < expected.size(); i++) { + assertEquals(WHERE, expected[i], actuals[i]); + } + } +#endif +} + void RBBITest::TestSentBreaks() { #if !UCONFIG_NO_REGULAR_EXPRESSIONS diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h index 537a537863ad..fdc228823695 100644 --- a/icu4c/source/test/intltest/rbbitst.h +++ b/icu4c/source/test/intltest/rbbitst.h @@ -99,6 +99,8 @@ class RBBITest: public IntlTest { void TestExternalBreakEngineWithFakeTaiLe(); void TestExternalBreakEngineWithFakeYue(); + void TestDXLineBreaks(); + void TestDXWordBreaks(); #if U_ENABLE_TRACING void TestTraceCreateCharacter(); void TestTraceCreateWord(); diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIterator.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIterator.java index 7223d22e4c34..fcafb597fbc1 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIterator.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIterator.java @@ -17,6 +17,7 @@ import com.ibm.icu.impl.CSCharacterIterator; import com.ibm.icu.impl.CacheValue; import com.ibm.icu.impl.ICUDebug; +import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.ICUCloneNotSupportedException; import com.ibm.icu.util.ULocale; diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIteratorFactory.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIteratorFactory.java index c78f36ed638b..0b8c02fcc55b 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIteratorFactory.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIteratorFactory.java @@ -156,12 +156,17 @@ private static BreakIterator 
createBreakInstance(ULocale locale, int kind) { throw new MissingResourceException(e.toString(),"",""); } + // Dictionary Break Exclusion + String dxValue = null; + if (kind == BreakIterator.KIND_LINE || kind == BreakIterator.KIND_WORD) { + dxValue = locale.getUnicodeLocaleType("dx"); + } // // Create a normal RuleBasedBreakIterator. // try { boolean isPhraseBreaking = (brkfname != null) && brkfname.contains("phrase"); - iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(bytes, isPhraseBreaking); + iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(bytes, isPhraseBreaking, dxValue); } catch (IOException e) { // Shouldn't be possible to get here. diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedBreakIterator.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedBreakIterator.java index d0cf532ec761..5f6782a31808 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedBreakIterator.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedBreakIterator.java @@ -40,6 +40,7 @@ import com.ibm.icu.lang.UProperty; import com.ibm.icu.lang.UScript; import com.ibm.icu.util.CodePointTrie; +import com.ibm.icu.text.UnicodeSet; /** * Rule Based Break Iterator @@ -99,17 +100,57 @@ public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is * * @param bytes a buffer supplying the compiled binary rules. * @param phraseBreaking a flag indicating if phrase breaking is required. + * @param dxValues Dictionary break script exclusions. * @throws IOException if there is an error while reading the rules from the buffer. + * IllegalArgumentException if the dxValues is not null nor a String in the supported + * format. 
* @see #compileRules(String, OutputStream) * @internal */ /* package-potected */ static RuleBasedBreakIterator getInstanceFromCompiledRules( - ByteBuffer bytes, boolean phraseBreaking) throws IOException { + ByteBuffer bytes, boolean phraseBreaking, String dxValues) throws IOException, IllegalArgumentException { RuleBasedBreakIterator instance = getInstanceFromCompiledRules(bytes); instance.fPhraseBreaking = phraseBreaking; + instance.fDX = makeExcludedDictionaryBreakUnicodeSet(dxValues); return instance; } + /** + * Create a UnicodeSet for the Dictionary Break Script Exclusions. + * @param dxs Dictionary break script exclusions, a string of 4-letter script codes joined by "-". + * @throws IllegalArgumentException if the length of dxs does not match that format. + * @internal + */ + private static UnicodeSet makeExcludedDictionaryBreakUnicodeSet( + String dxs) throws IllegalArgumentException { + if (dxs == null) { + return null; + } + if (dxs.length() % 5 != 4) { + throw new IllegalArgumentException("Incorrect value for dx key: " + dxs); + } + // Change from "thai" to "[:thai:]" or "thai-arab" to "[[:thai:][:arab:]]" + StringBuilder builder = new StringBuilder(); + int items = 1 + (dxs.length() / 5); + if (items > 1) { + builder.append("["); + } + for (int i = 0; i < items; i++) { + builder.append("[:").append(dxs.substring(i*5, i*5+4)).append(":]"); + } + if (items > 1) { + builder.append("]"); + } + return new UnicodeSet(builder.toString()); + } + + /** + * Check whether the character should be excluded from dictionary-based text breaking. + */ + private boolean excludedFromDictionaryBreak(int c) { + return fDX != null && fDX.contains(c); + } + /** * Create a break iterator from a precompiled set of break rules. 
* @@ -180,6 +221,7 @@ public Object clone() { result.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize]; result.fBreakCache = result.new BreakCache(fBreakCache); result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache); + result.fDX = fDX; // fDX could be shared w/ other instance return result; } @@ -206,6 +248,9 @@ public boolean equals(Object that) { (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) { return false; } + if (!((fDX == null && other.fDX == null) || fDX.equals(other.fDX))) { + return false; + } if (fText == null && other.fText == null) { return true; } @@ -305,6 +350,10 @@ public int hashCode() */ private boolean fPhraseBreaking = false; + /** + * UnicodeSet for Dictionary break script exclusions. + */ + protected UnicodeSet fDX = null; /** * Counter for the number of characters encountered with the "dictionary" @@ -1229,19 +1278,25 @@ void populateDictionary(int startPos, int endPos, break; } - // We now have a dictionary character. Get the appropriate language object - // to deal with it. - LanguageBreakEngine lbe = getLanguageBreakEngine(c); + if (excludedFromDictionaryBreak(c)) { + c = CharacterIteration.next32(fText); + // treat character in dx as AL + category = (short)fRData.fTrie.get('A'); + } else { + // We now have a dictionary character. Get the appropriate language object + // to deal with it. + LanguageBreakEngine lbe = getLanguageBreakEngine(c); + + // Ask the language object if there are any breaks. It will add them to the cache and + // leave the text pointer on the other side of its range, ready to search for the next one. + if (lbe != null) { + foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks, fPhraseBreaking); + } - // Ask the language object if there are any breaks. It will add them to the cache and - // leave the text pointer on the other side of its range, ready to search for the next one. 
- if (lbe != null) { - foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks, fPhraseBreaking); + // Reload the loop variables for the next go-round + c = CharacterIteration.current32(fText); + category = (short)fRData.fTrie.get(c); } - - // Reload the loop variables for the next go-round - c = CharacterIteration.current32(fText); - category = (short)fRData.fTrie.get(c); } // If we found breaks, ensure that the first and last entries are diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITest.java index 15f7265d0636..21e129ea7aec 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITest.java @@ -10,6 +10,7 @@ import java.text.CharacterIterator; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Locale; @@ -1003,4 +1004,32 @@ public int randomStringIndex() { assertEquals("preceding" + idx, fns.expectedPreceding(idx), bi.preceding(idx)); } } + @Test + public void TestDXLineBreaks() { + String text = "abcde 一二三四五六七八九十อิสราเอลโชว์คลิป"; + BreakIterator brk = BreakIterator.getLineInstance(ULocale.forLanguageTag("ja-u-dx-hani-thai")); + brk.setText(text); + List expected = new ArrayList(Arrays.asList( + 0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32)); + List actuals = new ArrayList(); + int c = brk.first(); + do { + actuals.add(c); + } while ((c = brk.next()) != BreakIterator.DONE); + assertEquals("-u-dx- is not working", expected, actuals); + } + @Test + public void TestDXWordBreaks() { + String text = "abcde 一二三四五六七八九十อิสราเอลโชว์คลิป"; + BreakIterator brk = BreakIterator.getWordInstance(ULocale.forLanguageTag("ja-u-dx-hani-thai")); + brk.setText(text); + List expected = new ArrayList(Arrays.asList( + 0, 5, 6, 16, 32 )); + List actuals = new ArrayList(); + int c = brk.first(); + do { + actuals.add(c); + } while ((c = 
brk.next()) != BreakIterator.DONE); + assertEquals("-u-dx- is not working", expected, actuals); + } }