From b032026b13dd3648d505ef6c6e821e5e0d60f908 Mon Sep 17 00:00:00 2001
From: Frank Tang <ftang@chromium.org>
Date: Tue, 14 Nov 2023 17:36:27 -0800
Subject: [PATCH] ICU-13219 add -u-dx- support to BreakIterator

ICU-13219 merge

ICU-13219

ICU-13219 optimize
---
 icu4c/source/common/brkiter.cpp               | 31 ++++++--
 icu4c/source/common/rbbi.cpp                  | 46 ++++++++++-
 icu4c/source/common/rbbi_cache.cpp            | 33 +++++---
 icu4c/source/common/unicode/brkiter.h         |  2 +-
 icu4c/source/common/unicode/rbbi.h            | 17 +++-
 icu4c/source/test/intltest/rbbitst.cpp        | 62 +++++++++++++++
 icu4c/source/test/intltest/rbbitst.h          |  2 +
 .../java/com/ibm/icu/text/BreakIterator.java  |  1 +
 .../ibm/icu/text/BreakIteratorFactory.java    |  7 +-
 .../ibm/icu/text/RuleBasedBreakIterator.java  | 79 ++++++++++++++++---
 .../com/ibm/icu/dev/test/rbbi/RBBITest.java   | 29 +++++++
 11 files changed, 271 insertions(+), 38 deletions(-)

diff --git a/icu4c/source/common/brkiter.cpp b/icu4c/source/common/brkiter.cpp
index b452cf2c0500..b34a4edecc92 100644
--- a/icu4c/source/common/brkiter.cpp
+++ b/icu4c/source/common/brkiter.cpp
@@ -1,4 +1,5 @@
 // © 2016 and later: Unicode, Inc. and others.
+//
 // License & terms of use: http://www.unicode.org/copyright.html
 /*
 *******************************************************************************
@@ -55,7 +56,7 @@ U_NAMESPACE_BEGIN
 // -------------------------------------
 
 BreakIterator*
-BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &status)
+BreakIterator::buildInstance(const Locale& loc, const char *type, bool checkDX, UErrorCode &status)
 {
     char fnbuff[256];
     char ext[4]={'\0'};
@@ -116,8 +117,22 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &st
         return nullptr;
     }
 
-    // Create a RuleBasedBreakIterator
-    result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != nullptr, status);
+    {
+        const char* dxs = nullptr;
+        CharString dxsValue; // keep on the stack till we no longer need dxs.
+        // If it is word or line instance, try to get the value for dx
+        if (checkDX) {
+            UErrorCode dxsStatus = U_ZERO_ERROR;
+            CharStringByteSink dxsSink(&dxsValue);
+            loc.getKeywordValue("dx", dxsSink, dxsStatus);
+            if (U_SUCCESS(dxsStatus) && dxsValue.length() > 0) {
+                dxs = dxsValue.data();
+            }
+        }
+
+        // Create a RuleBasedBreakIterator
+        result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != nullptr, dxs, status);
+    }
 
     // If there is a result, set the valid locale and actual locale, and the kind
     if (U_SUCCESS(status) && result != nullptr) {
@@ -421,14 +436,14 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
     case UBRK_CHARACTER:
         {
             UTRACE_ENTRY(UTRACE_UBRK_CREATE_CHARACTER);
-            result = BreakIterator::buildInstance(loc, "grapheme", status);
+            result = BreakIterator::buildInstance(loc, "grapheme", false, status);
             UTRACE_EXIT_STATUS(status);
         }
         break;
     case UBRK_WORD:
         {
             UTRACE_ENTRY(UTRACE_UBRK_CREATE_WORD);
-            result = BreakIterator::buildInstance(loc, "word", status);
+            result = BreakIterator::buildInstance(loc, "word", true, status);
             UTRACE_EXIT_STATUS(status);
         }
         break;
@@ -454,7 +469,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
                     uprv_strcat(lb_lw, value.data());
                 }
             }
-            result = BreakIterator::buildInstance(loc, lb_lw, status);
+            result = BreakIterator::buildInstance(loc, lb_lw, true, status);
 
             UTRACE_DATA1(UTRACE_INFO, "lb_lw=%s", lb_lw);
             UTRACE_EXIT_STATUS(status);
@@ -463,7 +478,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
     case UBRK_SENTENCE:
         {
             UTRACE_ENTRY(UTRACE_UBRK_CREATE_SENTENCE);
-            result = BreakIterator::buildInstance(loc, "sentence", status);
+            result = BreakIterator::buildInstance(loc, "sentence", false, status);
 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
             char ssKeyValue[kKeyValueLenMax] = {0};
             UErrorCode kvStatus = U_ZERO_ERROR;
@@ -482,7 +497,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
     case UBRK_TITLE:
         {
             UTRACE_ENTRY(UTRACE_UBRK_CREATE_TITLE);
-            result = BreakIterator::buildInstance(loc, "title", status);
+            result = BreakIterator::buildInstance(loc, "title", false, status);
             UTRACE_EXIT_STATUS(status);
         }
         break;
diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp
index 599279fb72bb..4cd7a20122fa 100644
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@@ -25,6 +25,7 @@
 #include "unicode/uchriter.h"
 #include "unicode/uclean.h"
 #include "unicode/udata.h"
+#include "unicode/uniset.h"
 
 #include "brkeng.h"
 #include "ucln_cmn.h"
@@ -89,9 +90,37 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode
 //
 //-------------------------------------------------------------------------------
 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseBreaking,
+                                               const char* dxs,
         UErrorCode &status) : RuleBasedBreakIterator(udm, status)
 {
     fIsPhraseBreaking = isPhraseBreaking;
+    if (U_FAILURE(status) || dxs == nullptr) {
+        return;  // Delegated construction failed, or no dx value was supplied.
+    }
+    // dxs must be a list of 4 letter script codes joined by '-',
+    // e.g. "thai" or "hani-thai"; anything else is rejected.
+    const size_t length = uprv_strlen(dxs);
+    bool wellFormed = (length % 5 == 4);
+    for (size_t i = 4; wellFormed && i < length; i += 5) {
+        wellFormed = (dxs[i] == '-');
+    }
+    if (!wellFormed) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+    // Build the set pattern: "thai" -> "[[:thai:]]", "thai-arab" -> "[[:thai:][:arab:]]".
+    UnicodeString pattern;
+    pattern.append(u'[');
+    for (size_t i = 0; i < length; i += 5) {
+        pattern.append(u"[:", -1);
+        pattern.append(UnicodeString(dxs + i, 4, US_INV));
+        pattern.append(u":]", -1);
+    }
+    pattern.append(u']');
+    fDX = new UnicodeSet(pattern, status);
+    if (fDX == nullptr && U_SUCCESS(status)) {
+        status = U_MEMORY_ALLOCATION_ERROR;  // ICU's new yields nullptr when out of memory.
+    }
 }
 
 //
@@ -198,7 +227,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator()
  * Simple Constructor with an error code.
  * Handles common initialization for all other constructors.
  */
-RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) {
+RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) : fDX(nullptr) {
     UErrorCode ec = U_ZERO_ERROR;
     if (status == nullptr) {
         status = &ec;
@@ -212,6 +241,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) {
     }
     fDictionaryCache = lpDictionaryCache.orphan();
     fBreakCache = lpBreakCache.orphan();
+    fDX = nullptr;
 
 #ifdef RBBI_DEBUG
     static UBool debugInitDone = false;
@@ -261,6 +291,9 @@ RuleBasedBreakIterator::~RuleBasedBreakIterator() {
     delete fDictionaryCache;
     fDictionaryCache = nullptr;
 
+    delete fDX;
+    fDX = nullptr;
+
     delete fLanguageBreakEngines;
     fLanguageBreakEngines = nullptr;
 
@@ -333,6 +366,7 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
     //       the assumption that the current position is on a rule boundary.
     fBreakCache->reset(fPosition, fRuleStatusIndex);
     fDictionaryCache->reset();
+    if (fDX != that.fDX) { delete fDX; fDX = (that.fDX == nullptr) ? nullptr : that.fDX->cloneAsThawed(); } // free old set first
 
     return *this;
 }
@@ -381,11 +415,15 @@ RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
         return false;
     }
 
+    // Unequal if exactly one side has fDX, or both have it and the sets differ (never deref null).
+    if ((fDX == nullptr) != (that2.fDX == nullptr) || (fDX != nullptr && !(*that2.fDX == *fDX))) {
+        return false;
+    }
     if (that2.fData == fData ||
         (fData != nullptr && that2.fData != nullptr && *that2.fData == *fData)) {
             // The two break iterators are using the same rules.
             return true;
-        }
+    }
     return false;
 }
 
@@ -1298,6 +1336,10 @@ RuleBasedBreakIterator::getRules() const {
     }
 }
 
+bool RuleBasedBreakIterator::excludedFromDictionaryBreak(int32_t c) {
+    return (fDX == nullptr) ? false : static_cast<bool>(fDX->contains(c)); // no dx set: nothing excluded
+}
+
 U_NAMESPACE_END
 
 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
diff --git a/icu4c/source/common/rbbi_cache.cpp b/icu4c/source/common/rbbi_cache.cpp
index f7a283f69e45..c08c27854284 100644
--- a/icu4c/source/common/rbbi_cache.cpp
+++ b/icu4c/source/common/rbbi_cache.cpp
@@ -156,20 +156,27 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
             break;
         }
 
-        // We now have a dictionary character. Get the appropriate language object
-        // to deal with it.
-        const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
-            c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));
-
-        // Ask the language object if there are any breaks. It will add them to the cache and
-        // leave the text pointer on the other side of its range, ready to search for the next one.
-        if (lbe != nullptr) {
-            foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
+        // We now have a dictionary character.
+        // Handle dx (Dictionary break script exclusions) first if needed
+        if (fBI->excludedFromDictionaryBreak(c)) {
+            utext_next32(text);
+            c = utext_current32(text);
+            // If we exclude the character, we treat it as AL
+            category = ucptrie_get(fBI->fData->fTrie, 'A');
+        } else {
+            // Get the appropriate language object to deal with it.
+            const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
+                c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));
+
+            // Ask the language object if there are any breaks. It will add them to the cache and
+            // leave the text pointer on the other side of its range, ready to search for the next one.
+            if (lbe != nullptr) {
+                foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
+            }
+            // Reload the loop variables for the next go-round
+            c = utext_current32(text);
+            category = ucptrie_get(fBI->fData->fTrie, c);
         }
-
-        // Reload the loop variables for the next go-round
-        c = utext_current32(text);
-        category = ucptrie_get(fBI->fData->fTrie, c);
     }
 
     // If we found breaks, ensure that the first and last entries are
diff --git a/icu4c/source/common/unicode/brkiter.h b/icu4c/source/common/unicode/brkiter.h
index 1b10e6ef1165..4af640bd0da0 100644
--- a/icu4c/source/common/unicode/brkiter.h
+++ b/icu4c/source/common/unicode/brkiter.h
@@ -623,7 +623,7 @@ class U_COMMON_API BreakIterator : public UObject {
     virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) = 0;
 
  private:
-    static BreakIterator* buildInstance(const Locale& loc, const char *type, UErrorCode& status);
+    static BreakIterator* buildInstance(const Locale& loc, const char *type, bool checkDX, UErrorCode& status);
     static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status);
     static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);
 
diff --git a/icu4c/source/common/unicode/rbbi.h b/icu4c/source/common/unicode/rbbi.h
index 045238ac5d79..dfe155d44d83 100644
--- a/icu4c/source/common/unicode/rbbi.h
+++ b/icu4c/source/common/unicode/rbbi.h
@@ -17,6 +17,7 @@
 #define RBBI_H
 
 #include "unicode/utypes.h"
+#include "unicode/uniset.h"
 
 #if U_SHOW_CPLUSPLUS_API
 
@@ -42,6 +43,7 @@ struct RBBIDataHeader;
 class  RBBIDataWrapper;
 class  UnhandledEngine;
 class  UStack;
+class  UnicodeSet;
 
 
 #ifndef U_HIDE_DRAFT_API
@@ -221,6 +223,12 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator {
      */
     UBool fIsPhraseBreaking = false;
 
+    /**
+     * A UnicodeSet for Dictionary Break Exclusion.
+     */
+    UnicodeSet* fDX = nullptr;
+private:
+
     //=======================================================================
     // constructors
     //=======================================================================
@@ -246,11 +254,12 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator {
      *        which will be responsible for closing it when it is no longer needed.
      * @param status Information on any errors encountered.
      * @param isPhraseBreaking true if phrase based breaking is required, otherwise false.
+     * @param dxs nullptr or a string to denote "Dictionary break script exclusions".
      * @see udata_open
      * @see #getBinaryRules
      * @internal (private)
      */
-    RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, UErrorCode &status);
+    RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, const char* dxs, UErrorCode &status);
 
     /** @internal */
     friend class RBBIRuleBuilder;
@@ -766,6 +775,12 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator {
      * signature)
      */
 
+    /*
+     * Check whether the character should be excluded from dictionary-based text breaking.
+     * @internal (private)
+     */
+    bool excludedFromDictionaryBreak(int32_t c);
+
     typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32);
 
     template<typename RowType, PTrieFunc trieFunc>
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp
index 2afe4b3912df..15a7570c2869 100644
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -106,6 +106,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
     TESTCASE_AUTO(TestLineBreaks);
     TESTCASE_AUTO(TestSentBreaks);
     TESTCASE_AUTO(TestExtended);
+    TESTCASE_AUTO(TestDXLineBreaks);
+    TESTCASE_AUTO(TestDXWordBreaks);
 #endif
 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
     TESTCASE_AUTO(TestMonkey);
@@ -3900,6 +3902,66 @@ void RBBITest::TestLineBreaks()
 #endif
 }
 
+void RBBITest::TestDXLineBreaks()
+{
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS
+    // Line breaking with Han and Thai excluded from dictionary-based breaking via -u-dx-.
+    const Locale dxLocale("ja-u-dx-hani-thai");
+    const UnicodeString sample(u"abcde 一二三四五六七八九十อิสราเอลโชว์คลิป");
+    const std::vector<int32_t> wanted{ 0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32 };
+    UErrorCode errorCode = U_ZERO_ERROR;
+    std::unique_ptr<BreakIterator> iter(BreakIterator::createLineInstance(dxLocale, errorCode));
+    TEST_ASSERT_SUCCESS(errorCode);
+    if (U_FAILURE(errorCode)) {
+        return;
+    }
+    iter->setText(sample);
+    // Collect every boundary from first() until next() reports DONE.
+    std::vector<int32_t> found;
+    for (int32_t pos = iter->first(); pos != BreakIterator::DONE; pos = iter->next()) {
+        found.push_back(pos);
+    }
+
+    assertEquals(WHERE,
+                 static_cast<int32_t>(wanted.size()),
+                 static_cast<int32_t>(found.size()));
+    // Compare element-wise only when the sizes agree, so indexing stays in range.
+    for (size_t i = 0; wanted.size() == found.size() && i < wanted.size(); i++) {
+        assertEquals(WHERE, wanted[i], found[i]);
+    }
+#endif
+}
+
+void RBBITest::TestDXWordBreaks()
+{
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS
+    // Word breaking with Han and Thai excluded from dictionary-based breaking via -u-dx-.
+    const Locale dxLocale("ja-u-dx-hani-thai");
+    const UnicodeString sample(u"abcde 一二三四五六七八九十อิสราเอลโชว์คลิป");
+    const std::vector<int32_t> wanted{ 0, 5, 6, 16, 32 };
+    UErrorCode errorCode = U_ZERO_ERROR;
+    std::unique_ptr<BreakIterator> iter(BreakIterator::createWordInstance(dxLocale, errorCode));
+    TEST_ASSERT_SUCCESS(errorCode);
+    if (U_FAILURE(errorCode)) {
+        return;
+    }
+    iter->setText(sample);
+    // Collect every boundary from first() until next() reports DONE.
+    std::vector<int32_t> found;
+    for (int32_t pos = iter->first(); pos != BreakIterator::DONE; pos = iter->next()) {
+        found.push_back(pos);
+    }
+
+    assertEquals(WHERE,
+                 static_cast<int32_t>(wanted.size()),
+                 static_cast<int32_t>(found.size()));
+    // Compare element-wise only when the sizes agree, so indexing stays in range.
+    for (size_t i = 0; wanted.size() == found.size() && i < wanted.size(); i++) {
+        assertEquals(WHERE, wanted[i], found[i]);
+    }
+#endif
+}
+
 void RBBITest::TestSentBreaks()
 {
 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h
index 537a537863ad..fdc228823695 100644
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@@ -99,6 +99,8 @@ class RBBITest: public IntlTest {
     void TestExternalBreakEngineWithFakeTaiLe();
     void TestExternalBreakEngineWithFakeYue();
 
+    void TestDXLineBreaks();
+    void TestDXWordBreaks();
 #if U_ENABLE_TRACING
     void TestTraceCreateCharacter();
     void TestTraceCreateWord();
diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIterator.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIterator.java
index 7223d22e4c34..fcafb597fbc1 100644
--- a/icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIterator.java
+++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIterator.java
@@ -17,6 +17,7 @@
 import com.ibm.icu.impl.CSCharacterIterator;
 import com.ibm.icu.impl.CacheValue;
 import com.ibm.icu.impl.ICUDebug;
+import com.ibm.icu.text.UnicodeSet;
 import com.ibm.icu.util.ICUCloneNotSupportedException;
 import com.ibm.icu.util.ULocale;
 
diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIteratorFactory.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIteratorFactory.java
index c78f36ed638b..0b8c02fcc55b 100644
--- a/icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIteratorFactory.java
+++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIteratorFactory.java
@@ -156,12 +156,17 @@ private static BreakIterator createBreakInstance(ULocale locale, int kind) {
             throw new MissingResourceException(e.toString(),"","");
         }
 
+        // Dictionary Break Exclusion
+        String dxValue = null;
+        if (kind == BreakIterator.KIND_LINE || kind == BreakIterator.KIND_WORD) {
+            dxValue = locale.getUnicodeLocaleType("dx");
+        }
         //
         // Create a normal RuleBasedBreakIterator.
         //
         try {
             boolean isPhraseBreaking = (brkfname != null) && brkfname.contains("phrase");
-            iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(bytes, isPhraseBreaking);
+            iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(bytes, isPhraseBreaking, dxValue);
         }
         catch (IOException e) {
             // Shouldn't be possible to get here.
diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedBreakIterator.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedBreakIterator.java
index d0cf532ec761..5f6782a31808 100644
--- a/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedBreakIterator.java
+++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedBreakIterator.java
@@ -40,6 +40,7 @@
 import com.ibm.icu.lang.UProperty;
 import com.ibm.icu.lang.UScript;
 import com.ibm.icu.util.CodePointTrie;
+import com.ibm.icu.text.UnicodeSet;
 
 /**
  * Rule Based Break Iterator
@@ -99,17 +100,57 @@ public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is
      *
      * @param bytes a buffer supplying the compiled binary rules.
      * @param phraseBreaking a flag indicating if phrase breaking is required.
+     * @param dxValues Dictionary break script exclusions.
      * @throws IOException if there is an error while reading the rules from the buffer.
+     * @throws IllegalArgumentException if dxValues is neither null nor a String in the
+     *         supported format.
      * @see    #compileRules(String, OutputStream)
      * @internal
      */
     /* package-potected */ static RuleBasedBreakIterator getInstanceFromCompiledRules(
-            ByteBuffer bytes, boolean phraseBreaking) throws IOException {
+            ByteBuffer bytes, boolean phraseBreaking, String dxValues) throws IOException, IllegalArgumentException {
         RuleBasedBreakIterator instance = getInstanceFromCompiledRules(bytes);
         instance.fPhraseBreaking = phraseBreaking;
+        instance.fDX = makeExcludedDictionaryBreakUnicodeSet(dxValues); // stays null when dxValues is null
         return instance;
     }
 
+    /**
+     * Create a UnicodeSet for the Dictionary Break Script Exclusions.
+     * @param dxs Dictionary break script exclusions: 4-letter script codes joined by "-",
+     *            for example "thai" or "hani-thai"; may be null.
+     * @return a set covering all of the listed scripts, or null if dxs is null.
+     * @throws IllegalArgumentException if dxs is not in the supported format.
+     * @internal
+     */
+    private static UnicodeSet makeExcludedDictionaryBreakUnicodeSet(
+        String dxs) throws IllegalArgumentException {
+        if (dxs == null) {
+            return null;
+        }
+        final int length = dxs.length();
+        boolean wellFormed = (length % 5 == 4);
+        for (int i = 4; wellFormed && i < length; i += 5) {
+            wellFormed = (dxs.charAt(i) == '-');
+        }
+        if (!wellFormed) {
+            throw new IllegalArgumentException("Incorrect value for dx key: " + dxs);
+        }
+        // Change from "thai" to "[[:thai:]]" or "thai-arab" to "[[:thai:][:arab:]]"
+        StringBuilder builder = new StringBuilder("[");
+        for (int i = 0; i < length; i += 5) {
+            builder.append("[:").append(dxs, i, i + 4).append(":]");
+        }
+        return new UnicodeSet(builder.append(']').toString());
+    }
+
+    /**
+     * Tells whether the code point is in the set excluded from dictionary-based text breaking.
+     */
+    private boolean excludedFromDictionaryBreak(int c) {
+        return (fDX == null) ? false : fDX.contains(c);
+    }
+
     /**
      * Create a break iterator from a precompiled set of break rules.
      *
@@ -180,6 +221,7 @@ public Object clone()  {
         result.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
         result.fBreakCache = result.new BreakCache(fBreakCache);
         result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache);
+        result.fDX = fDX; // fDX could be shared w/ other instance
         return result;
     }
 
@@ -206,6 +248,9 @@ public boolean equals(Object that) {
                     (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
                 return false;
             }
+            if (fDX == null ? other.fDX != null : !fDX.equals(other.fDX)) { // null-safe on both sides
+                return false;
+            }
             if (fText == null && other.fText == null) {
                 return true;
             }
@@ -305,6 +350,10 @@ public int hashCode()
      */
     private boolean            fPhraseBreaking = false;
 
+    /**
+     * UnicodeSet for Dictionary break script exclusions.
+     */
+    protected UnicodeSet       fDX = null;
 
     /**
      * Counter for the number of characters encountered with the "dictionary"
@@ -1229,19 +1278,25 @@ void populateDictionary(int startPos, int endPos,
                     break;
                 }
 
-                // We now have a dictionary character. Get the appropriate language object
-                // to deal with it.
-                LanguageBreakEngine lbe = getLanguageBreakEngine(c);
+                if (excludedFromDictionaryBreak(c)) {
+                    c = CharacterIteration.next32(fText);
+                    // treat character in dx as AL
+                    category = (short)fRData.fTrie.get('A');
+                } else {
+                    // We now have a dictionary character. Get the appropriate language object
+                    // to deal with it.
+                    LanguageBreakEngine lbe = getLanguageBreakEngine(c);
+
+                    // Ask the language object if there are any breaks. It will add them to the cache and
+                    // leave the text pointer on the other side of its range, ready to search for the next one.
+                    if (lbe != null) {
+                        foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks, fPhraseBreaking);
+                    }
 
-                // Ask the language object if there are any breaks. It will add them to the cache and
-                // leave the text pointer on the other side of its range, ready to search for the next one.
-                if (lbe != null) {
-                    foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks, fPhraseBreaking);
+                    // Reload the loop variables for the next go-round
+                    c = CharacterIteration.current32(fText);
+                    category = (short)fRData.fTrie.get(c);
                 }
-
-                // Reload the loop variables for the next go-round
-                c = CharacterIteration.current32(fText);
-                category = (short)fRData.fTrie.get(c);
             }
 
             // If we found breaks, ensure that the first and last entries are
diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITest.java
index 15f7265d0636..21e129ea7aec 100644
--- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITest.java
+++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITest.java
@@ -10,6 +10,7 @@
 
 import java.text.CharacterIterator;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 import java.util.Locale;
 
@@ -1003,4 +1004,32 @@ public int randomStringIndex() {
             assertEquals("preceding" + idx, fns.expectedPreceding(idx), bi.preceding(idx));
         }
     }
+    @Test
+    public void TestDXLineBreaks() {
+        // Line breaking with Han and Thai excluded from dictionary-based breaking via -u-dx-.
+        final ULocale dxLocale = ULocale.forLanguageTag("ja-u-dx-hani-thai");
+        final List<Integer> expected = Arrays.asList(
+                0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32);
+        BreakIterator iter = BreakIterator.getLineInstance(dxLocale);
+        iter.setText("abcde 一二三四五六七八九十อิสราเอลโชว์คลิป");
+        List<Integer> actuals = new ArrayList<Integer>();
+        for (int pos = iter.first(); pos != BreakIterator.DONE; pos = iter.next()) {
+            actuals.add(pos);
+        }
+        assertEquals("-u-dx- is not working", expected, actuals);
+    }
+    @Test
+    public void TestDXWordBreaks() {
+        // Word breaking with Han and Thai excluded from dictionary-based breaking via -u-dx-.
+        final ULocale dxLocale = ULocale.forLanguageTag("ja-u-dx-hani-thai");
+        final List<Integer> expected = Arrays.asList(
+                0, 5, 6, 16, 32);
+        BreakIterator iter = BreakIterator.getWordInstance(dxLocale);
+        iter.setText("abcde 一二三四五六七八九十อิสราเอลโชว์คลิป");
+        List<Integer> actuals = new ArrayList<Integer>();
+        for (int pos = iter.first(); pos != BreakIterator.DONE; pos = iter.next()) {
+            actuals.add(pos);
+        }
+        assertEquals("-u-dx- is not working", expected, actuals);
+    }
 }