Skip to content

Commit

Permalink
ICU-13219 add -u-dx- support to BreakIterator
Browse files Browse the repository at this point in the history
  • Loading branch information
FrankYFTang committed Nov 16, 2023
1 parent 511e5ef commit b032026
Show file tree
Hide file tree
Showing 11 changed files with 271 additions and 38 deletions.
31 changes: 23 additions & 8 deletions icu4c/source/common/brkiter.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
// © 2016 and later: Unicode, Inc. and others.
//
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
Expand Down Expand Up @@ -55,7 +56,7 @@ U_NAMESPACE_BEGIN
// -------------------------------------

BreakIterator*
BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &status)
BreakIterator::buildInstance(const Locale& loc, const char *type, bool checkDX, UErrorCode &status)
{
char fnbuff[256];
char ext[4]={'\0'};
Expand Down Expand Up @@ -116,8 +117,22 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &st
return nullptr;
}

// Create a RuleBasedBreakIterator
result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != nullptr, status);
{
const char* dxs = nullptr;
CharString dxsValue; // keep on the stack till we no longer need dxs.
// If it is word or line instance, try to get the value for dx
if (checkDX) {
UErrorCode dxsStatus = U_ZERO_ERROR;
CharStringByteSink dxsSink(&dxsValue);
loc.getKeywordValue("dx", dxsSink, dxsStatus);
if (U_SUCCESS(dxsStatus) && dxsValue.length() > 0) {
dxs = dxsValue.data();
}
}

// Create a RuleBasedBreakIterator
result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != nullptr, dxs, status);
}

// If there is a result, set the valid locale and actual locale, and the kind
if (U_SUCCESS(status) && result != nullptr) {
Expand Down Expand Up @@ -421,14 +436,14 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
case UBRK_CHARACTER:
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_CHARACTER);
result = BreakIterator::buildInstance(loc, "grapheme", status);
result = BreakIterator::buildInstance(loc, "grapheme", false, status);
UTRACE_EXIT_STATUS(status);
}
break;
case UBRK_WORD:
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_WORD);
result = BreakIterator::buildInstance(loc, "word", status);
result = BreakIterator::buildInstance(loc, "word", true, status);
UTRACE_EXIT_STATUS(status);
}
break;
Expand All @@ -454,7 +469,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
uprv_strcat(lb_lw, value.data());
}
}
result = BreakIterator::buildInstance(loc, lb_lw, status);
result = BreakIterator::buildInstance(loc, lb_lw, true, status);

UTRACE_DATA1(UTRACE_INFO, "lb_lw=%s", lb_lw);
UTRACE_EXIT_STATUS(status);
Expand All @@ -463,7 +478,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
case UBRK_SENTENCE:
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_SENTENCE);
result = BreakIterator::buildInstance(loc, "sentence", status);
result = BreakIterator::buildInstance(loc, "sentence", false, status);
#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
char ssKeyValue[kKeyValueLenMax] = {0};
UErrorCode kvStatus = U_ZERO_ERROR;
Expand All @@ -482,7 +497,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
case UBRK_TITLE:
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_TITLE);
result = BreakIterator::buildInstance(loc, "title", status);
result = BreakIterator::buildInstance(loc, "title", false, status);
UTRACE_EXIT_STATUS(status);
}
break;
Expand Down
46 changes: 44 additions & 2 deletions icu4c/source/common/rbbi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "unicode/uchriter.h"
#include "unicode/uclean.h"
#include "unicode/udata.h"
#include "unicode/uniset.h"

#include "brkeng.h"
#include "ucln_cmn.h"
Expand Down Expand Up @@ -89,9 +90,37 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode
//
//-------------------------------------------------------------------------------
/**
 * Construct a break iterator from a compiled rule data image, optionally
 * attaching a "dictionary break script exclusion" (-u-dx-) set.
 *
 * @param udm              Compiled rule data, handed to the delegated
 *                         (udm, status) constructor.
 * @param isPhraseBreaking Value stored in fIsPhraseBreaking.
 * @param dxs              nullptr, or the value of the "dx" locale keyword:
 *                         one or more 4-letter script codes joined by '-'
 *                         (for example "thai" or "hani-thai").
 * @param status           Set to U_ILLEGAL_ARGUMENT_ERROR when dxs is malformed,
 *                         U_MEMORY_ALLOCATION_ERROR when the set cannot be
 *                         allocated, or whatever error the UnicodeSet pattern
 *                         parser reports. On any failure fDX is left as nullptr.
 */
RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseBreaking,
        const char* dxs,
        UErrorCode &status) : RuleBasedBreakIterator(udm, status)
{
    fIsPhraseBreaking = isPhraseBreaking;
    if (U_FAILURE(status) || dxs == nullptr) {
        return;
    }

    // The value must look like "xxxx" or "xxxx-yyyy-...": groups of exactly four
    // letters, separated by single '-' characters. That shape has length 5*k + 4.
    const size_t length = uprv_strlen(dxs);
    if (length % 5 != 4) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    // Verify every character before splicing the value into a UnicodeSet pattern,
    // so that a malformed keyword value cannot inject pattern syntax.
    for (size_t i = 0; i < length; i++) {
        bool valid;
        if (i % 5 == 4) {
            valid = (dxs[i] == '-');
        } else {
            valid = uprv_isASCIILetter(dxs[i]);
        }
        if (!valid) {
            status = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
    }

    const size_t items = 1 + length / 5;
    // Change from "thai" to "[:thai:]" or
    // "thai-arab" to "[[:thai:][:arab:]]"
    UnicodeString udxs;
    if (items > 1) {
        udxs.append(u'[');
    }
    for (size_t i = 0; i < items; i++) {
        udxs.append(u"[:", -1);
        udxs.append(UnicodeString(dxs + i * 5, 4, US_INV));
        udxs.append(u":]", -1);
    }
    if (items > 1) {
        udxs.append(u']');
    }

    fDX = new UnicodeSet(udxs, status);
    if (fDX == nullptr) {
        // Allocation failed, so the UnicodeSet constructor never ran and
        // status was not touched; report the failure explicitly.
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    if (U_FAILURE(status)) {
        // The pattern was rejected (e.g. unknown script code); do not keep
        // a half-initialized exclusion set around.
        delete fDX;
        fDX = nullptr;
    }
}

//
Expand Down Expand Up @@ -198,7 +227,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator()
* Simple Constructor with an error code.
* Handles common initialization for all other constructors.
*/
RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) {
RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) : fDX(nullptr) {
UErrorCode ec = U_ZERO_ERROR;
if (status == nullptr) {
status = &ec;
Expand All @@ -212,6 +241,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) {
}
fDictionaryCache = lpDictionaryCache.orphan();
fBreakCache = lpBreakCache.orphan();
fDX = nullptr;

#ifdef RBBI_DEBUG
static UBool debugInitDone = false;
Expand Down Expand Up @@ -261,6 +291,9 @@ RuleBasedBreakIterator::~RuleBasedBreakIterator() {
delete fDictionaryCache;
fDictionaryCache = nullptr;

delete fDX;
fDX = nullptr;

delete fLanguageBreakEngines;
fLanguageBreakEngines = nullptr;

Expand Down Expand Up @@ -333,6 +366,7 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
// the assumption that the current position is on a rule boundary.
fBreakCache->reset(fPosition, fRuleStatusIndex);
fDictionaryCache->reset();
fDX = (that.fDX == nullptr) ? nullptr : that.fDX->cloneAsThawed();

return *this;
}
Expand Down Expand Up @@ -381,11 +415,15 @@ RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
return false;
}

// If only one has fDX or they are not equal
if (!((that2.fDX == nullptr && fDX == nullptr) || *that2.fDX == *fDX)) {
return false;
}
if (that2.fData == fData ||
(fData != nullptr && that2.fData != nullptr && *that2.fData == *fData)) {
// The two break iterators are using the same rules.
return true;
}
}
return false;
}

Expand Down Expand Up @@ -1298,6 +1336,10 @@ RuleBasedBreakIterator::getRules() const {
}
}

// Reports whether code point c belongs to the dictionary break exclusion set.
// Returns false when this iterator has no exclusion set (fDX is nullptr).
bool RuleBasedBreakIterator::excludedFromDictionaryBreak(int32_t c) {
    if (fDX == nullptr) {
        return false;
    }
    return fDX->contains(c);
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
33 changes: 20 additions & 13 deletions icu4c/source/common/rbbi_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,20 +156,27 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
break;
}

// We now have a dictionary character. Get the appropriate language object
// to deal with it.
const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));

// Ask the language object if there are any breaks. It will add them to the cache and
// leave the text pointer on the other side of its range, ready to search for the next one.
if (lbe != nullptr) {
foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
// We now have a dictionary character.
// Handle dx (Dictionary break script exclusions) first if needed
if (fBI->excludedFromDictionaryBreak(c)) {
utext_next32(text);
c = utext_current32(text);
// If we exclude the character, we treat it as AL
category = ucptrie_get(fBI->fData->fTrie, 'A');
} else {
// Get the appropriate language object to deal with it.
const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));

// Ask the language object if there are any breaks. It will add them to the cache and
// leave the text pointer on the other side of its range, ready to search for the next one.
if (lbe != nullptr) {
foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
}
// Reload the loop variables for the next go-round
c = utext_current32(text);
category = ucptrie_get(fBI->fData->fTrie, c);
}

// Reload the loop variables for the next go-round
c = utext_current32(text);
category = ucptrie_get(fBI->fData->fTrie, c);
}

// If we found breaks, ensure that the first and last entries are
Expand Down
2 changes: 1 addition & 1 deletion icu4c/source/common/unicode/brkiter.h
Original file line number Diff line number Diff line change
Expand Up @@ -623,7 +623,7 @@ class U_COMMON_API BreakIterator : public UObject {
virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) = 0;

private:
static BreakIterator* buildInstance(const Locale& loc, const char *type, UErrorCode& status);
static BreakIterator* buildInstance(const Locale& loc, const char *type, bool checkDX, UErrorCode& status);
static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status);
static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);

Expand Down
17 changes: 16 additions & 1 deletion icu4c/source/common/unicode/rbbi.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#define RBBI_H

#include "unicode/utypes.h"
#include "unicode/uniset.h"

#if U_SHOW_CPLUSPLUS_API

Expand All @@ -42,6 +43,7 @@ struct RBBIDataHeader;
class RBBIDataWrapper;
class UnhandledEngine;
class UStack;
class UnicodeSet;


#ifndef U_HIDE_DRAFT_API
Expand Down Expand Up @@ -221,6 +223,12 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator {
*/
UBool fIsPhraseBreaking = false;

/**
* A UnicodeSet for Dictionary Break Exclusion.
*/
UnicodeSet* fDX = nullptr;
private:

//=======================================================================
// constructors
//=======================================================================
Expand All @@ -246,11 +254,12 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator {
* which will be responsible for closing it when it is no longer needed.
* @param status Information on any errors encountered.
* @param isPhraseBreaking true if phrase based breaking is required, otherwise false.
* @param dxs nullptr or a string to denote "Dictionary break script exclusions".
* @see udata_open
* @see #getBinaryRules
* @internal (private)
*/
RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, UErrorCode &status);
RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, const char* dxs, UErrorCode &status);

/** @internal */
friend class RBBIRuleBuilder;
Expand Down Expand Up @@ -766,6 +775,12 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator {
* signature)
*/

/*
* Check whether the character should be excluded from dictionary-based text breaking.
* @internal (private)
*/
bool excludedFromDictionaryBreak(int32_t c);

typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32);

template<typename RowType, PTrieFunc trieFunc>
Expand Down
62 changes: 62 additions & 0 deletions icu4c/source/test/intltest/rbbitst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
TESTCASE_AUTO(TestLineBreaks);
TESTCASE_AUTO(TestSentBreaks);
TESTCASE_AUTO(TestExtended);
TESTCASE_AUTO(TestDXLineBreaks);
TESTCASE_AUTO(TestDXWordBreaks);
#endif
#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
TESTCASE_AUTO(TestMonkey);
Expand Down Expand Up @@ -3900,6 +3902,66 @@ void RBBITest::TestLineBreaks()
#endif
}

void RBBITest::TestDXLineBreaks()
{
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
UnicodeString text(u"abcde 一二三四五六七八九十อิสราเอลโชว์คลิป");
std::vector<int32_t> expected{ 0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32 };
Locale locale("ja-u-dx-hani-thai");
UErrorCode status = U_ZERO_ERROR;
std::unique_ptr<BreakIterator> bi(BreakIterator::createLineInstance(locale, status));
TEST_ASSERT_SUCCESS(status);
if (U_FAILURE(status)) {
return;
}
bi->setText(text);
int32_t c = bi->first();
std::vector<int32_t> actuals;
do {
actuals.push_back(c);
} while ((c = bi->next()) != BreakIterator::DONE );

assertEquals(WHERE,
static_cast<int32_t>(expected.size()),
static_cast<int32_t>(actuals.size()));
if (expected.size() == actuals.size()) {
for (size_t i = 0; i < expected.size(); i++) {
assertEquals(WHERE, expected[i], actuals[i]);
}
}
#endif
}

void RBBITest::TestDXWordBreaks()
{
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
UnicodeString text(u"abcde 一二三四五六七八九十อิสราเอลโชว์คลิป");
Locale locale("ja-u-dx-hani-thai");
std::vector<int32_t> expected{ 0, 5, 6, 16, 32 };
UErrorCode status = U_ZERO_ERROR;
std::unique_ptr<BreakIterator> bi(BreakIterator::createWordInstance(locale, status));
TEST_ASSERT_SUCCESS(status);
if (U_FAILURE(status)) {
return;
}
bi->setText(text);
int32_t c = bi->first();
std::vector<int32_t> actuals;
do {
actuals.push_back(c);
} while ((c = bi->next()) != BreakIterator::DONE );

assertEquals(WHERE,
static_cast<int32_t>(expected.size()),
static_cast<int32_t>(actuals.size()));
if (expected.size() == actuals.size()) {
for (size_t i = 0; i < expected.size(); i++) {
assertEquals(WHERE, expected[i], actuals[i]);
}
}
#endif
}

void RBBITest::TestSentBreaks()
{
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
Expand Down
2 changes: 2 additions & 0 deletions icu4c/source/test/intltest/rbbitst.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ class RBBITest: public IntlTest {
void TestExternalBreakEngineWithFakeTaiLe();
void TestExternalBreakEngineWithFakeYue();

void TestDXLineBreaks();
void TestDXWordBreaks();
#if U_ENABLE_TRACING
void TestTraceCreateCharacter();
void TestTraceCreateWord();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import com.ibm.icu.impl.CSCharacterIterator;
import com.ibm.icu.impl.CacheValue;
import com.ibm.icu.impl.ICUDebug;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ICUCloneNotSupportedException;
import com.ibm.icu.util.ULocale;

Expand Down
Loading

0 comments on commit b032026

Please sign in to comment.