Skip to content

Commit

Permalink
ICU-13219 add DX support to BreakIterator
Browse files Browse the repository at this point in the history
  • Loading branch information
FrankYFTang committed Nov 15, 2023
1 parent 511e5ef commit 1266364
Show file tree
Hide file tree
Showing 6 changed files with 128 additions and 13 deletions.
34 changes: 34 additions & 0 deletions icu4c/source/common/brkiter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "unicode/uloc.h"
#include "unicode/ures.h"
#include "unicode/ustring.h"
#include "unicode/uniset.h"
#include "unicode/filteredbrk.h"
#include "bytesinkutil.h"
#include "ucln_cmn.h"
Expand Down Expand Up @@ -143,6 +144,39 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &st
}
}

UErrorCode dxsStatus = U_ZERO_ERROR;
CharString dxs;
CharStringByteSink dxsSink(&dxs);
loc.getKeywordValue("dx", dxsSink, dxsStatus);
// Has "dx" in Locale.
if (U_SUCCESS(dxsStatus) && dxs.length() > 0) {
// The value should be a list of 4-letter script codes joined by '-',
// so a list of n codes has length 5 * n - 1 (i.e. length % 5 == 4).
if (dxs.length() % 5 != 4) {
status = U_ILLEGAL_ARGUMENT_ERROR;
delete result;
return nullptr;
}
int32_t items = 1 + (dxs.length() / 5);
// Change from "thai" to "[:thai:]" or
// "thai-arab" to "[[:thai:][:arab:]]"
UnicodeString udxs;
if (items > 1) {
udxs.append(u'[');
}
for (int32_t i = 0; i < items; i++) {
udxs.append(u"[:", -1);
udxs.append(UnicodeString(dxs.data() + i * 5, 4, US_INV));
udxs.append(u":]", -1);
}
if (items > 1) {
udxs.append(u']');
}
result->fDX = new UnicodeSet(udxs, status);
if (U_FAILURE(status)) {
delete result;
return nullptr;
}
}
return result;
}

Expand Down
5 changes: 5 additions & 0 deletions icu4c/source/common/rbbi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "unicode/uchriter.h"
#include "unicode/uclean.h"
#include "unicode/udata.h"
#include "unicode/uniset.h"

#include "brkeng.h"
#include "ucln_cmn.h"
Expand Down Expand Up @@ -212,6 +213,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) {
}
fDictionaryCache = lpDictionaryCache.orphan();
fBreakCache = lpBreakCache.orphan();
fDX = nullptr;

#ifdef RBBI_DEBUG
static UBool debugInitDone = false;
Expand Down Expand Up @@ -261,6 +263,9 @@ RuleBasedBreakIterator::~RuleBasedBreakIterator() {
delete fDictionaryCache;
fDictionaryCache = nullptr;

delete fDX;
fDX = nullptr;

delete fLanguageBreakEngines;
fLanguageBreakEngines = nullptr;

Expand Down
32 changes: 19 additions & 13 deletions icu4c/source/common/rbbi_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,20 +156,26 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
break;
}

// We now have a dictionary character. Get the appropriate language object
// to deal with it.
const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));

// Ask the language object if there are any breaks. It will add them to the cache and
// leave the text pointer on the other side of its range, ready to search for the next one.
if (lbe != nullptr) {
foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
// We now have a dictionary character.
// Handle dx (Dictionary break script exclusions) first if needed
if (fBI->fDX != nullptr && fBI->fDX->contains(c)) {
utext_next32(text);
c = utext_current32(text);
category = ucptrie_get(fBI->fData->fTrie, 'A'); // treat character in dx as AL
} else {
// Get the appropriate language object to deal with it.
const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));

// Ask the language object if there are any breaks. It will add them to the cache and
// leave the text pointer on the other side of its range, ready to search for the next one.
if (lbe != nullptr) {
foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
}
// Reload the loop variables for the next go-round
c = utext_current32(text);
category = ucptrie_get(fBI->fData->fTrie, c);
}

// Reload the loop variables for the next go-round
c = utext_current32(text);
category = ucptrie_get(fBI->fData->fTrie, c);
}

// If we found breaks, ensure that the first and last entries are
Expand Down
6 changes: 6 additions & 0 deletions icu4c/source/common/unicode/rbbi.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ struct RBBIDataHeader;
class RBBIDataWrapper;
class UnhandledEngine;
class UStack;
class UnicodeSet;


#ifndef U_HIDE_DRAFT_API
Expand Down Expand Up @@ -221,6 +222,11 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator {
*/
UBool fIsPhraseBreaking = false;

protected:
UnicodeSet* fDX =nullptr;

private:

//=======================================================================
// constructors
//=======================================================================
Expand Down
62 changes: 62 additions & 0 deletions icu4c/source/test/intltest/rbbitst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
TESTCASE_AUTO(TestLineBreaks);
TESTCASE_AUTO(TestSentBreaks);
TESTCASE_AUTO(TestExtended);
TESTCASE_AUTO(TestDXLineBreaks);
TESTCASE_AUTO(TestDXWordBreaks);
#endif
#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
TESTCASE_AUTO(TestMonkey);
Expand Down Expand Up @@ -3900,6 +3902,66 @@ void RBBITest::TestLineBreaks()
#endif
}

void RBBITest::TestDXLineBreaks()
{
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
UnicodeString text(u"abcde 一二三四五六七八九十อิสราเอลโชว์คลิป");
std::vector<int32_t> expected{ 0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32 };
Locale locale("ja-u-dx-hani-thai");
UErrorCode status = U_ZERO_ERROR;
std::unique_ptr<BreakIterator> bi(BreakIterator::createLineInstance(locale, status));
TEST_ASSERT_SUCCESS(status);
if (U_FAILURE(status)) {
return;
}
bi->setText(text);
int32_t c = bi->first();
std::vector<int32_t> actuals;
do {
actuals.push_back(c);
} while ((c = bi->next()) != BreakIterator::DONE );

assertEquals(WHERE,
static_cast<int32_t>(expected.size()),
static_cast<int32_t>(actuals.size()));
if (expected.size() == actuals.size()) {
for (size_t i = 0; i < expected.size(); i++) {
assertEquals(WHERE, expected[i], actuals[i]);
}
}
#endif
}

void RBBITest::TestDXWordBreaks()
{
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
UnicodeString text(u"abcde 一二三四五六七八九十อิสราเอลโชว์คลิป");
Locale locale("ja-u-dx-hani-thai");
std::vector<int32_t> expected{ 0, 5, 6, 16, 32 };
UErrorCode status = U_ZERO_ERROR;
std::unique_ptr<BreakIterator> bi(BreakIterator::createWordInstance(locale, status));
TEST_ASSERT_SUCCESS(status);
if (U_FAILURE(status)) {
return;
}
bi->setText(text);
int32_t c = bi->first();
std::vector<int32_t> actuals;
do {
actuals.push_back(c);
} while ((c = bi->next()) != BreakIterator::DONE );

assertEquals(WHERE,
static_cast<int32_t>(expected.size()),
static_cast<int32_t>(actuals.size()));
if (expected.size() == actuals.size()) {
for (size_t i = 0; i < expected.size(); i++) {
assertEquals(WHERE, expected[i], actuals[i]);
}
}
#endif
}

void RBBITest::TestSentBreaks()
{
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
Expand Down
2 changes: 2 additions & 0 deletions icu4c/source/test/intltest/rbbitst.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ class RBBITest: public IntlTest {
void TestExternalBreakEngineWithFakeTaiLe();
void TestExternalBreakEngineWithFakeYue();

void TestDXLineBreaks();
void TestDXWordBreaks();
#if U_ENABLE_TRACING
void TestTraceCreateCharacter();
void TestTraceCreateWord();
Expand Down

0 comments on commit 1266364

Please sign in to comment.