Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ICU-22564 Java API for the host env to register a low level break eng… #2697

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

import com.ibm.icu.impl.CharacterIteration;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;

public abstract class DictionaryBreakEngine implements LanguageBreakEngine {

Expand Down Expand Up @@ -176,6 +177,11 @@ public void removeAllElements() {
public DictionaryBreakEngine() {
}

/**
* Reports whether this engine applies to the given locale.
* This implementation ignores {@code locale} and always answers true.
*
* @param locale the locale being asked about (unused here)
* @return always true
*/
@Override
public boolean isFor(ULocale locale) {
return true; // by default, we handle all locales.
}

@Override
public boolean handles(int c) {
return fSet.contains(c); // we recognize the character
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
package com.ibm.icu.impl.breakiter;

import java.text.CharacterIterator;
import com.ibm.icu.util.ULocale;

/**
* The LanguageBreakEngine interface is to be used to implement any
Expand All @@ -21,6 +22,12 @@ public interface LanguageBreakEngine {
*/
boolean handles(int c);

/**
* Reports whether this engine may be used for the given locale.
* RuleBasedBreakIterator evaluates this, passing its requested locale,
* before it evaluates {@link #handles(int)} when picking an engine.
* @param locale A locale
* @return true if the engine is for this Locale, false otherwise
*/
boolean isFor(ULocale locale);

/**
* Implements the actual breaking logic. Find any breaks within a run in the supplied text.
* @param text The text to break over. The iterator is left at
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;

public final class UnhandledBreakEngine implements LanguageBreakEngine {
// TODO: Use two UnicodeSets, one with all frozen sets, one with unfrozen.
Expand All @@ -37,6 +38,10 @@ public final class UnhandledBreakEngine implements LanguageBreakEngine {
public UnhandledBreakEngine() {
}

/**
* Reports whether this engine applies to the given locale.
* This implementation ignores {@code locale} and always answers true.
*
* @param locale the locale being asked about (unused here)
* @return always true
*/
@Override
public boolean isFor(ULocale locale) {
return true; // Handle all locales
}
@Override
public boolean handles(int c) {
return fHandled.contains(c);
Expand Down
19 changes: 19 additions & 0 deletions icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIterator.java
Original file line number Diff line number Diff line change
Expand Up @@ -1014,6 +1014,10 @@ public final ULocale getLocale(ULocale.Type type) {
this.actualLocale : this.validLocale;
}

/**
* Returns the requested locale, i.e. the value most recently stored by
* {@code setRequestedLocale}.
* NOTE(review): presumably null when no requested locale was ever stored
* (the field has no visible initializer) -- confirm against all callers.
*
* @return the requested locale held by this object
*/
protected final ULocale getRequestedLocale() {
return this.requestedLocale;
}

/**
* Set information about the locales that were used to create this
* object. If the object was not constructed from locale data,
Expand Down Expand Up @@ -1043,6 +1047,15 @@ final void setLocale(ULocale valid, ULocale actual) {
this.actualLocale = actual;
}

/**
* Set the requested locale. This is kept separately from the valid and
* actual locales; the value is stored exactly as given, without validation.
* @param requested the locale that was requested when constructing this object.
* @see com.ibm.icu.util.ULocale
*/
final void setRequestedLocale(ULocale requested) {
this.requestedLocale = requested;
}

/**
* The most specific locale containing any resource data, or null.
* @see com.ibm.icu.util.ULocale
Expand All @@ -1056,5 +1069,11 @@ final void setLocale(ULocale valid, ULocale actual) {
*/
private ULocale actualLocale;

/**
* The requested locale, as recorded by {@code setRequestedLocale}.
* @see com.ibm.icu.util.ULocale
*/
private ULocale requestedLocale;

// -------- END ULocale boilerplate --------
}
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ private static BreakIterator createBreakInstance(ULocale locale, int kind) {
// TODO: Determine valid and actual locale correctly.
ULocale uloc = ULocale.forLocale(rb.getLocale());
iter.setLocale(uloc, uloc);
iter.setRequestedLocale(locale);

// filtered break
if (kind == BreakIterator.KIND_SENTENCE) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.util.CodePointTrie;
import com.ibm.icu.util.ULocale;

/**
* Rule Based Break Iterator
Expand Down Expand Up @@ -727,7 +728,7 @@ private LanguageBreakEngine getLanguageBreakEngine(int c) {
// We have a dictionary character.
// Does an already instantiated break engine handle it?
for (LanguageBreakEngine candidate : fBreakEngines) {
if (candidate.handles(c)) {
if (candidate.isFor(getRequestedLocale()) && candidate.handles(c)) {
return candidate;
}
}
Expand All @@ -737,7 +738,7 @@ private LanguageBreakEngine getLanguageBreakEngine(int c) {
// Check the global list, another break iterator may have instantiated the
// desired engine.
for (LanguageBreakEngine candidate : gAllBreakEngines) {
if (candidate.handles(c)) {
if (candidate.isFor(getRequestedLocale()) && candidate.handles(c)) {
fBreakEngines.add(candidate);
return candidate;
}
Expand Down Expand Up @@ -1081,6 +1082,39 @@ private static int CISetIndex32(CharacterIterator ci, int index) {
return ci.getIndex();
}

/**
* Register a new external break engine. The external break engine will be adopted.
* Because ICU may choose to cache break engine internally, this must
* be called at application startup, prior to any calls to
* object methods of RuleBasedBreakIterator to avoid undefined behavior.
Comment on lines +1086 to +1089
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
* Register a new external break engine. The external break engine will be adopted.
* Because ICU may choose to cache break engine internally, this must
* be called at application startup, prior to any calls to
* object methods of RuleBasedBreakIterator to avoid undefined behavior.
* Register a new external break engine. The external break engine will be adopted.
* Because ICU may choose to cache break engines internally,
* to avoid undefined behavior this must
* be called at application startup, prior to any calls to
* object methods of RuleBasedBreakIterator.

* @param engine the ExternalBreakEngine instance to be adopted
* @internal ICU 75 technology preview
*/
public static void registerExternalBreakEngine(ExternalBreakEngine engine) {
    // Adapter that exposes the externally supplied engine through the
    // internal LanguageBreakEngine interface by plain delegation.
    LanguageBreakEngine adapter = new LanguageBreakEngine() {
        @Override
        public boolean isFor(ULocale locale) {
            return engine.isFor(locale);
        }

        @Override
        public boolean handles(int c) {
            return engine.handles(c);
        }

        @Override
        public int findBreaks(CharacterIterator text, int startPos, int endPos,
                DictionaryBreakEngine.DequeI foundBreaks, boolean isPhraseBreaking) {
            // The external engine writes its boundaries into a fresh list; they
            // are then copied, in list order, onto the internal deque.
            List<Integer> positions = new ArrayList<>();
            int count = engine.fillBreaks(text, startPos, endPos, positions);
            for (int i = 0; i < positions.size(); i++) {
                foundBreaks.push(positions.get(i));
            }
            // The engine's own return value is passed through unchanged.
            return count;
        }
    };
    // Insert at index 0 so the newly registered engine precedes the entries
    // already present in the shared list.
    synchronized (gAllBreakEngines) {
        gAllBreakEngines.add(0, adapter);
    }
}

/** DictionaryCache stores the boundaries obtained from a run of dictionary characters.
* Dictionary boundaries are moved first to this cache, then from here
* to the main BreakCache, where they may inter-leave with non-dictionary
Expand Down Expand Up @@ -1885,7 +1919,42 @@ void dumpCache() {
};


public interface ExternalBreakEngine {
/**
* <p>Indicate whether this engine is used for a particular locale.
* This method is used by the RuleBasedBreakIterator to find a break engine;
* it is called with the break iterator's requested locale.</p>
*
* @param locale The locale.
* @return true if this engine is used for that locale.
* @internal ICU 75 technology preview
*/
public boolean isFor(ULocale locale);

/**
* <p>Indicate whether this engine handles a particular character. This method is
* used by the RuleBasedBreakIterator, together with {@code isFor}, when it looks
* for a break engine, and after it has already found a break engine to see which
* characters after the first one can be handled by this break engine.</p>
* @param c A character that the engine might handle.
* @return true if this engine handles the particular character.
* @internal ICU 75 technology preview
*/
public boolean handles(int c);

/**
* <p>Divide up a range of text handled by this break engine.</p>
*
* <p>When invoked through the adapter created by
* {@code registerExternalBreakEngine}, {@code foundBreaks} is a newly created,
* empty list; every Integer left in it on return is pushed, in list order,
* as a break position, and the return value is passed on unchanged.</p>
*
* @param text A CharacterIterator representing the text
* @param rangeStart The start of the range of known characters
* @param rangeEnd The end of the range of known characters
* @param foundBreaks Output list of Integer break positions, filled in by the implementation.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should specify whether the contents of the List must be cleared before calling, or whether the fillBreaks is required to clear internally.

* @return The number of breaks found
* @internal ICU 75 technology preview
*/
public int fillBreaks(CharacterIterator text,
int rangeStart,
int rangeEnd,
List<Integer> foundBreaks);
}

}

109 changes: 108 additions & 1 deletion icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITest.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import java.text.CharacterIterator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;

Expand All @@ -29,10 +30,10 @@
import com.ibm.icu.impl.RBBIDataWrapper;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.CodePointTrie;
import com.ibm.icu.util.ULocale;


@RunWith(JUnit4.class)
public class RBBITest extends CoreTestFmwk {
public RBBITest() {
Expand Down Expand Up @@ -1003,4 +1004,110 @@ public int randomStringIndex() {
assertEquals("preceding" + idx, fns.expectedPreceding(idx), bi.preceding(idx));
}
}

/**
 * Checks that an external break engine registered for the "yue" language is
 * used by a word break iterator created for that locale.
 *
 * First the boundaries produced by the root word iterator are verified, then
 * a fake engine is registered that covers U+4E00..U+9FFF and reports a break
 * after every second index of the range it is given, and finally the
 * boundaries produced by a "yue" word iterator are verified.
 */
@Test
public void TestExternalBreakEngineWithFakeYue() {
    String text = "a bc def一兩年前佢真係唔鍾意畀我影相i jk lmn";
    List<Integer> actual1 = new ArrayList<Integer>();
    BreakIterator bi1 = BreakIterator.getWordInstance(ULocale.ROOT);

    bi1.setText(text);
    do {
        actual1.add(bi1.current());
    } while(bi1.next() != BreakIterator.DONE);
    List<Integer> expected1 = new ArrayList<Integer>(Arrays.asList(
            0, 1, 2, 4, 5, 8, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20,
            21, 22, 23, 24, 26, 27, 30));
    // Argument order is (message, expected, actual) so that a failure
    // reports the two lists under the correct labels.
    assertEquals("root break Yue as Chinese", expected1, actual1);

    RuleBasedBreakIterator.registerExternalBreakEngine(
        new RuleBasedBreakIterator.ExternalBreakEngine() {
            UnicodeSet block = new UnicodeSet(0x4e00, 0x9FFF);
            @Override
            public boolean isFor(ULocale locale) {
                // We implement this for any locale with "yue" such as
                // "yue", "yue-CN", "yue-Hant-CN", etc.
                return locale.getLanguage().equals("yue");
            }
            @Override
            public boolean handles(int c) {
                return block.contains(c);
            }
            @Override
            public int fillBreaks(CharacterIterator text, int start, int end,
                    List<Integer> foundBreaks) {
                int current = start;
                int i = 0;
                // Report a break at every even offset from start, up to end.
                while (current++ < end) {
                    if ((current - start) % 2 == 0) {
                        foundBreaks.add(current);
                        i++;
                    }
                }
                text.setIndex(end);
                return i;
            }
        });

    List<Integer> actual2 = new ArrayList<Integer>();
    BreakIterator bi2 = BreakIterator.getWordInstance(new ULocale("yue"));
    bi2.setText(text);
    do {
        actual2.add(bi2.current());
    } while(bi2.next() != BreakIterator.DONE);
    List<Integer> expected2 = new ArrayList<Integer>(Arrays.asList(
            0, 1, 2, 4, 5, 8, 10, 12, 14, 16, 18, 20, 22, 23, 24, 26, 27, 30));
    assertEquals("break Yue by Fake external breaker", expected2, actual2);
}

/**
 * Checks that an external break engine registered for the Tai Le block is
 * used by a line break iterator created for the "tdd" locale.
 *
 * First the boundaries produced by the root line iterator are verified, then
 * a fake engine is registered that covers U+1950..U+197F and records the
 * index of every character in U+1970..U+1974 within the range it is given,
 * and finally the boundaries produced by a "tdd" line iterator are verified.
 */
@Test
public void TestExternalBreakEngineWithFakeTaiLe() {
    String text = "a bc defᥛᥫᥒᥰᥖᥭᥰᥞᥝᥰᥙᥥᥢᥛᥫᥒᥰᥑᥩᥢᥲᥔᥣᥝᥴᥓᥬᥖᥩᥢᥲᥛᥣᥝᥱᥙᥝᥱᥙᥤᥱᥓᥣᥒᥛᥣᥰᥓᥧ" +
            "ᥰᥘᥩᥰᥗᥪᥒᥴᥛᥣᥰᥘᥬᥰᥝᥣᥱᥘᥒᥱᥔᥣᥛᥴᥘᥫᥢi jk lmn";

    List<Integer> actual1 = new ArrayList<Integer>();
    BreakIterator bi1 = BreakIterator.getLineInstance(ULocale.ROOT);

    bi1.setText(text);
    do {
        actual1.add(bi1.current());
    } while(bi1.next() != BreakIterator.DONE);
    List<Integer> expected1 = new ArrayList<Integer>(Arrays.asList(0, 2, 5, 86, 89, 92));
    // Argument order is (message, expected, actual) so that a failure
    // reports the two lists under the correct labels.
    assertEquals("root break Tai Le", expected1, actual1);

    RuleBasedBreakIterator.registerExternalBreakEngine(
        new RuleBasedBreakIterator.ExternalBreakEngine() {
            UnicodeSet block = new UnicodeSet(0x1950, 0x197f);
            UnicodeSet tones = new UnicodeSet(0x1970, 0x1974);
            @Override
            public boolean isFor(ULocale locale) {
                return true; // Handle all locales
            }
            @Override
            public boolean handles(int c) {
                return block.contains(c);
            }
            @Override
            public int fillBreaks(CharacterIterator text, int rangeStart, int rangeEnd,
                    List<Integer> foundBreaks) {
                int i = 0;
                int c = text.setIndex(rangeStart);
                int current;
                // Walk the range and record the index of each tone character.
                while ((current = text.getIndex()) < rangeEnd) {
                    if (tones.contains(c)) {
                        i++;
                        foundBreaks.add(current);
                    }
                    c = text.next();
                }
                return i;
            }
        });


    List<Integer> actual2 = new ArrayList<Integer>();
    BreakIterator bi2 = BreakIterator.getLineInstance(new ULocale("tdd"));
    bi2.setText(text);
    do {
        actual2.add(bi2.current());
    } while(bi2.next() != BreakIterator.DONE);
    List<Integer> expected2 = new ArrayList<Integer>(Arrays.asList(
            0, 2, 5, 11, 14, 17, 24, 28, 32, 38, 42, 45, 48, 54, 57, 60, 64, 67,
            70, 73, 76, 80, 86, 89, 92));
    assertEquals("break Tai Le by Fake external breaker", expected2, actual2);
}
}