Skip to content

Commit

Permalink
ICU-22564 Java API for the host env to register a low level break eng…
Browse files Browse the repository at this point in the history
…ine to break CJ + Southeastern Asia script
  • Loading branch information
FrankYFTang committed Nov 9, 2023
1 parent a7c7d8f commit c967994
Show file tree
Hide file tree
Showing 12 changed files with 213 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;


public class BurmeseBreakEngine extends DictionaryBreakEngine {
Expand Down Expand Up @@ -70,7 +71,7 @@ public int hashCode() {
}

/**
* Reports whether this engine handles the given code point: true only when the
* code point's Unicode script property value is MYANMAR.
* The locale argument is not consulted by this implementation.
*/
@Override
public boolean handles(int c) {
public boolean handles(int c, ULocale locale) {
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
return (script == UScript.MYANMAR);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

import com.ibm.icu.impl.CharacterIteration;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;

public abstract class DictionaryBreakEngine implements LanguageBreakEngine {

Expand Down Expand Up @@ -177,7 +178,7 @@ public DictionaryBreakEngine() {
}

/**
* Reports whether this engine handles the given code point, i.e. whether the
* code point is contained in fSet.
* The locale argument is not consulted by this implementation.
*/
@Override
public boolean handles(int c) {
public boolean handles(int c, ULocale locale) {
return fSet.contains(c); // we recognize the character
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;

public class KhmerBreakEngine extends DictionaryBreakEngine {

Expand Down Expand Up @@ -78,7 +79,7 @@ public int hashCode() {
}

/**
* Reports whether this engine handles the given code point: true only when the
* code point's Unicode script property value is KHMER.
* The locale argument is not consulted by this implementation.
*/
@Override
public boolean handles(int c) {
public boolean handles(int c, ULocale locale) {
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
return (script == UScript.KHMER);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.UResourceBundle;

/**
Expand Down Expand Up @@ -237,7 +238,7 @@ public int hashCode() {
}

/**
* Reports whether this engine handles the given code point: true only when the
* code point's Unicode script property value equals fScript.
* The locale argument is not consulted by this implementation.
*/
@Override
public boolean handles(int c) {
public boolean handles(int c, ULocale locale) {
return fScript == UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
package com.ibm.icu.impl.breakiter;

import java.text.CharacterIterator;
import com.ibm.icu.util.ULocale;

/**
* The LanguageBreakEngine interface is to be used to implement any
Expand All @@ -17,9 +18,10 @@
public interface LanguageBreakEngine {
/**
* @param c A Unicode codepoint value
* @param locale A locale
* @return true if the engine can handle this character, false otherwise
*/
boolean handles(int c);
boolean handles(int c, ULocale locale);

/**
* Implements the actual breaking logic. Find any breaks within a run in the supplied text.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;

public class LaoBreakEngine extends DictionaryBreakEngine {

Expand Down Expand Up @@ -78,7 +79,7 @@ public int hashCode() {
}

/**
* Reports whether this engine handles the given code point: true only when the
* code point's Unicode script property value is LAO.
* The locale argument is not consulted by this implementation.
*/
@Override
public boolean handles(int c) {
public boolean handles(int c, ULocale locale) {
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
return (script == UScript.LAO);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;

public class ThaiBreakEngine extends DictionaryBreakEngine {

Expand Down Expand Up @@ -89,7 +90,7 @@ public int hashCode() {
}

/**
* Reports whether this engine handles the given code point: true only when the
* code point's Unicode script property value is THAI.
* The locale argument is not consulted by this implementation.
*/
@Override
public boolean handles(int c) {
public boolean handles(int c, ULocale locale) {
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
return (script == UScript.THAI);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;

public final class UnhandledBreakEngine implements LanguageBreakEngine {
// TODO: Use two UnicodeSets, one with all frozen sets, one with unfrozen.
Expand All @@ -38,7 +39,7 @@ public UnhandledBreakEngine() {
}

/**
* Reports whether this engine handles the given code point, i.e. whether the
* code point is contained in fHandled.
* The locale argument is not consulted by this implementation.
*/
@Override
public boolean handles(int c) {
public boolean handles(int c, ULocale locale) {
return fHandled.contains(c);
}

Expand Down
19 changes: 19 additions & 0 deletions icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIterator.java
Original file line number Diff line number Diff line change
Expand Up @@ -1014,6 +1014,10 @@ public final ULocale getLocale(ULocale.Type type) {
this.actualLocale : this.validLocale;
}

/**
 * Returns the locale stored as this object's requested locale.
 *
 * @return the value of the requestedLocale field; null if it has not been assigned
 * @see com.ibm.icu.util.ULocale
 */
protected final ULocale getRequestedLocale() {
    return requestedLocale;
}

/**
* Set information about the locales that were used to create this
* object. If the object was not constructed from locale data,
Expand Down Expand Up @@ -1043,6 +1047,15 @@ final void setLocale(ULocale valid, ULocale actual) {
this.actualLocale = actual;
}

/**
 * Stores the locale that was requested when this object was constructed.
 *
 * @param requested the locale requested to construct; stored as-is, replacing
 *                  any previously stored value
 * @see com.ibm.icu.util.ULocale
 */
final void setRequestedLocale(ULocale requested) {
    requestedLocale = requested;
}

/**
* The most specific locale containing any resource data, or null.
* @see com.ibm.icu.util.ULocale
Expand All @@ -1056,5 +1069,11 @@ final void setLocale(ULocale valid, ULocale actual) {
*/
private ULocale actualLocale;

/**
* The locale requested when this object was constructed.
* Assigned through setRequestedLocale and read through getRequestedLocale.
* @see com.ibm.icu.util.ULocale
*/
private ULocale requestedLocale;

// -------- END ULocale boilerplate --------
}
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ private static BreakIterator createBreakInstance(ULocale locale, int kind) {
// TODO: Determine valid and actual locale correctly.
ULocale uloc = ULocale.forLocale(rb.getLocale());
iter.setLocale(uloc, uloc);
iter.setRequestedLocale(locale);

// filtered break
if (kind == BreakIterator.KIND_SENTENCE) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.util.CodePointTrie;
import com.ibm.icu.util.ULocale;

/**
* Rule Based Break Iterator
Expand Down Expand Up @@ -727,7 +728,7 @@ private LanguageBreakEngine getLanguageBreakEngine(int c) {
// We have a dictionary character.
// Does an already instantiated break engine handle it?
for (LanguageBreakEngine candidate : fBreakEngines) {
if (candidate.handles(c)) {
if (candidate.handles(c, getRequestedLocale())) {
return candidate;
}
}
Expand All @@ -737,7 +738,7 @@ private LanguageBreakEngine getLanguageBreakEngine(int c) {
// Check the global list, another break iterator may have instantiated the
// desired engine.
for (LanguageBreakEngine candidate : gAllBreakEngines) {
if (candidate.handles(c)) {
if (candidate.handles(c, getRequestedLocale())) {
fBreakEngines.add(candidate);
return candidate;
}
Expand Down Expand Up @@ -1081,6 +1082,35 @@ private static int CISetIndex32(CharacterIterator ci, int index) {
return ci.getIndex();
}

/**
 * Registers a new external break engine; the engine is adopted.
 *
 * <p>The engine is wrapped in a LanguageBreakEngine adapter and inserted at the
 * front (index 0) of the global engine list, while holding that list's monitor.
 * The adapter forwards engine lookup to {@code engine.isFor(c, locale)} and
 * forwards break finding to {@code engine.fillBreaks(...)}, pushing every
 * reported position onto the caller's deque in list order.</p>
 *
 * <p>Because ICU may cache break engines internally, this must be called at
 * application startup, prior to any calls to object methods of
 * RuleBasedBreakIterator, to avoid undefined behavior.</p>
 *
 * @param engine the ExternalBreakEngine instance to be adopted
 * @internal ICU 75 technology preview
 */
public static void registerExternalBreakEngine(ExternalBreakEngine engine) {
    // Adapter that exposes the external engine through the internal engine interface.
    LanguageBreakEngine adapter = new LanguageBreakEngine() {
        @Override
        public boolean handles(int c, ULocale locale) {
            return engine.isFor(c, locale);
        }

        @Override
        public int findBreaks(CharacterIterator text, int startPos, int endPos,
                DictionaryBreakEngine.DequeI foundBreaks, boolean isPhraseBreaking) {
            // Collect positions from the external engine, then copy them over in order.
            List<Integer> positions = new ArrayList<>();
            int breakCount = engine.fillBreaks(text, startPos, endPos, positions);
            for (Integer position : positions) {
                foundBreaks.push(position);
            }
            return breakCount;
        }
    };

    synchronized (gAllBreakEngines) {
        // Index 0 puts the new engine ahead of everything already in the global list.
        gAllBreakEngines.add(0, adapter);
    }
}

/** DictionaryCache stores the boundaries obtained from a run of dictionary characters.
* Dictionary boundaries are moved first to this cache, then from here
* to the main BreakCache, where they may inter-leave with non-dictionary
Expand Down Expand Up @@ -1885,7 +1915,43 @@ void dumpCache() {
};


/**
 * Contract for a break engine supplied from outside ICU and installed through
 * {@code registerExternalBreakEngine}.
 *
 * @internal ICU 75 technology preview
 */
public interface ExternalBreakEngine {

    /**
     * <p>Tells whether this engine handles a particular character when the
     * RuleBasedBreakIterator is used for a particular locale. The
     * RuleBasedBreakIterator uses this method to find a break engine.</p>
     *
     * @param c A character which begins a run that the engine might handle.
     * @param locale The locale.
     * @return true if this engine handles the particular character for that locale.
     * @internal ICU 75 technology preview
     */
    boolean isFor(int c, ULocale locale);

    /**
     * <p>Tells whether this engine handles a particular character. The
     * RuleBasedBreakIterator uses this method after it has already found a break
     * engine, to see which characters after the first one can be handled by
     * this break engine.</p>
     *
     * @param c A character that the engine might handle.
     * @return true if this engine handles the particular character.
     * @internal ICU 75 technology preview
     */
    boolean handles(int c);

    /**
     * <p>Divides up a range of text handled by this break engine.</p>
     *
     * @param text A CharacterIterator representing the text
     * @param rangeStart The start of the range of known characters
     * @param rangeEnd The end of the range of known characters
     * @param foundBreaks Output list of Integer values denoting break positions.
     * @return The number of breaks found
     * @internal ICU 75 technology preview
     */
    int fillBreaks(CharacterIterator text,
            int rangeStart,
            int rangeEnd,
            List<Integer> foundBreaks);
}

}

Loading

0 comments on commit c967994

Please sign in to comment.