Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ICU-22564 Java API for the host env to register a low level break eng… #2697

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;


public class BurmeseBreakEngine extends DictionaryBreakEngine {
Expand Down Expand Up @@ -70,7 +71,7 @@ public int hashCode() {
}

@Override
public boolean handles(int c) {
public boolean handles(int c, ULocale locale) {
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
return (script == UScript.MYANMAR);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

import com.ibm.icu.impl.CharacterIteration;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;

public abstract class DictionaryBreakEngine implements LanguageBreakEngine {

Expand Down Expand Up @@ -177,7 +178,7 @@ public DictionaryBreakEngine() {
}

@Override
public boolean handles(int c) {
public boolean handles(int c, ULocale locale) {
return fSet.contains(c); // we recognize the character
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;

public class KhmerBreakEngine extends DictionaryBreakEngine {

Expand Down Expand Up @@ -78,7 +79,7 @@ public int hashCode() {
}

@Override
public boolean handles(int c) {
public boolean handles(int c, ULocale locale) {
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
return (script == UScript.KHMER);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.UResourceBundle;

/**
Expand Down Expand Up @@ -237,7 +238,7 @@ public int hashCode() {
}

@Override
public boolean handles(int c) {
public boolean handles(int c, ULocale locale) {
return fScript == UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
package com.ibm.icu.impl.breakiter;

import java.text.CharacterIterator;
import com.ibm.icu.util.ULocale;

/**
* The LanguageBreakEngine interface is to be used to implement any
Expand All @@ -17,9 +18,10 @@
public interface LanguageBreakEngine {
/**
* @param c A Unicode codepoint value
* @param locale A locale
* @return true if the engine can handle this character, false otherwise
*/
boolean handles(int c);
boolean handles(int c, ULocale locale);

/**
* Implements the actual breaking logic. Find any breaks within a run in the supplied text.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;

public class LaoBreakEngine extends DictionaryBreakEngine {

Expand Down Expand Up @@ -78,7 +79,7 @@ public int hashCode() {
}

@Override
public boolean handles(int c) {
public boolean handles(int c, ULocale locale) {
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
return (script == UScript.LAO);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;

public class ThaiBreakEngine extends DictionaryBreakEngine {

Expand Down Expand Up @@ -89,7 +90,7 @@ public int hashCode() {
}

@Override
public boolean handles(int c) {
public boolean handles(int c, ULocale locale) {
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
return (script == UScript.THAI);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;

public final class UnhandledBreakEngine implements LanguageBreakEngine {
// TODO: Use two UnicodeSets, one with all frozen sets, one with unfrozen.
Expand All @@ -38,7 +39,7 @@ public UnhandledBreakEngine() {
}

@Override
public boolean handles(int c) {
public boolean handles(int c, ULocale locale) {
return fHandled.contains(c);
}

Expand Down
19 changes: 19 additions & 0 deletions icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIterator.java
Original file line number Diff line number Diff line change
Expand Up @@ -1014,6 +1014,10 @@ public final ULocale getLocale(ULocale.Type type) {
this.actualLocale : this.validLocale;
}

/**
 * Returns the requested locale recorded for this object.
 * @return the locale stored in the requestedLocale field; presumably null
 *         if setRequestedLocale has never been called (the field has no initializer)
 * @see com.ibm.icu.util.ULocale
 */
protected final ULocale getRequestedLocale() {
return this.requestedLocale;
}

/**
* Set information about the locales that were used to create this
* object. If the object was not constructed from locale data,
Expand Down Expand Up @@ -1043,6 +1047,15 @@ final void setLocale(ULocale valid, ULocale actual) {
this.actualLocale = actual;
}

/**
 * Set the requested locale, i.e. the locale that the caller asked for
 * when this object was created.
 * @param requested the locale that was requested when constructing this object.
 * @see com.ibm.icu.util.ULocale
 */
final void setRequestedLocale(ULocale requested) {
this.requestedLocale = requested;
}

/**
* The most specific locale containing any resource data, or null.
* @see com.ibm.icu.util.ULocale
Expand All @@ -1056,5 +1069,11 @@ final void setLocale(ULocale valid, ULocale actual) {
*/
private ULocale actualLocale;

/**
* The locale requested in the constructor.
* @see com.ibm.icu.util.ULocale
*/
private ULocale requestedLocale;

// -------- END ULocale boilerplate --------
}
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ private static BreakIterator createBreakInstance(ULocale locale, int kind) {
// TODO: Determine valid and actual locale correctly.
ULocale uloc = ULocale.forLocale(rb.getLocale());
iter.setLocale(uloc, uloc);
iter.setRequestedLocale(locale);

// filtered break
if (kind == BreakIterator.KIND_SENTENCE) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.util.CodePointTrie;
import com.ibm.icu.util.ULocale;

/**
* Rule Based Break Iterator
Expand Down Expand Up @@ -727,7 +728,7 @@ private LanguageBreakEngine getLanguageBreakEngine(int c) {
// We have a dictionary character.
// Does an already instantiated break engine handle it?
for (LanguageBreakEngine candidate : fBreakEngines) {
if (candidate.handles(c)) {
if (candidate.handles(c, getRequestedLocale())) {
return candidate;
}
}
Expand All @@ -737,7 +738,7 @@ private LanguageBreakEngine getLanguageBreakEngine(int c) {
// Check the global list, another break iterator may have instantiated the
// desired engine.
for (LanguageBreakEngine candidate : gAllBreakEngines) {
if (candidate.handles(c)) {
if (candidate.handles(c, getRequestedLocale())) {
fBreakEngines.add(candidate);
return candidate;
}
Expand Down Expand Up @@ -1081,6 +1082,35 @@ private static int CISetIndex32(CharacterIterator ci, int index) {
return ci.getIndex();
}

/**
* Register a new external break engine. The external break engine will be adopted.
* Because ICU may choose to cache break engine internally, this must
* be called at application startup, prior to any calls to
* object methods of RuleBasedBreakIterator to avoid undefined behavior.
Comment on lines +1086 to +1089
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
* Register a new external break engine. The external break engine will be adopted.
* Because ICU may choose to cache break engine internally, this must
* be called at application startup, prior to any calls to
* object methods of RuleBasedBreakIterator to avoid undefined behavior.
* Register a new external break engine. The external break engine will be adopted.
* Because ICU may choose to cache break engines internally,
* to avoid undefined behavior this must
* be called at application startup, prior to any calls to
* object methods of RuleBasedBreakIterator.

* @param engine the ExternalBreakEngine instance to be adopted
* @internal ICU 75 technology preview
*/
public static void registerExternalBreakEngine(ExternalBreakEngine engine) {
    // Adapter exposing the external engine through the internal LanguageBreakEngine interface.
    LanguageBreakEngine adapter = new LanguageBreakEngine() {
        @Override
        public boolean handles(int c, ULocale locale) {
            // Engine selection is delegated to the locale-aware isFor() check.
            return engine.isFor(c, locale);
        }

        @Override
        public int findBreaks(CharacterIterator text, int startPos, int endPos,
                DictionaryBreakEngine.DequeI foundBreaks, boolean isPhraseBreaking) {
            // The external engine reports its breaks into a plain list, which is then
            // copied, in order, into the caller-supplied deque.
            List<Integer> positions = new ArrayList<>();
            int breakCount = engine.fillBreaks(text, startPos, endPos, positions);
            for (Integer position : positions) {
                foundBreaks.push(position);
            }
            return breakCount;
        }
    };

    // Insert at the front of the global engine list while holding its lock.
    synchronized (gAllBreakEngines) {
        gAllBreakEngines.add(0, adapter);
    }
}

/** DictionaryCache stores the boundaries obtained from a run of dictionary characters.
* Dictionary boundaries are moved first to this cache, then from here
* to the main BreakCache, where they may inter-leave with non-dictionary
Expand Down Expand Up @@ -1885,7 +1915,43 @@ void dumpCache() {
};


public interface ExternalBreakEngine {
/**
* <p>Indicate whether this engine handles a particular character when
* the RuleBasedBreakIterator is used for a particular locale. This method is used
* by the RuleBasedBreakIterator to find a break engine.</p>
* @param c A character which begins a run that the engine might handle.
* @param locale The locale.
* @return true if this engine handles the particular character for that locale.
* @internal ICU 75 technology preview
*/
public boolean isFor(int c, ULocale locale);

/**
* <p>Indicate whether this engine handles a particular character. This method is
* used by the RuleBasedBreakIterator after it has already found a break engine, to see which
* characters after the first one can be handled by this break engine.</p>
* @param c A character that the engine might handle.
* @return true if this engine handles the particular character.
* @internal ICU 75 technology preview
*/
public boolean handles(int c);

/**
* <p>Divide up a range of text handled by this break engine.</p>
*
* @param text A CharacterIterator representing the text
* @param rangeStart The start of the range of known characters
* @param rangeEnd The end of the range of known characters
* @param foundBreaks Output of a list of Integer to denote break positions.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should specify whether the contents of the List must be cleared before calling, or whether the fillBreaks is required to clear internally.

* @return The number of breaks found
* @internal ICU 75 technology preview
*/
public int fillBreaks(CharacterIterator text,
int rangeStart,
int rangeEnd,
List<Integer> foundBreaks);
}

}

Loading