Skip to content

Commit

Permalink
feat: EditorUtils uses document's locale for splitting words (#1175)
Browse files Browse the repository at this point in the history
* feat: EditorUtils

- Update javadoc for getWordEnd and getWordStart utility functions
- Remove "TODO: change to use document's locale"
- grab word with source and target locale of the project

Signed-off-by: Hiroshi Miura <[email protected]>

* chore: add test case in English first steps pane

Signed-off-by: Hiroshi Miura <[email protected]>

* chore: add task dependency for acceptance test

Signed-off-by: Hiroshi Miura <[email protected]>

* test: add the case

- test EditorUtils.getWord* with loaded project from Chinese to Japanese in English environment

Signed-off-by: Hiroshi Miura <[email protected]>

* refactor: reduce duplicated code

Signed-off-by: Hiroshi Miura <[email protected]>

* fix: fix copyright header typo

Signed-off-by: Hiroshi Miura <[email protected]>

* feat: use ICU4J for BreakIterator to support CJ

- Add unit test for getBoundary method with English, Japanese and Chinese

Signed-off-by: Hiroshi Miura <[email protected]>

* refactor: give locale by callers of EditorUtils

Signed-off-by: Hiroshi Miura <[email protected]>

* docs: javadoc of EditorUtils

- Update javadoc
- @deprecated for methods with old signature
- Simplify some lines

Signed-off-by: Hiroshi Miura <[email protected]>

* chore: bump [email protected]

Signed-off-by: Hiroshi Miura <[email protected]>

* chore: fix typo in dependency

Signed-off-by: Hiroshi Miura <[email protected]>

* chore: fix degraded merge for dependencies

Signed-off-by: Hiroshi Miura <[email protected]>

---------

Signed-off-by: Hiroshi Miura <[email protected]>
  • Loading branch information
miurahr authored Dec 12, 2024
1 parent fb6451f commit 20ffe61
Show file tree
Hide file tree
Showing 20 changed files with 389 additions and 31 deletions.
3 changes: 2 additions & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ dependencies {
}
runtimeOnly(libs.language.detector)
runtimeOnly(libs.dumont.hunspell)
implementation(libs.icu4j)

// Lucene for tokenizers
implementation(libs.bundles.lucene)
Expand Down Expand Up @@ -1702,7 +1703,7 @@ tasks.register('testAcceptance', Test) {
classpath = sourceSets.testAcceptance.runtimeClasspath
systemProperties = System.properties
systemProperty 'java.util.logging.config.file', "${rootDir}/config/test/logger.properties"

dependsOn firstStepsEn
dependsOn ':aligner:jar'
}

Expand Down
4 changes: 2 additions & 2 deletions gradle/libs.versions.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ commons_io = "2.16.1"
commons_text = "1.11.0"
commons_validator = "1.9.0"
jsoup = "1.18.2"
icu4j = { require = "[70,73.2[", prefer = "72.1" }
icu4j = { require = "[71.1,76.1[", prefer = "74.2" }
stax2api = "4.2.2"
woodstox = "6.5.0"
languagetool = "6.1"
Expand Down Expand Up @@ -65,7 +65,7 @@ commons-lang3 = {group = "org.apache.commons", name = "commons-lang3", version.r
commons-text = {group = "org.apache.commons", name = "commons-text", version.ref = "commons_text"}
commons-validator = {group = "commons-validator", name = "commons-validator", version.ref = "commons_validator"}
jsoup = {group = "org.jsoup", name = "jsoup", version.ref = "jsoup"}
icj4j = {group = "com.ibm.icu", name = "icu4j", version.ref = "icu4j"}
icu4j = {group = "com.ibm.icu", name = "icu4j", version.ref = "icu4j"}
stax2-api = {group = "org.codehaus.woodstox", name = "stax2-api", version.ref = "stax2api"}
woodstox-core = {group = "com.fasterxml.woodstox", name = "woodstox-core", version.ref = "woodstox"}
languagetool-all = {group = "org.languagetool", name = "language-all", version.ref = "languagetool"}
Expand Down
4 changes: 2 additions & 2 deletions language-modules/ja/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ dependencies {
exclude module: 'icu4j'
}
implementation(dependencies.variantOf(libs.lucene.gosen) { classifier("ipadic") })
implementation(libs.icj4j)
compileOnly(libs.icu4j)
}

testImplementation(libs.junit4)
Expand All @@ -43,7 +43,7 @@ dependencies {
exclude module: 'icu4j'
}
testRuntimeOnly(dependencies.variantOf(libs.lucene.gosen) { classifier("ipadic") })
testRuntimeOnly(libs.icj4j)
testRuntimeOnly(libs.icu4j)

testImplementation(libs.assertj)
testImplementation(testFixtures(project.rootProject))
Expand Down
12 changes: 9 additions & 3 deletions src/org/omegat/gui/editor/EditorController.java
Original file line number Diff line number Diff line change
Expand Up @@ -704,9 +704,12 @@ protected void loadDocument() {

doc.setDocumentFilter(new DocumentFilter3());

// add locate for target language to editor
// add locales to editor
Locale targetLocale = Core.getProject().getProjectProperties().getTargetLanguage().getLocale();
editor.setLocale(targetLocale);
editor.setTargetLocale(targetLocale);
Locale sourceLocale = Core.getProject().getProjectProperties().getSourceLanguage().getLocale();
editor.setSourceLocale(sourceLocale);

editor.setDocument(doc);

Expand Down Expand Up @@ -1639,8 +1642,9 @@ public void changeCase(CHANGE_CASE_TO toWhat) {
try {
// no selection? make it the current word
if (start == end) {
start = EditorUtils.getWordStart(editor, start);
end = EditorUtils.getWordEnd(editor, end);
Locale locale = Core.getProject().getProjectProperties().getTargetLanguage().getLocale();
start = EditorUtils.getWordStart(editor, start, locale);
end = EditorUtils.getWordEnd(editor, end, locale);

// adjust the bound again
if (start < translationStart && end <= translationEnd) {
Expand Down Expand Up @@ -1947,6 +1951,7 @@ private void createAdditionalPanes() {
.setComponentOrientation(BiDiUtils.isRtl(language) ? ComponentOrientation.RIGHT_TO_LEFT
: ComponentOrientation.LEFT_TO_RIGHT);
introPane.setEditable(false);
introPane.setName("IntroPane");
DragTargetOverlay.apply(introPane, dropInfo);
URI uri = Help.getHelpFileURI(OConsts.HELP_FIRST_STEPS_PREFIX, language, OConsts.HELP_FIRST_STEPS);
if (uri != null) {
Expand All @@ -1958,6 +1963,7 @@ private void createAdditionalPanes() {
emptyProjectPaneTitle = OStrings.getString("TF_INTRO_EMPTYPROJECT_FILENAME");
emptyProjectPane = new JTextPane();
emptyProjectPane.setEditable(false);
emptyProjectPane.setName("EmptyProjectPane");
emptyProjectPane.setText(OStrings.getString("TF_INTRO_EMPTYPROJECT"));
emptyProjectPane.setFont(mw.getApplicationFont());
DragTargetOverlay.apply(emptyProjectPane, dropInfo);
Expand Down
22 changes: 20 additions & 2 deletions src/org/omegat/gui/editor/EditorTextArea3.java
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;

import javax.swing.JEditorPane;
import javax.swing.JPopupMenu;
Expand Down Expand Up @@ -141,6 +142,9 @@ public class EditorTextArea3 extends JEditorPane {
*/
protected boolean overtypeMode = false;

private Locale targetLocale;
private Locale sourceLocale;

public EditorTextArea3(EditorController controller) {
this.controller = controller;
setEditorKit(new StyledEditorKit() {
Expand All @@ -165,10 +169,16 @@ protected void createInputAttributes(Element element, MutableAttributeSet set) {
c.setBlinkRate(getCaret().getBlinkRate());
setCaret(c);

sourceLocale = getLocale();
targetLocale = getLocale();

addCaretListener(e -> {
try {
int start = EditorUtils.getWordStart(EditorTextArea3.this, e.getMark());
int end = EditorUtils.getWordEnd(EditorTextArea3.this, e.getMark());
// Detection of target string locale.
// It uses a source or a target language as a processing locale.
Locale locale = isInActiveTranslation(e.getMark()) ? targetLocale : sourceLocale;
int start = EditorUtils.getWordStart(EditorTextArea3.this, e.getMark(), locale);
int end = EditorUtils.getWordEnd(EditorTextArea3.this, e.getMark(), locale);
if (end - start <= 0) {
// word not defined
return;
Expand Down Expand Up @@ -200,6 +210,14 @@ public void setFont(Font font) {
}
}

/**
 * Sets the target locale of this editor.
 * <p>
 * The caret listener uses this locale for word boundary detection when
 * the caret mark is inside the active translation.
 *
 * @param targetLocale
 *            locale to store as the target locale.
 */
void setTargetLocale(Locale targetLocale) {
this.targetLocale = targetLocale;
}

/**
 * Sets the source locale of this editor.
 * <p>
 * The caret listener uses this locale for word boundary detection when
 * the caret mark is not inside the active translation.
 *
 * @param sourceLocale
 *            locale to store as the source locale.
 */
void setSourceLocale(Locale sourceLocale) {
this.sourceLocale = sourceLocale;
}

/**
* Return OmDocument instead just a Document. If editor was not initialized
* with OmDocument, it will contains other Document implementation. In this
Expand Down
107 changes: 89 additions & 18 deletions src/org/omegat/gui/editor/EditorUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,13 @@
import java.util.Locale;

import javax.swing.text.BadLocationException;
import javax.swing.text.Document;
import javax.swing.text.Element;
import javax.swing.text.JTextComponent;
import javax.swing.text.Utilities;

import com.ibm.icu.text.BreakIterator;

import org.omegat.core.Core;
import org.omegat.core.data.ProtectedPart;
import org.omegat.core.data.SourceTextEntry;
Expand Down Expand Up @@ -61,15 +65,30 @@ private EditorUtils() {
* Determines the start of a word for the given model location. This method
* skips direction char.
*
* TODO: change to use document's locale
*
* @param c
* @param offs
* @return
* @param c TextComponent of the editor area.
* @param offs offset of the text.
* @return position of word start on the text component.
* @throws BadLocationException
* when there is no line found in the text component.
*/
@Deprecated
public static int getWordStart(JTextComponent c, int offs) throws BadLocationException {
int result = Utilities.getWordStart(c, offs);
return getWordStart(c, offs, c.getLocale());
}

/**
* Determines the start of a word for the given model location. This method
* skips direction char.
*
* @param c TextComponent of the editor area.
* @param offs offset of the text.
* @param locale locale of the text.
* @return position of word start on the text component.
* @throws BadLocationException
* when there is no line found in the text component.
*/
public static int getWordStart(JTextComponent c, int offs, Locale locale) throws BadLocationException {
int result = getWordBoundary(c, offs, locale, false);
char ch = c.getDocument().getText(result, 1).charAt(0);
if (isDirectionChar(ch)) {
result++;
Expand All @@ -81,15 +100,30 @@ public static int getWordStart(JTextComponent c, int offs) throws BadLocationExc
* Determines the end of a word for the given model location. This method
* skips direction char.
*
* TODO: change to use document's locale
*
* @param c
* @param offs
* @return
* @param c TextComponent of the editor area.
* @param offs offset of the text.
* @return position of the word end on the text component.
* @throws BadLocationException
* when there is no line found in the text component.
*/
@Deprecated
public static int getWordEnd(JTextComponent c, int offs) throws BadLocationException {
int result = Utilities.getWordEnd(c, offs);
return getWordEnd(c, offs, c.getLocale());
}

/**
* Determines the end of a word for the given model location. This method
* skips direction char.
*
* @param c TextComponent of the editor area.
* @param offs offset of the text.
* @param locale locale of the text.
* @return position of the word end on the text component.
* @throws BadLocationException
* when there is no line found in the text component.
*/
public static int getWordEnd(JTextComponent c, int offs, Locale locale) throws BadLocationException {
int result = getWordBoundary(c, offs, locale, true);
if (result > 0) {
char ch = c.getDocument().getText(result - 1, 1).charAt(0);
if (isDirectionChar(ch)) {
Expand All @@ -99,6 +133,46 @@ public static int getWordEnd(JTextComponent c, int offs) throws BadLocationExcep
return result;
}

/**
 * Locates a word boundary around the given document offset.
 * <p>
 * The paragraph containing the offset is extracted as a string and the
 * boundary is computed inside that string with the given locale; the
 * result is then translated back to a document offset.
 *
 * @param c TextComponent of the editor area.
 * @param offs offset of the text.
 * @param locale locale of the text.
 * @param end true to get the end of the word, false to get its start.
 * @return document offset of the boundary, or {@code offs} itself when
 *         the paragraph has no text.
 * @throws BadLocationException
 *             when there is no line found in the text component.
 */
private static int getWordBoundary(JTextComponent c, int offs, Locale locale, boolean end) throws BadLocationException {
    Element paragraph = Utilities.getParagraphElement(c, offs);
    if (paragraph == null) {
        throw new BadLocationException("No word at " + offs, offs);
    }
    int from = paragraph.getStartOffset();
    Document document = c.getDocument();
    // The paragraph element may extend past the document content; clip it.
    int to = Math.min(paragraph.getEndOffset(), document.getLength());
    int length = to - from;
    if (length <= 0) {
        // Nothing to analyse; keep the offset as is.
        return offs;
    }
    String text = document.getText(from, length);
    return from + getWordBoundary(locale, text, offs - from, end);
}

/**
 * Get word boundary.
 * <p>
 * When the end argument is true, return a word end.
 * Otherwise, return a start of word.
 * Boundaries are computed with an ICU4J word {@link BreakIterator}
 * created for the given locale.
 *
 * @param locale locale of the line string.
 * @param lineString a string of the line.
 * @param wordPosition target position of the line. A position at or
 *            beyond the end of the line is treated as the last
 *            character of the line.
 * @param end return end of word, otherwise start of word.
 * @return index of the word boundary in the line string; 0 when the
 *         line string is empty or when there is no boundary before the
 *         position.
 */
static int getWordBoundary(Locale locale, String lineString, int wordPosition, boolean end) {
    if (lineString.isEmpty()) {
        // No text to split; the only boundary is the start of the line.
        return 0;
    }
    BreakIterator words = BreakIterator.getWordInstance(locale);
    words.setText(lineString);
    // Clamp the position so that a boundary always follows it.
    int position = Math.min(wordPosition, words.last() - 1);
    int wordEnd = words.following(position);
    if (end) {
        return wordEnd;
    }
    // Step back from the word end to reach the start of the same word.
    int wordStart = words.previous();
    // previous() yields DONE when there is no earlier boundary; never
    // leak that sentinel to callers as an index.
    return wordStart == BreakIterator.DONE ? 0 : wordStart;
}

/**
* Check if char is direction char(u202A,u202B,u202C).
*
Expand Down Expand Up @@ -420,7 +494,7 @@ public static String addBidiAroundTags(String text, SourceTextEntry ste) {
StringBuilder s = new StringBuilder(text.length() * 12 / 10);
for (Tag t : tags) {
if (pos < t.pos) {
s.append(text.substring(pos, t.pos));
s.append(text, pos, t.pos);
}
s.append(SegmentBuilder.BIDI_RLM_CHAR);
s.append(SegmentBuilder.BIDI_LRM_CHAR);
Expand All @@ -437,11 +511,8 @@ public static String addBidiAroundTags(String text, SourceTextEntry ste) {

public static boolean hasBidiAroundTag(String text, String tag, int pos) {
try {
boolean has = true;
if (text.charAt(pos - 1) != SegmentBuilder.BIDI_LRM_CHAR
|| text.charAt(pos - 2) != SegmentBuilder.BIDI_RLM_CHAR) {
has = false;
}
boolean has = text.charAt(pos - 1) == SegmentBuilder.BIDI_LRM_CHAR
&& text.charAt(pos - 2) == SegmentBuilder.BIDI_RLM_CHAR;
if (text.charAt(pos + tag.length()) != SegmentBuilder.BIDI_LRM_CHAR
|| text.charAt(pos + tag.length() + 1) != SegmentBuilder.BIDI_RLM_CHAR) {
has = false;
Expand Down
3 changes: 3 additions & 0 deletions test-acceptance/data/project_CN_JP/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
project_stats.txt
project_stats.json
*.bak
Empty file.
3 changes: 3 additions & 0 deletions test-acceptance/data/project_CN_JP/glossary/glossary.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Glossary in tab-separated format -*- coding: utf-8 -*-
介绍 紹介
中的 中心的な
33 changes: 33 additions & 0 deletions test-acceptance/data/project_CN_JP/omegat.project
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
<?xml version='1.0' encoding='UTF-8'?>
<omegat>
<project version="1.0">
<source_dir>source</source_dir>
<source_dir_excludes>
<mask>**/.svn/**</mask>
<mask>**/CVS/**</mask>
<mask>**/.cvs/**</mask>
<mask>**/.git/**</mask>
<mask>**/.hg/**</mask>
<mask>**/.repositories/**</mask>
<mask>**/desktop.ini</mask>
<mask>**/Thumbs.db</mask>
<mask>**/.DS_Store</mask>
<mask>**/~$*</mask>
</source_dir_excludes>
<target_dir>target</target_dir>
<tm_dir>tm</tm_dir>
<glossary_dir>glossary</glossary_dir>
<glossary_file>.-glossary.txt</glossary_file>
<dictionary_dir>dictionary</dictionary_dir>
<export_tm_dir></export_tm_dir>
<export_tm_levels></export_tm_levels>
<source_lang>zh-CN</source_lang>
<target_lang>ja-JP</target_lang>
<source_tok>org.omegat.tokenizer.LuceneSmartChineseTokenizer</source_tok>
<target_tok>org.omegat.tokenizer.LuceneJapaneseTokenizer</target_tok>
<sentence_seg>true</sentence_seg>
<support_default_translations>true</support_default_translations>
<remove_tags>true</remove_tags>
<external_command></external_command>
</project>
</omegat>
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#Thu Nov 07 21:30:29 JST 2024
LAST_ENTRY_NUMBER=1
LAST_ENTRY_SRC=\u592A\u5E73\u5BFA\u4E2D\u7684\u6587\u7B14\u5854
LAST_ENTRY_FILE=source.txt
Empty file.
17 changes: 17 additions & 0 deletions test-acceptance/data/project_CN_JP/omegat/project_save.tmx
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?xml version='1.0' encoding='UTF-8'?>
<!DOCTYPE tmx SYSTEM "tmx11.dtd">
<tmx version="1.1">
<header creationtool="OmegaT" o-tmf="OmegaT TMX" adminlang="EN-US" datatype="plaintext" creationtoolversion="6.1.0_0_50ff299ad" segtype="sentence" srclang="zh-CN"/>
<body>
<!-- Default translations -->
<tu>
<tuv lang="zh-CN">
<seg>太平寺中的文笔塔</seg>
</tuv>
<tuv lang="ja-JP" changeid="Hiroshi Miura" changedate="20241107T122621Z" creationid="Hiroshi Miura" creationdate="20241107T122621Z">
<seg>太平寺の中心的なペン塔</seg>
</tuv>
</tu>
<!-- Alternative translations -->
</body>
</tmx>
5 changes: 5 additions & 0 deletions test-acceptance/data/project_CN_JP/source/source.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
太平寺中的文笔塔

文筆塔原是江苏省常州市太平寺中的塔。太平寺始建于南北朝齐梁时期,是常州最古老的佛寺之一,今已不存。
文笔塔为砖木结构,七级八面,每级4个拱门,中有旋梯。塔下有曲池、拱桥。
“夕照塔影”为文笔胜景。现存塔为光绪末年(1905-1908年)重建
Empty file.
Empty file.
Loading

0 comments on commit 20ffe61

Please sign in to comment.