feat: EditorUtils uses document's locale for splitting words (#1175)

* feat: EditorUtils - Update javadoc for getWordEnd and getWordStart utility functions - Remove "TODO: change to use document's locale" - grab word with source and target locale of the project Signed-off-by: Hiroshi Miura <[email protected]> * chore: add test case in English first steps pane Signed-off-by: Hiroshi Miura <[email protected]> * chore: add task dependency for acceptance test Signed-off-by: Hiroshi Miura <[email protected]> * test: add the case - test EditorUtils.getWord* with loaded project from Chinese to Japanese in English environment Signed-off-by: Hiroshi Miura <[email protected]> * refactor: reduce duplicated code Signed-off-by: Hiroshi Miura <[email protected]> * fix: fix copyright header typo Signed-off-by: Hiroshi Miura <[email protected]> * feat: use ICU4J for BreakIterator to support CJ - Add unit test for getBoundary method with English, Japanese and Chinese Signed-off-by: Hiroshi Miura <[email protected]> * refactor: give locale by callers of EditorUtils Signed-off-by: Hiroshi Miura <[email protected]> * docs: javadoc of EditorUtils - Update javadoc - @deprecated for methods with old signature - Simplify some lines Signed-off-by: Hiroshi Miura <[email protected]> * chore: bump [email protected] Signed-off-by: Hiroshi Miura <[email protected]> * chore: fix typo in dependency Signed-off-by: Hiroshi Miura <[email protected]> * chore: fix degraded merge for dependencies Signed-off-by: Hiroshi Miura <[email protected]> --------- Signed-off-by: Hiroshi Miura <[email protected]>
omegat-org · Dec 12, 2024 · 20ffe61 · 20ffe61
1 parent fb6451f
commit 20ffe61
Show file tree

Hide file tree

Showing 20 changed files with 389 additions and 31 deletions.
diff --git a/build.gradle b/build.gradle
@@ -300,6 +300,7 @@ dependencies {
         }
         runtimeOnly(libs.language.detector)
         runtimeOnly(libs.dumont.hunspell)
+        implementation(libs.icu4j)
 
         // Lucene for tokenizers
         implementation(libs.bundles.lucene)
@@ -1702,7 +1703,7 @@ tasks.register('testAcceptance', Test) {
     classpath = sourceSets.testAcceptance.runtimeClasspath
     systemProperties = System.properties
     systemProperty 'java.util.logging.config.file', "${rootDir}/config/test/logger.properties"
-
+    dependsOn firstStepsEn
     dependsOn ':aligner:jar'
 }
 

diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml
@@ -8,7 +8,7 @@ commons_io = "2.16.1"
 commons_text = "1.11.0"
 commons_validator = "1.9.0"
 jsoup = "1.18.2"
-icu4j = { require = "[70,73.2[", prefer = "72.1" }
+icu4j = { require = "[71.1,76.1[", prefer = "74.2" }
 stax2api = "4.2.2"
 woodstox = "6.5.0"
 languagetool = "6.1"
@@ -65,7 +65,7 @@ commons-lang3 = {group = "org.apache.commons", name = "commons-lang3", version.r
 commons-text = {group = "org.apache.commons", name = "commons-text", version.ref = "commons_text"}
 commons-validator = {group = "commons-validator", name = "commons-validator", version.ref = "commons_validator"}
 jsoup = {group = "org.jsoup", name = "jsoup", version.ref = "jsoup"}
-icj4j = {group = "com.ibm.icu", name = "icu4j", version.ref = "icu4j"}
+icu4j = {group = "com.ibm.icu", name = "icu4j", version.ref = "icu4j"}
 stax2-api = {group = "org.codehaus.woodstox", name = "stax2-api", version.ref = "stax2api"}
 woodstox-core = {group = "com.fasterxml.woodstox", name = "woodstox-core", version.ref = "woodstox"}
 languagetool-all = {group = "org.languagetool", name = "language-all", version.ref = "languagetool"}

diff --git a/language-modules/ja/build.gradle b/language-modules/ja/build.gradle
@@ -25,7 +25,7 @@ dependencies {
             exclude module: 'icu4j'
         }
         implementation(dependencies.variantOf(libs.lucene.gosen) { classifier("ipadic") })
-        implementation(libs.icj4j)
+        compileOnly(libs.icu4j)
     }
 
     testImplementation(libs.junit4)
@@ -43,7 +43,7 @@ dependencies {
         exclude module: 'icu4j'
     }
     testRuntimeOnly(dependencies.variantOf(libs.lucene.gosen) { classifier("ipadic") })
-    testRuntimeOnly(libs.icj4j)
+    testRuntimeOnly(libs.icu4j)
 
     testImplementation(libs.assertj)
     testImplementation(testFixtures(project.rootProject))

diff --git a/src/org/omegat/gui/editor/EditorController.java b/src/org/omegat/gui/editor/EditorController.java
@@ -704,9 +704,12 @@ protected void loadDocument() {
 
         doc.setDocumentFilter(new DocumentFilter3());
 
-        // add locate for target language to editor
+        // add locales to editor
         Locale targetLocale = Core.getProject().getProjectProperties().getTargetLanguage().getLocale();
         editor.setLocale(targetLocale);
+        editor.setTargetLocale(targetLocale);
+        Locale sourceLocale = Core.getProject().getProjectProperties().getSourceLanguage().getLocale();
+        editor.setSourceLocale(sourceLocale);
 
         editor.setDocument(doc);
 
@@ -1639,8 +1642,9 @@ public void changeCase(CHANGE_CASE_TO toWhat) {
         try {
             // no selection? make it the current word
             if (start == end) {
-                start = EditorUtils.getWordStart(editor, start);
-                end = EditorUtils.getWordEnd(editor, end);
+                Locale locale = Core.getProject().getProjectProperties().getTargetLanguage().getLocale();
+                start = EditorUtils.getWordStart(editor, start, locale);
+                end = EditorUtils.getWordEnd(editor, end, locale);
 
                 // adjust the bound again
                 if (start < translationStart && end <= translationEnd) {
@@ -1947,6 +1951,7 @@ private void createAdditionalPanes() {
                     .setComponentOrientation(BiDiUtils.isRtl(language) ? ComponentOrientation.RIGHT_TO_LEFT
                             : ComponentOrientation.LEFT_TO_RIGHT);
             introPane.setEditable(false);
+            introPane.setName("IntroPane");
             DragTargetOverlay.apply(introPane, dropInfo);
             URI uri = Help.getHelpFileURI(OConsts.HELP_FIRST_STEPS_PREFIX, language, OConsts.HELP_FIRST_STEPS);
             if (uri != null) {
@@ -1958,6 +1963,7 @@ private void createAdditionalPanes() {
         emptyProjectPaneTitle = OStrings.getString("TF_INTRO_EMPTYPROJECT_FILENAME");
         emptyProjectPane = new JTextPane();
         emptyProjectPane.setEditable(false);
+        emptyProjectPane.setName("EmptyProjectPane");
         emptyProjectPane.setText(OStrings.getString("TF_INTRO_EMPTYPROJECT"));
         emptyProjectPane.setFont(mw.getApplicationFont());
         DragTargetOverlay.apply(emptyProjectPane, dropInfo);

diff --git a/src/org/omegat/gui/editor/EditorTextArea3.java b/src/org/omegat/gui/editor/EditorTextArea3.java
@@ -42,6 +42,7 @@
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
+import java.util.Locale;
 
 import javax.swing.JEditorPane;
 import javax.swing.JPopupMenu;
@@ -141,6 +142,9 @@ public class EditorTextArea3 extends JEditorPane {
      */
     protected boolean overtypeMode = false;
 
+    private Locale targetLocale;
+    private Locale sourceLocale;
+
     public EditorTextArea3(EditorController controller) {
         this.controller = controller;
         setEditorKit(new StyledEditorKit() {
@@ -165,10 +169,16 @@ protected void createInputAttributes(Element element, MutableAttributeSet set) {
         c.setBlinkRate(getCaret().getBlinkRate());
         setCaret(c);
 
+        sourceLocale = getLocale();
+        targetLocale = getLocale();
+
         addCaretListener(e -> {
             try {
-                int start = EditorUtils.getWordStart(EditorTextArea3.this, e.getMark());
-                int end = EditorUtils.getWordEnd(EditorTextArea3.this, e.getMark());
+                // Detection of target string locale.
+                // It uses a source or a target language as a processing locale.
+                Locale locale = isInActiveTranslation(e.getMark()) ? targetLocale : sourceLocale;
+                int start = EditorUtils.getWordStart(EditorTextArea3.this, e.getMark(), locale);
+                int end = EditorUtils.getWordEnd(EditorTextArea3.this, e.getMark(), locale);
                 if (end - start <= 0) {
                     // word not defined
                     return;
@@ -200,6 +210,14 @@ public void setFont(Font font) {
         }
     }
 
+    /**
+     * Sets the locale that the caret listener uses to find word boundaries
+     * when the caret mark is inside the active translation.
+     *
+     * @param targetLocale
+     *            locale to use for text in the active translation.
+     */
+    void setTargetLocale(Locale targetLocale) {
+        this.targetLocale = targetLocale;
+    }
+
+    /**
+     * Sets the locale that the caret listener uses to find word boundaries
+     * when the caret mark is outside the active translation.
+     *
+     * @param sourceLocale
+     *            locale to use for text outside the active translation.
+     */
+    void setSourceLocale(Locale sourceLocale) {
+        this.sourceLocale = sourceLocale;
+    }
+
     /**
      * Return OmDocument instead just a Document. If editor was not initialized
      * with OmDocument, it will contains other Document implementation. In this

diff --git a/src/org/omegat/gui/editor/EditorUtils.java b/src/org/omegat/gui/editor/EditorUtils.java
@@ -31,9 +31,13 @@
 import java.util.Locale;
 
 import javax.swing.text.BadLocationException;
+import javax.swing.text.Document;
+import javax.swing.text.Element;
 import javax.swing.text.JTextComponent;
 import javax.swing.text.Utilities;
 
+import com.ibm.icu.text.BreakIterator;
+
 import org.omegat.core.Core;
 import org.omegat.core.data.ProtectedPart;
 import org.omegat.core.data.SourceTextEntry;
@@ -61,15 +65,30 @@ private EditorUtils() {
      * Determines the start of a word for the given model location. This method
      * skips direction char.
      *
-     * TODO: change to use document's locale
-     *
-     * @param c
-     * @param offs
-     * @return
+     * @param c TextComponent of the editor area.
+     * @param offs offset of the text.
+     * @return position of word start on the text component.
      * @throws BadLocationException
+     *         when there is no line found in the text component.
      */
+    @Deprecated
     public static int getWordStart(JTextComponent c, int offs) throws BadLocationException {
-        int result = Utilities.getWordStart(c, offs);
+        return getWordStart(c, offs, c.getLocale());
+    }
+
+    /**
+     * Determines the start of a word for the given model location. This method
+     * skips direction char.
+     *
+     * @param c TextComponent of the editor area.
+     * @param offs offset of the text.
+     * @param locale locale of the text.
+     * @return position of word start on the text component.
+     * @throws BadLocationException
+     *         when there is no line found in the text component.
+     */
+    public static int getWordStart(JTextComponent c, int offs, Locale locale) throws BadLocationException {
+        int result = getWordBoundary(c, offs, locale, false);
         char ch = c.getDocument().getText(result, 1).charAt(0);
         if (isDirectionChar(ch)) {
             result++;
@@ -81,15 +100,30 @@ public static int getWordStart(JTextComponent c, int offs) throws BadLocationExc
      * Determines the end of a word for the given model location. This method
      * skips direction char.
      *
-     * TODO: change to use document's locale
-     *
-     * @param c
-     * @param offs
-     * @return
+     * @param c TextComponent of the editor area.
+     * @param offs offset of the text.
+     * @return position of the word end on the text component.
      * @throws BadLocationException
+     *         when there is no line found in the text component.
      */
+    @Deprecated
     public static int getWordEnd(JTextComponent c, int offs) throws BadLocationException {
-        int result = Utilities.getWordEnd(c, offs);
+        return getWordEnd(c, offs, c.getLocale());
+    }
+
+    /**
+     * Determines the end of a word for the given model location. This method
+     * skips direction char.
+     *
+     * @param c TextComponent of the editor area.
+     * @param offs offset of the text.
+     * @param locale locale of the text.
+     * @return position of the word end on the text component.
+     * @throws BadLocationException
+     *         when there is no line found in the text component.
+     */
+    public static int getWordEnd(JTextComponent c, int offs, Locale locale) throws BadLocationException {
+        int result = getWordBoundary(c, offs, locale, true);
         if (result > 0) {
             char ch = c.getDocument().getText(result - 1, 1).charAt(0);
             if (isDirectionChar(ch)) {
@@ -99,6 +133,46 @@ public static int getWordEnd(JTextComponent c, int offs) throws BadLocationExcep
         return result;
     }
 
+    /**
+     * Locates a word boundary around a document offset.
+     * <p>
+     * The paragraph element containing the offset is treated as the line.
+     * Its text, limited to the document length, is passed to the string-based
+     * overload, and the index it returns is converted back into a document
+     * offset. When the line holds no text, the offset is returned unchanged.
+     *
+     * @param c TextComponent of the editor area.
+     * @param offs offset of the text.
+     * @param locale locale of the text.
+     * @param end true to get the end of the word, false to get its start.
+     * @return document offset of the word boundary.
+     * @throws BadLocationException
+     *         when there is no paragraph element found for the offset.
+     */
+    private static int getWordBoundary(JTextComponent c, int offs, Locale locale, boolean end)
+            throws BadLocationException {
+        Element paragraph = Utilities.getParagraphElement(c, offs);
+        if (paragraph == null) {
+            throw new BadLocationException("No word at " + offs, offs);
+        }
+        Document document = c.getDocument();
+        int begin = paragraph.getStartOffset();
+        int length = Math.min(paragraph.getEndOffset(), document.getLength()) - begin;
+        if (length <= 0) {
+            // Nothing to split; keep the caller's offset.
+            return offs;
+        }
+        String text = document.getText(begin, length);
+        return begin + getWordBoundary(locale, text, offs - begin, end);
+    }
+
+    /**
+     * Finds the boundary of the word that contains a position in a line.
+     * <p>
+     * When the end argument is true, returns the end of the word.
+     * Otherwise, returns the start of the word. Boundaries are computed by
+     * an ICU4J word {@link BreakIterator} for the given locale.
+     * <p>
+     * A position outside the line is clamped to the nearest character of the
+     * line, and an empty line yields 0, so the result is always a valid index
+     * and never {@link BreakIterator#DONE}.
+     *
+     * @param locale locale of the line string.
+     * @param lineString a string of the line.
+     * @param wordPosition target position in the line.
+     * @param end true to return the end of the word, false for its start.
+     * @return index of the word boundary within the line string.
+     */
+    static int getWordBoundary(Locale locale, String lineString, int wordPosition, boolean end) {
+        if (lineString.isEmpty()) {
+            // No word exists; the only boundary is the start of the text.
+            return 0;
+        }
+        BreakIterator words = BreakIterator.getWordInstance(locale);
+        words.setText(lineString);
+        // Clamp into [0, last - 1] so that following() always finds a boundary
+        // after the position and previous() never runs off the start.
+        int position = Math.max(0, Math.min(wordPosition, words.last() - 1));
+        int wordEnd = words.following(position);
+        return end ? wordEnd : words.previous();
+    }
+
     /**
      * Check if char is direction char(u202A,u202B,u202C).
      *
@@ -420,7 +494,7 @@ public static String addBidiAroundTags(String text, SourceTextEntry ste) {
         StringBuilder s = new StringBuilder(text.length() * 12 / 10);
         for (Tag t : tags) {
             if (pos < t.pos) {
-                s.append(text.substring(pos, t.pos));
+                s.append(text, pos, t.pos);
             }
             s.append(SegmentBuilder.BIDI_RLM_CHAR);
             s.append(SegmentBuilder.BIDI_LRM_CHAR);
@@ -437,11 +511,8 @@ public static String addBidiAroundTags(String text, SourceTextEntry ste) {
 
     public static boolean hasBidiAroundTag(String text, String tag, int pos) {
         try {
-            boolean has = true;
-            if (text.charAt(pos - 1) != SegmentBuilder.BIDI_LRM_CHAR
-                    || text.charAt(pos - 2) != SegmentBuilder.BIDI_RLM_CHAR) {
-                has = false;
-            }
+            boolean has = text.charAt(pos - 1) == SegmentBuilder.BIDI_LRM_CHAR
+                    && text.charAt(pos - 2) == SegmentBuilder.BIDI_RLM_CHAR;
             if (text.charAt(pos + tag.length()) != SegmentBuilder.BIDI_LRM_CHAR
                     || text.charAt(pos + tag.length() + 1) != SegmentBuilder.BIDI_RLM_CHAR) {
                 has = false;

diff --git a/test-acceptance/data/project_CN_JP/.gitignore b/test-acceptance/data/project_CN_JP/.gitignore
@@ -0,0 +1,3 @@
+project_stats.txt
+project_stats.json
+*.bak
diff --git a/test-acceptance/data/project_CN_JP/dictionary/.keep b/test-acceptance/data/project_CN_JP/dictionary/.keep
diff --git a/test-acceptance/data/project_CN_JP/glossary/glossary.txt b/test-acceptance/data/project_CN_JP/glossary/glossary.txt
@@ -0,0 +1,3 @@
+# Glossary in tab-separated format -*- coding: utf-8 -*-
+介绍      紹介
+中的      中心的な
diff --git a/test-acceptance/data/project_CN_JP/omegat.project b/test-acceptance/data/project_CN_JP/omegat.project
@@ -0,0 +1,33 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<omegat>
+  <project version="1.0">
+    <source_dir>source</source_dir>
+    <source_dir_excludes>
+      <mask>**/.svn/**</mask>
+      <mask>**/CVS/**</mask>
+      <mask>**/.cvs/**</mask>
+      <mask>**/.git/**</mask>
+      <mask>**/.hg/**</mask>
+      <mask>**/.repositories/**</mask>
+      <mask>**/desktop.ini</mask>
+      <mask>**/Thumbs.db</mask>
+      <mask>**/.DS_Store</mask>
+      <mask>**/~$*</mask>
+    </source_dir_excludes>
+    <target_dir>target</target_dir>
+    <tm_dir>tm</tm_dir>
+    <glossary_dir>glossary</glossary_dir>
+    <glossary_file>.-glossary.txt</glossary_file>
+    <dictionary_dir>dictionary</dictionary_dir>
+    <export_tm_dir></export_tm_dir>
+    <export_tm_levels></export_tm_levels>
+    <source_lang>zh-CN</source_lang>
+    <target_lang>ja-JP</target_lang>
+    <source_tok>org.omegat.tokenizer.LuceneSmartChineseTokenizer</source_tok>
+    <target_tok>org.omegat.tokenizer.LuceneJapaneseTokenizer</target_tok>
+    <sentence_seg>true</sentence_seg>
+    <support_default_translations>true</support_default_translations>
+    <remove_tags>true</remove_tags>
+    <external_command></external_command>
+  </project>
+</omegat>
diff --git a/test-acceptance/data/project_CN_JP/omegat/ignored_words.txt b/test-acceptance/data/project_CN_JP/omegat/ignored_words.txt
diff --git a/test-acceptance/data/project_CN_JP/omegat/last_entry.properties b/test-acceptance/data/project_CN_JP/omegat/last_entry.properties
@@ -0,0 +1,4 @@
+#Thu Nov 07 21:30:29 JST 2024
+LAST_ENTRY_NUMBER=1
+LAST_ENTRY_SRC=\u592A\u5E73\u5BFA\u4E2D\u7684\u6587\u7B14\u5854
+LAST_ENTRY_FILE=source.txt
diff --git a/test-acceptance/data/project_CN_JP/omegat/learned_words.txt b/test-acceptance/data/project_CN_JP/omegat/learned_words.txt
diff --git a/test-acceptance/data/project_CN_JP/omegat/project_save.tmx b/test-acceptance/data/project_CN_JP/omegat/project_save.tmx
@@ -0,0 +1,17 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<!DOCTYPE tmx SYSTEM "tmx11.dtd">
+<tmx version="1.1">
+  <header creationtool="OmegaT" o-tmf="OmegaT TMX" adminlang="EN-US" datatype="plaintext" creationtoolversion="6.1.0_0_50ff299ad" segtype="sentence" srclang="zh-CN"/>
+  <body>
+<!-- Default translations -->
+    <tu>
+      <tuv lang="zh-CN">
+        <seg>太平寺中的文笔塔</seg>
+      </tuv>
+      <tuv lang="ja-JP" changeid="Hiroshi Miura" changedate="20241107T122621Z" creationid="Hiroshi Miura" creationdate="20241107T122621Z">
+        <seg>太平寺の中心的なペン塔</seg>
+      </tuv>
+    </tu>
+<!-- Alternative translations -->
+  </body>
+</tmx>
diff --git a/test-acceptance/data/project_CN_JP/source/source.txt b/test-acceptance/data/project_CN_JP/source/source.txt
@@ -0,0 +1,5 @@
+太平寺中的文笔塔
+
+文筆塔原是江苏省常州市太平寺中的塔。太平寺始建于南北朝齐梁时期，是常州最古老的佛寺之一，今已不存。
+文笔塔为砖木结构，七级八面，每级4个拱门，中有旋梯。塔下有曲池、拱桥。
+“夕照塔影”为文笔胜景。现存塔为光绪末年（1905-1908年）重建
diff --git a/test-acceptance/data/project_CN_JP/target/.keep b/test-acceptance/data/project_CN_JP/target/.keep
diff --git a/test-acceptance/data/project_CN_JP/tm/.keep b/test-acceptance/data/project_CN_JP/tm/.keep