Merge pull request #35 from omergreen/omergreen-add-spaces-before-readings

Add spaces before readings
obynio · Feb 12, 2024 · 0f43334 · 0f43334
2 parents 6d2fd25 + a888782
commit 0f43334
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 25 deletions.
diff --git a/reading.py b/reading.py
@@ -143,14 +143,15 @@ def __init__(self, text: str, reading: Optional[str]):
         self.text = text
         self.reading = reading
 
-    def format(self, useRubyTags: bool) -> str:
+    def format(self, useRubyTags: bool, previous_character: str) -> str:
         if self.reading is None:
             return self.text
 
         if useRubyTags:
-            return "<ruby>%s<rp>(</rp><rt>%s</rt><rp>)</rp></ruby>" % (self.text, self.reading)
+            return "<ruby>{}<rp>(</rp><rt>{}</rt><rp>)</rp></ruby>".format(self.text, self.reading)
         else:
-            return '%s[%s]' % (self.text, self.reading)
+            add_space = previous_character is not None and previous_character != "]"
+            return '{}{}[{}]'.format(" " if add_space else "", self.text, self.reading)
 
 class RegexDefinition:
     def __init__(self, text: str, regexGroupIndex: Optional[int]):
@@ -205,7 +206,7 @@ def kanjiToRegex(kanji: str):
         definitions.append(RegexDefinition(captureGroup, numCaptureGroups))
         numCaptureGroups += 1
 
-    return ("^%s$" % ''.join(regexPieces), definitions)
+    return ("^{}$".format(str().join(regexPieces)), definitions)
 
 class MecabController(object):
 
@@ -276,7 +277,9 @@ def reading(self, expr, ignoreNumbers = True, useRubyTags = False):
                     nodes.append(ReadingNode(definition.text, groupReading))
 
         # Combine our nodes together into a single sentence
-        fin = ''.join(node.format(useRubyTags) for node in nodes)
+        fin = str()
+        for node in nodes:
+            fin += node.format(useRubyTags, fin[-1] if len(fin) > 0 else None)
 
         # Finalize formatting
         fin = fin.replace(ASCII_SPACE_TOKEN, ' ')

diff --git a/test/test_reading.py b/test/test_reading.py
@@ -24,7 +24,7 @@ class TestMecab(unittest.TestCase):
     # sentence should have readings
     def testNormalSentence(self):
         res = reading.mecab.reading("カリン、自分でまいた種は自分で刈り取れ")
-        self.assertEqual(res, "カリン、自分[じぶん]でまいた種[たね]は自分[じぶん]で刈[か]り取[と]れ")
+        self.assertEqual(res, "カリン、 自分[じぶん]でまいた 種[たね]は 自分[じぶん]で 刈[か]り 取[と]れ")
 
     # kanji should have a reading
     def testNormalKanji(self):
@@ -34,12 +34,12 @@ def testNormalKanji(self):
     # punctuation should be ignored
     def testWithPunctuation(self):
         res = reading.mecab.reading("昨日、林檎を2個買った。")
-        self.assertEqual(res, "昨日[きのう]、林檎[りんご]を2個[こ]買[か]った。")
+        self.assertEqual(res, "昨日[きのう]、 林檎[りんご]を2 個[こ]買[か]った。")
 
     # unicode characters should be ignored
     def testUnicodeChar(self):
         res = reading.mecab.reading("真莉、大好きだよん＾＾")
-        self.assertEqual(res, "真[ま]莉、大好[だいす]きだよん＾＾")
+        self.assertEqual(res, "真[ま]莉、 大好[だいす]きだよん＾＾")
 
     # katakana should not be given furigana readings
     def testKatakana(self):
@@ -49,61 +49,61 @@ def testKatakana(self):
     # romanji numbers should not have readings
     def testRomanjiNumbers(self):
         res = reading.mecab.reading("彼２０００万も使った。")
-        self.assertEqual(res, "彼[かれ]２０００万[まん]も使[つか]った。")
+        self.assertEqual(res, "彼[かれ]２０００ 万[まん]も 使[つか]った。")
 
     # kanji numbers should not have readings
     def testKanjiNumber(self):
         res = reading.mecab.reading("彼二千三百六十円も使った。")
-        self.assertEqual(res, "彼[かれ]二千[せん]三百[ひゃく]六十円[えん]も使[つか]った。")
+        self.assertEqual(res, "彼[かれ]二 千[せん]三 百[ひゃく]六十 円[えん]も 使[つか]った。")
 
     # ensure that verbs with okurigana don't produce furigana for the kana portions
     def testOkurigana(self):
         self.assertEqual(reading.mecab.reading("口走る"), "口走[くちばし]る")
-        self.assertEqual(reading.mecab.reading("テスト勉強の息抜きとか　どうしてんの"), "テスト勉強[べんきょう]の息抜[いきぬ]きとか　どうしてんの")
+        self.assertEqual(reading.mecab.reading("テスト勉強の息抜きとか　どうしてんの"), "テスト 勉強[べんきょう]の 息抜[いきぬ]きとか　どうしてんの")
 
     # ensure that a single word that has plain kana appearing before the kanji in
     # the word do not have attached furigana
     def testKanaPrefixes(self):
-        self.assertEqual(reading.mecab.reading("お前"), "お前[まえ]")
-        self.assertEqual(reading.mecab.reading("ローマ字"), "ローマ字[じ]")
-        self.assertEqual(reading.mecab.reading("ローマ帝国"), "ローマ帝国[ていこく]")
+        self.assertEqual(reading.mecab.reading("お前"), "お 前[まえ]")
+        self.assertEqual(reading.mecab.reading("ローマ字"), "ローマ 字[じ]")
+        self.assertEqual(reading.mecab.reading("ローマ帝国"), "ローマ 帝国[ていこく]")
 
     # ensure that a single word that both begins AND ends with kana but contains
     # kanji in the middle only generates furigana for the kanji portion, and not
     # for the kana
     def testKanaPrefixSuffix(self):
         actual = reading.mecab.reading("みじん切り")
-        self.assertEqual(actual, "みじん切[ぎ]り")
+        self.assertEqual(actual, "みじん 切[ぎ]り")
 
     # ensure that for words that have kana in between two kanji, that only the
     # kanji receive furigana readings and the kana does not
     def testKanaBetweenKanji(self):
-        self.assertEqual(reading.mecab.reading("書き込む"), "書[か]き込[こ]む")
-        self.assertEqual(reading.mecab.reading("走り抜く"), "走[はし]り抜[ぬ]く")
-        self.assertEqual(reading.mecab.reading("走り回る"), "走[はし]り回[まわ]る")
+        self.assertEqual(reading.mecab.reading("書き込む"), "書[か]き 込[こ]む")
+        self.assertEqual(reading.mecab.reading("走り抜く"), "走[はし]り 抜[ぬ]く")
+        self.assertEqual(reading.mecab.reading("走り回る"), "走[はし]り 回[まわ]る")
 
     # ensure that any regular ASCII space characters (0x20) that are in the original
     # string are found in the resultant string as well
     def testSpacesRetained(self):
-        self.assertEqual(reading.mecab.reading("この文に 空白が あります"), "この文[ぶん]に 空白[くうはく]が あります")
+        self.assertEqual(reading.mecab.reading("この文に 空白が あります"), "この 文[ぶん]に  空白[くうはく]が あります")
         self.assertEqual(reading.mecab.reading("hello world"), "hello world")
 
     # some kana characters will have different readings when used in readings
     # (such as ヶ月 being read as かげつ). ensure that we can detect and handle these
     def testKanaWithAdditionalReadings(self):
         # Check that ヵ (small) stands in for か (large) in readings
         # This should generate furigana for the small ヵ
-        self.assertEqual(reading.mecab.reading("彼はトルコを2ヵ月間訪問するつもりです"), "彼[かれ]はトルコを2ヵ[か]月[げつ]間[かん]訪問[ほうもん]するつもりです")
+        self.assertEqual(reading.mecab.reading("彼はトルコを2ヵ月間訪問するつもりです"), "彼[かれ]はトルコを2 ヵ[か]月[げつ]間[かん]訪問[ほうもん]するつもりです")
 
         # Check that ヶ *also* stands in for か in readings
         # This should generate furigana for the small ヶ
-        self.assertEqual(reading.mecab.reading("彼はトルコを2ヶ月間訪問するつもりです"), "彼[かれ]はトルコを2ヶ[か]月[げつ]間[かん]訪問[ほうもん]するつもりです")
+        self.assertEqual(reading.mecab.reading("彼はトルコを2ヶ月間訪問するつもりです"), "彼[かれ]はトルコを2 ヶ[か]月[げつ]間[かん]訪問[ほうもん]するつもりです")
 
         # For the same sentence, also make sure that the full-sized か and カ
         # are also recognized.
         # However, neither of these should generate furigana.
-        self.assertEqual(reading.mecab.reading("彼はトルコを2か月間訪問するつもりです"), "彼[かれ]はトルコを2か月[げつ]間[かん]訪問[ほうもん]するつもりです")
-        self.assertEqual(reading.mecab.reading("彼はトルコを2カ月間訪問するつもりです"), "彼[かれ]はトルコを2カ月[げつ]間[かん]訪問[ほうもん]するつもりです")
+        self.assertEqual(reading.mecab.reading("彼はトルコを2か月間訪問するつもりです"), "彼[かれ]はトルコを2か 月[げつ]間[かん]訪問[ほうもん]するつもりです")
+        self.assertEqual(reading.mecab.reading("彼はトルコを2カ月間訪問するつもりです"), "彼[かれ]はトルコを2カ 月[げつ]間[かん]訪問[ほうもん]するつもりです")
 
         # Finally, ensure that we're not just ALWAYS adding furigana to ヶ and ヵ
         # whenever we encounter them

diff --git a/test/test_utils.py b/test/test_utils.py
@@ -27,8 +27,8 @@ def testEmptyString(self):
 
     # ensure that bracket notation is correctly removed
     def testRemovesBrackets(self):
-        self.assertEqual(utils.removeFurigana("日本語[にほんご]を勉強[べんきょう]する"), "日本語を勉強する")
-        self.assertEqual(utils.removeFurigana("走[はし]り込[こ]む"), "走り込む")
+        self.assertEqual(utils.removeFurigana("日本語[にほんご]を 勉強[べんきょう]する"), "日本語を勉強する")
+        self.assertEqual(utils.removeFurigana("走[はし]り 込[こ]む"), "走り込む")
 
     # ensure that ruby tags are correctly removed
     def testRemovesRuby(self):

diff --git a/utils.py b/utils.py
@@ -35,6 +35,10 @@ def removeFurigana(text: str):
         stripped = stripped.replace("<ruby>" + ruby + "</ruby>", body)
 
     # Next, remove the bracket notation
+    # remove spaces only if bracket notation was used
+    if "[" in stripped:
+        stripped = stripped.replace(" ", "")
+
     stripped, _ = re.subn('\[[^\]]*\]', '', stripped)
 
     # Return the final string