From eaaeeea3857aeb107118b74f954e637c7c6b5cdf Mon Sep 17 00:00:00 2001 From: omergreen <71124454+omergreen@users.noreply.github.com> Date: Tue, 6 Feb 2024 00:35:23 +0200 Subject: [PATCH 1/5] Add spaces before readings --- reading.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/reading.py b/reading.py index 40e4371..647b208 100644 --- a/reading.py +++ b/reading.py @@ -143,14 +143,15 @@ def __init__(self, text: str, reading: Optional[str]): self.text = text self.reading = reading - def format(self, useRubyTags: bool) -> str: + def format(self, useRubyTags: bool, previous_character: str) -> str: if self.reading is None: return self.text if useRubyTags: return "%s(%s)" % (self.text, self.reading) else: - return '%s[%s]' % (self.text, self.reading) + add_space = previous_character is not None and isKana(previous_character) + return '%s%s[%s]' % (" " if add_space else "", self.text, self.reading) class RegexDefinition: def __init__(self, text: str, regexGroupIndex: Optional[int]): @@ -276,7 +277,9 @@ def reading(self, expr, ignoreNumbers = True, useRubyTags = False): nodes.append(ReadingNode(definition.text, groupReading)) # Combine our nodes together into a single sentece - fin = ''.join(node.format(useRubyTags) for node in nodes) + fin = '' + for node in nodes: + fin += node.format(useRubyTags, fin[-1] if len(fin) > 0 else None) # Finalize formatting fin = fin.replace(ASCII_SPACE_TOKEN, ' ') From d6a4cb84e293ca9d9688761d99ed8f03c9dd78a7 Mon Sep 17 00:00:00 2001 From: omergreen <71124454+omergreen@users.noreply.github.com> Date: Tue, 6 Feb 2024 00:47:57 +0200 Subject: [PATCH 2/5] add space before anything but ] --- reading.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reading.py b/reading.py index 647b208..5074537 100644 --- a/reading.py +++ b/reading.py @@ -150,7 +150,7 @@ def format(self, useRubyTags: bool, previous_character: str) -> str: if useRubyTags: return "%s(%s)" % (self.text, self.reading) else: - add_space = previous_character is not None and isKana(previous_character) + add_space = previous_character is not None and previous_character != "]" return '%s%s[%s]' % (" " if add_space else "", self.text, self.reading) class RegexDefinition: From a58460d74ff57d18e4761d4e5c39981881a14f79 Mon Sep 17 00:00:00 2001 From: omergreen <71124454+omergreen@users.noreply.github.com> Date: Sun, 11 Feb 2024 18:33:07 +0000 Subject: [PATCH 3/5] remove spaces when stripping bracket furigana --- utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utils.py b/utils.py index 66f3838..73c19d7 100644 --- a/utils.py +++ b/utils.py @@ -35,6 +35,10 @@ def removeFurigana(text: str): stripped = stripped.replace("" + ruby + "", body) # Next, remove the bracket notation + # remove spaces only if bracket notation was used + if "[" in stripped: + stripped = stripped.replace(" ", "") + stripped, _ = re.subn('\[[^\]]*\]', '', stripped) # Return the final string From c1fe7f6b11cdd036b47ce54579a8424b31efc28e Mon Sep 17 00:00:00 2001 From: omergreen <71124454+omergreen@users.noreply.github.com> Date: Sun, 11 Feb 2024 18:33:18 +0000 Subject: [PATCH 4/5] update tests to reflect new space changes --- test/test_reading.py | 36 ++++++++++++++++++------------------ test/test_utils.py | 4 ++-- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/test/test_reading.py b/test/test_reading.py index 2420206..80e09ea 100644 --- a/test/test_reading.py +++ b/test/test_reading.py @@ -24,7 +24,7 @@ class TestMecab(unittest.TestCase): # sentence should have readings def testNormalSentence(self): res = reading.mecab.reading("カリン、自分でまいた種は自分で刈り取れ") - self.assertEqual(res, "カリン、自分[じぶん]でまいた種[たね]は自分[じぶん]で刈[か]り取[と]れ") + self.assertEqual(res, "カリン、 自分[じぶん]でまいた 種[たね]は 自分[じぶん]で 刈[か]り 取[と]れ") # kanji should have a reading def testNormalKanji(self): @@ -34,12 +34,12 @@ def testNormalKanji(self): # punctuation should be ignored def testWithPunctuation(self): res = reading.mecab.reading("昨日、林檎を2個買った。") - self.assertEqual(res, "昨日[きのう]、林檎[りんご]を2個[こ]買[か]った。") + self.assertEqual(res, "昨日[きのう]、 林檎[りんご]を2 個[こ]買[か]った。") # unicode characters should be ignored def testUnicodeChar(self): res = reading.mecab.reading("真莉、大好きだよん^^") - self.assertEqual(res, "真[ま]莉、大好[だいす]きだよん^^") + self.assertEqual(res, "真[ま]莉、 大好[だいす]きだよん^^") # katakana should not be given furigana readings def testKatakana(self): @@ -49,43 +49,43 @@ def testKatakana(self): # romanji numbers should not have readings def testRomanjiNumbers(self): res = reading.mecab.reading("彼2000万も使った。") - self.assertEqual(res, "彼[かれ]2000万[まん]も使[つか]った。") + self.assertEqual(res, "彼[かれ]2000 万[まん]も 使[つか]った。") # kanji numbers should not have readings def testKanjiNumber(self): res = reading.mecab.reading("彼二千三百六十円も使った。") - self.assertEqual(res, "彼[かれ]二千[せん]三百[ひゃく]六十円[えん]も使[つか]った。") + self.assertEqual(res, "彼[かれ]二 千[せん]三 百[ひゃく]六十 円[えん]も 使[つか]った。") # ensure that verbs with okurigana don't produce furigana for the kana portions def testOkurigana(self): self.assertEqual(reading.mecab.reading("口走る"), "口走[くちばし]る") - self.assertEqual(reading.mecab.reading("テスト勉強の息抜きとか どうしてんの"), "テスト勉強[べんきょう]の息抜[いきぬ]きとか どうしてんの") + self.assertEqual(reading.mecab.reading("テスト勉強の息抜きとか どうしてんの"), "テスト 勉強[べんきょう]の 息抜[いきぬ]きとか どうしてんの") # ensure that a single word that has plain kana appearing before the kanji in # the word do not have attached furigana def testKanaPrefixes(self): - self.assertEqual(reading.mecab.reading("お前"), "お前[まえ]") - self.assertEqual(reading.mecab.reading("ローマ字"), "ローマ字[じ]") - self.assertEqual(reading.mecab.reading("ローマ帝国"), "ローマ帝国[ていこく]") + self.assertEqual(reading.mecab.reading("お前"), "お 前[まえ]") + self.assertEqual(reading.mecab.reading("ローマ字"), "ローマ 字[じ]") + self.assertEqual(reading.mecab.reading("ローマ帝国"), "ローマ 帝国[ていこく]") # ensure that a single word that both begins AND ends with kana but contains # kanji in the middle only generates furigana for the kanji portion, and not # for the kana def testKanaPrefixSuffix(self): actual = reading.mecab.reading("みじん切り") - self.assertEqual(actual, "みじん切[ぎ]り") + self.assertEqual(actual, "みじん 切[ぎ]り") # ensure that for words that have kana in between two kanji, that only the # kanji receive furigana readings and the kana does not def testKanaBetweenKanji(self): - self.assertEqual(reading.mecab.reading("書き込む"), "書[か]き込[こ]む") - self.assertEqual(reading.mecab.reading("走り抜く"), "走[はし]り抜[ぬ]く") - self.assertEqual(reading.mecab.reading("走り回る"), "走[はし]り回[まわ]る") + self.assertEqual(reading.mecab.reading("書き込む"), "書[か]き 込[こ]む") + self.assertEqual(reading.mecab.reading("走り抜く"), "走[はし]り 抜[ぬ]く") + self.assertEqual(reading.mecab.reading("走り回る"), "走[はし]り 回[まわ]る") # ensure that any regular ASCII space characters (0x20) that are in the original # string are found in the resultant string as well def testSpacesRetained(self): - self.assertEqual(reading.mecab.reading("この文に 空白が あります"), "この文[ぶん]に 空白[くうはく]が あります") + self.assertEqual(reading.mecab.reading("この文に 空白が あります"), "この 文[ぶん]に 空白[くうはく]が あります") self.assertEqual(reading.mecab.reading("hello world"), "hello world") # some kana characters will have different readings when used in readings @@ -93,17 +93,17 @@ def testSpacesRetained(self): def testKanaWithAdditionalReadings(self): # Check that ヵ (small) stands in for か (large) in readings # This should generate furigana for the small ヵ - self.assertEqual(reading.mecab.reading("彼はトルコを2ヵ月間訪問するつもりです"), "彼[かれ]はトルコを2ヵ[か]月[げつ]間[かん]訪問[ほうもん]するつもりです") + self.assertEqual(reading.mecab.reading("彼はトルコを2ヵ月間訪問するつもりです"), "彼[かれ]はトルコを2 ヵ[か]月[げつ]間[かん]訪問[ほうもん]するつもりです") # Check that ヶ *also* stands in for か in readings # This should generate furigana for the small ヶ - self.assertEqual(reading.mecab.reading("彼はトルコを2ヶ月間訪問するつもりです"), "彼[かれ]はトルコを2ヶ[か]月[げつ]間[かん]訪問[ほうもん]するつもりです") + self.assertEqual(reading.mecab.reading("彼はトルコを2ヶ月間訪問するつもりです"), "彼[かれ]はトルコを2 ヶ[か]月[げつ]間[かん]訪問[ほうもん]するつもりです") # For the same sentence, also make sure that the full-sized か and カ # are also recognized. # However, neither of these should generate furigana. - self.assertEqual(reading.mecab.reading("彼はトルコを2か月間訪問するつもりです"), "彼[かれ]はトルコを2か月[げつ]間[かん]訪問[ほうもん]するつもりです") - self.assertEqual(reading.mecab.reading("彼はトルコを2カ月間訪問するつもりです"), "彼[かれ]はトルコを2カ月[げつ]間[かん]訪問[ほうもん]するつもりです") + self.assertEqual(reading.mecab.reading("彼はトルコを2か月間訪問するつもりです"), "彼[かれ]はトルコを2か 月[げつ]間[かん]訪問[ほうもん]するつもりです") + self.assertEqual(reading.mecab.reading("彼はトルコを2カ月間訪問するつもりです"), "彼[かれ]はトルコを2カ 月[げつ]間[かん]訪問[ほうもん]するつもりです") # Finally, ensure that we're not just ALWAYS adding furigana to ヶ and ヵ # whenever we encounter them diff --git a/test/test_utils.py b/test/test_utils.py index 90e7d65..68a9f7d 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -27,8 +27,8 @@ def testEmptyString(self): # ensure that bracket notation is correctly removed def testRemovesBrackets(self): - self.assertEqual(utils.removeFurigana("日本語[にほんご]を勉強[べんきょう]する"), "日本語を勉強する") - self.assertEqual(utils.removeFurigana("走[はし]り込[こ]む"), "走り込む") + self.assertEqual(utils.removeFurigana("日本語[にほんご]を 勉強[べんきょう]する"), "日本語を勉強する") + self.assertEqual(utils.removeFurigana("走[はし]り 込[こ]む"), "走り込む") # ensure that ruby tags are correctly removed def testRemovesRuby(self): From a888782054ff78544da759bd2da4641e530969ca Mon Sep 17 00:00:00 2001 From: Yohann Leon Date: Mon, 12 Feb 2024 23:00:52 +0900 Subject: [PATCH 5/5] Use more modern str.format --- reading.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/reading.py b/reading.py index 5074537..b6d78be 100644 --- a/reading.py +++ b/reading.py @@ -148,10 +148,10 @@ def format(self, useRubyTags: bool, previous_character: str) -> str: return self.text if useRubyTags: - return "%s(%s)" % (self.text, self.reading) + return "{}({})".format(self.text, self.reading) else: add_space = previous_character is not None and previous_character != "]" - return '%s%s[%s]' % (" " if add_space else "", self.text, self.reading) + return '{}{}[{}]'.format(" " if add_space else "", self.text, self.reading) class RegexDefinition: def __init__(self, text: str, regexGroupIndex: Optional[int]): @@ -206,7 +206,7 @@ def kanjiToRegex(kanji: str): definitions.append(RegexDefinition(captureGroup, numCaptureGroups)) numCaptureGroups += 1 - return ("^%s$" % ''.join(regexPieces), definitions) + return ("^{}$".format(str().join(regexPieces)), definitions) class MecabController(object): @@ -277,7 +277,7 @@ def reading(self, expr, ignoreNumbers = True, useRubyTags = False): nodes.append(ReadingNode(definition.text, groupReading)) # Combine our nodes together into a single sentece - fin = '' + fin = str() for node in nodes: fin += node.format(useRubyTags, fin[-1] if len(fin) > 0 else None)