From eaaeeea3857aeb107118b74f954e637c7c6b5cdf Mon Sep 17 00:00:00 2001
From: omergreen <71124454+omergreen@users.noreply.github.com>
Date: Tue, 6 Feb 2024 00:35:23 +0200
Subject: [PATCH 1/5] Add spaces before readings
---
reading.py | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/reading.py b/reading.py
index 40e4371..647b208 100644
--- a/reading.py
+++ b/reading.py
@@ -143,14 +143,15 @@ def __init__(self, text: str, reading: Optional[str]):
self.text = text
self.reading = reading
- def format(self, useRubyTags: bool) -> str:
+ def format(self, useRubyTags: bool, previous_character: str) -> str:
if self.reading is None:
return self.text
if useRubyTags:
return "%s" % (self.text, self.reading)
else:
- return '%s[%s]' % (self.text, self.reading)
+ add_space = previous_character is not None and isKana(previous_character)
+ return '%s%s[%s]' % (" " if add_space else "", self.text, self.reading)
class RegexDefinition:
def __init__(self, text: str, regexGroupIndex: Optional[int]):
@@ -276,7 +277,9 @@ def reading(self, expr, ignoreNumbers = True, useRubyTags = False):
nodes.append(ReadingNode(definition.text, groupReading))
# Combine our nodes together into a single sentece
- fin = ''.join(node.format(useRubyTags) for node in nodes)
+ fin = ''
+ for node in nodes:
+ fin += node.format(useRubyTags, fin[-1] if len(fin) > 0 else None)
# Finalize formatting
fin = fin.replace(ASCII_SPACE_TOKEN, ' ')
From d6a4cb84e293ca9d9688761d99ed8f03c9dd78a7 Mon Sep 17 00:00:00 2001
From: omergreen <71124454+omergreen@users.noreply.github.com>
Date: Tue, 6 Feb 2024 00:47:57 +0200
Subject: [PATCH 2/5] Add a space before each bracketed reading unless the previous character is ]
---
reading.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/reading.py b/reading.py
index 647b208..5074537 100644
--- a/reading.py
+++ b/reading.py
@@ -150,7 +150,7 @@ def format(self, useRubyTags: bool, previous_character: str) -> str:
if useRubyTags:
return "%s" % (self.text, self.reading)
else:
- add_space = previous_character is not None and isKana(previous_character)
+ add_space = previous_character is not None and previous_character != "]"
return '%s%s[%s]' % (" " if add_space else "", self.text, self.reading)
class RegexDefinition:
From a58460d74ff57d18e4761d4e5c39981881a14f79 Mon Sep 17 00:00:00 2001
From: omergreen <71124454+omergreen@users.noreply.github.com>
Date: Sun, 11 Feb 2024 18:33:07 +0000
Subject: [PATCH 3/5] remove spaces when stripping bracket furigana
---
utils.py | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/utils.py b/utils.py
index 66f3838..73c19d7 100644
--- a/utils.py
+++ b/utils.py
@@ -35,6 +35,10 @@ def removeFurigana(text: str):
stripped = stripped.replace("" + ruby + "", body)
# Next, remove the bracket notation
+ # remove spaces only if bracket notation was used
+ if "[" in stripped:
+ stripped = stripped.replace(" ", "")
+
stripped, _ = re.subn('\[[^\]]*\]', '', stripped)
# Return the final string
From c1fe7f6b11cdd036b47ce54579a8424b31efc28e Mon Sep 17 00:00:00 2001
From: omergreen <71124454+omergreen@users.noreply.github.com>
Date: Sun, 11 Feb 2024 18:33:18 +0000
Subject: [PATCH 4/5] update tests to reflect new space changes
---
test/test_reading.py | 36 ++++++++++++++++++------------------
test/test_utils.py | 4 ++--
2 files changed, 20 insertions(+), 20 deletions(-)
diff --git a/test/test_reading.py b/test/test_reading.py
index 2420206..80e09ea 100644
--- a/test/test_reading.py
+++ b/test/test_reading.py
@@ -24,7 +24,7 @@ class TestMecab(unittest.TestCase):
# sentence should have readings
def testNormalSentence(self):
res = reading.mecab.reading("カリン、自分でまいた種は自分で刈り取れ")
- self.assertEqual(res, "カリン、自分[じぶん]でまいた種[たね]は自分[じぶん]で刈[か]り取[と]れ")
+ self.assertEqual(res, "カリン、 自分[じぶん]でまいた 種[たね]は 自分[じぶん]で 刈[か]り 取[と]れ")
# kanji should have a reading
def testNormalKanji(self):
@@ -34,12 +34,12 @@ def testNormalKanji(self):
# punctuation should be ignored
def testWithPunctuation(self):
res = reading.mecab.reading("昨日、林檎を2個買った。")
- self.assertEqual(res, "昨日[きのう]、林檎[りんご]を2個[こ]買[か]った。")
+ self.assertEqual(res, "昨日[きのう]、 林檎[りんご]を2 個[こ]買[か]った。")
# unicode characters should be ignored
def testUnicodeChar(self):
res = reading.mecab.reading("真莉、大好きだよん^^")
- self.assertEqual(res, "真[ま]莉、大好[だいす]きだよん^^")
+ self.assertEqual(res, "真[ま]莉、 大好[だいす]きだよん^^")
# katakana should not be given furigana readings
def testKatakana(self):
@@ -49,43 +49,43 @@ def testKatakana(self):
# romanji numbers should not have readings
def testRomanjiNumbers(self):
res = reading.mecab.reading("彼2000万も使った。")
- self.assertEqual(res, "彼[かれ]2000万[まん]も使[つか]った。")
+ self.assertEqual(res, "彼[かれ]2000 万[まん]も 使[つか]った。")
# kanji numbers should not have readings
def testKanjiNumber(self):
res = reading.mecab.reading("彼二千三百六十円も使った。")
- self.assertEqual(res, "彼[かれ]二千[せん]三百[ひゃく]六十円[えん]も使[つか]った。")
+ self.assertEqual(res, "彼[かれ]二 千[せん]三 百[ひゃく]六十 円[えん]も 使[つか]った。")
# ensure that verbs with okurigana don't produce furigana for the kana portions
def testOkurigana(self):
self.assertEqual(reading.mecab.reading("口走る"), "口走[くちばし]る")
- self.assertEqual(reading.mecab.reading("テスト勉強の息抜きとか どうしてんの"), "テスト勉強[べんきょう]の息抜[いきぬ]きとか どうしてんの")
+ self.assertEqual(reading.mecab.reading("テスト勉強の息抜きとか どうしてんの"), "テスト 勉強[べんきょう]の 息抜[いきぬ]きとか どうしてんの")
# ensure that a single word that has plain kana appearing before the kanji in
# the word do not have attached furigana
def testKanaPrefixes(self):
- self.assertEqual(reading.mecab.reading("お前"), "お前[まえ]")
- self.assertEqual(reading.mecab.reading("ローマ字"), "ローマ字[じ]")
- self.assertEqual(reading.mecab.reading("ローマ帝国"), "ローマ帝国[ていこく]")
+ self.assertEqual(reading.mecab.reading("お前"), "お 前[まえ]")
+ self.assertEqual(reading.mecab.reading("ローマ字"), "ローマ 字[じ]")
+ self.assertEqual(reading.mecab.reading("ローマ帝国"), "ローマ 帝国[ていこく]")
# ensure that a single word that both begins AND ends with kana but contains
# kanji in the middle only generates furigana for the kanji portion, and not
# for the kana
def testKanaPrefixSuffix(self):
actual = reading.mecab.reading("みじん切り")
- self.assertEqual(actual, "みじん切[ぎ]り")
+ self.assertEqual(actual, "みじん 切[ぎ]り")
# ensure that for words that have kana in between two kanji, that only the
# kanji receive furigana readings and the kana does not
def testKanaBetweenKanji(self):
- self.assertEqual(reading.mecab.reading("書き込む"), "書[か]き込[こ]む")
- self.assertEqual(reading.mecab.reading("走り抜く"), "走[はし]り抜[ぬ]く")
- self.assertEqual(reading.mecab.reading("走り回る"), "走[はし]り回[まわ]る")
+ self.assertEqual(reading.mecab.reading("書き込む"), "書[か]き 込[こ]む")
+ self.assertEqual(reading.mecab.reading("走り抜く"), "走[はし]り 抜[ぬ]く")
+ self.assertEqual(reading.mecab.reading("走り回る"), "走[はし]り 回[まわ]る")
# ensure that any regular ASCII space characters (0x20) that are in the original
# string are found in the resultant string as well
def testSpacesRetained(self):
- self.assertEqual(reading.mecab.reading("この文に 空白が あります"), "この文[ぶん]に 空白[くうはく]が あります")
+ self.assertEqual(reading.mecab.reading("この文に 空白が あります"), "この 文[ぶん]に 空白[くうはく]が あります")
self.assertEqual(reading.mecab.reading("hello world"), "hello world")
# some kana characters will have different readings when used in readings
@@ -93,17 +93,17 @@ def testSpacesRetained(self):
def testKanaWithAdditionalReadings(self):
# Check that ヵ (small) stands in for か (large) in readings
# This should generate furigana for the small ヵ
- self.assertEqual(reading.mecab.reading("彼はトルコを2ヵ月間訪問するつもりです"), "彼[かれ]はトルコを2ヵ[か]月[げつ]間[かん]訪問[ほうもん]するつもりです")
+ self.assertEqual(reading.mecab.reading("彼はトルコを2ヵ月間訪問するつもりです"), "彼[かれ]はトルコを2 ヵ[か]月[げつ]間[かん]訪問[ほうもん]するつもりです")
# Check that ヶ *also* stands in for か in readings
# This should generate furigana for the small ヶ
- self.assertEqual(reading.mecab.reading("彼はトルコを2ヶ月間訪問するつもりです"), "彼[かれ]はトルコを2ヶ[か]月[げつ]間[かん]訪問[ほうもん]するつもりです")
+ self.assertEqual(reading.mecab.reading("彼はトルコを2ヶ月間訪問するつもりです"), "彼[かれ]はトルコを2 ヶ[か]月[げつ]間[かん]訪問[ほうもん]するつもりです")
# For the same sentence, also make sure that the full-sized か and カ
# are also recognized.
# However, neither of these should generate furigana.
- self.assertEqual(reading.mecab.reading("彼はトルコを2か月間訪問するつもりです"), "彼[かれ]はトルコを2か月[げつ]間[かん]訪問[ほうもん]するつもりです")
- self.assertEqual(reading.mecab.reading("彼はトルコを2カ月間訪問するつもりです"), "彼[かれ]はトルコを2カ月[げつ]間[かん]訪問[ほうもん]するつもりです")
+ self.assertEqual(reading.mecab.reading("彼はトルコを2か月間訪問するつもりです"), "彼[かれ]はトルコを2か 月[げつ]間[かん]訪問[ほうもん]するつもりです")
+ self.assertEqual(reading.mecab.reading("彼はトルコを2カ月間訪問するつもりです"), "彼[かれ]はトルコを2カ 月[げつ]間[かん]訪問[ほうもん]するつもりです")
# Finally, ensure that we're not just ALWAYS adding furigana to ヶ and ヵ
# whenever we encounter them
diff --git a/test/test_utils.py b/test/test_utils.py
index 90e7d65..68a9f7d 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -27,8 +27,8 @@ def testEmptyString(self):
# ensure that bracket notation is correctly removed
def testRemovesBrackets(self):
- self.assertEqual(utils.removeFurigana("日本語[にほんご]を勉強[べんきょう]する"), "日本語を勉強する")
- self.assertEqual(utils.removeFurigana("走[はし]り込[こ]む"), "走り込む")
+ self.assertEqual(utils.removeFurigana("日本語[にほんご]を 勉強[べんきょう]する"), "日本語を勉強する")
+ self.assertEqual(utils.removeFurigana("走[はし]り 込[こ]む"), "走り込む")
# ensure that ruby tags are correctly removed
def testRemovesRuby(self):
From a888782054ff78544da759bd2da4641e530969ca Mon Sep 17 00:00:00 2001
From: Yohann Leon
Date: Mon, 12 Feb 2024 23:00:52 +0900
Subject: [PATCH 5/5] Use more modern str.format
---
reading.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/reading.py b/reading.py
index 5074537..b6d78be 100644
--- a/reading.py
+++ b/reading.py
@@ -148,10 +148,10 @@ def format(self, useRubyTags: bool, previous_character: str) -> str:
return self.text
if useRubyTags:
- return "%s" % (self.text, self.reading)
+ return "{}".format(self.text, self.reading)
else:
add_space = previous_character is not None and previous_character != "]"
- return '%s%s[%s]' % (" " if add_space else "", self.text, self.reading)
+ return '{}{}[{}]'.format(" " if add_space else "", self.text, self.reading)
class RegexDefinition:
def __init__(self, text: str, regexGroupIndex: Optional[int]):
@@ -206,7 +206,7 @@ def kanjiToRegex(kanji: str):
definitions.append(RegexDefinition(captureGroup, numCaptureGroups))
numCaptureGroups += 1
- return ("^%s$" % ''.join(regexPieces), definitions)
+ return ("^{}$".format(str().join(regexPieces)), definitions)
class MecabController(object):
@@ -277,7 +277,7 @@ def reading(self, expr, ignoreNumbers = True, useRubyTags = False):
nodes.append(ReadingNode(definition.text, groupReading))
# Combine our nodes together into a single sentece
- fin = ''
+ fin = str()
for node in nodes:
fin += node.format(useRubyTags, fin[-1] if len(fin) > 0 else None)