Skip to content

Commit

Permalink
Merge pull request #35 from omergreen/omergreen-add-spaces-before-readings
Browse files Browse the repository at this point in the history

Add spaces before readings
  • Loading branch information
obynio authored Feb 12, 2024
2 parents 6d2fd25 + a888782 commit 0f43334
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 25 deletions.
13 changes: 8 additions & 5 deletions reading.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,14 +143,15 @@ def __init__(self, text: str, reading: Optional[str]):
self.text = text
self.reading = reading

def format(self, useRubyTags: bool) -> str:
def format(self, useRubyTags: bool, previous_character: str) -> str:
if self.reading is None:
return self.text

if useRubyTags:
return "<ruby>%s<rp>(</rp><rt>%s</rt><rp>)</rp></ruby>" % (self.text, self.reading)
return "<ruby>{}<rp>(</rp><rt>{}</rt><rp>)</rp></ruby>".format(self.text, self.reading)
else:
return '%s[%s]' % (self.text, self.reading)
add_space = previous_character is not None and previous_character != "]"
return '{}{}[{}]'.format(" " if add_space else "", self.text, self.reading)

class RegexDefinition:
def __init__(self, text: str, regexGroupIndex: Optional[int]):
Expand Down Expand Up @@ -205,7 +206,7 @@ def kanjiToRegex(kanji: str):
definitions.append(RegexDefinition(captureGroup, numCaptureGroups))
numCaptureGroups += 1

return ("^%s$" % ''.join(regexPieces), definitions)
return ("^{}$".format(str().join(regexPieces)), definitions)

class MecabController(object):

Expand Down Expand Up @@ -276,7 +277,9 @@ def reading(self, expr, ignoreNumbers = True, useRubyTags = False):
nodes.append(ReadingNode(definition.text, groupReading))

# Combine our nodes together into a single sentence
fin = ''.join(node.format(useRubyTags) for node in nodes)
fin = str()
for node in nodes:
fin += node.format(useRubyTags, fin[-1] if len(fin) > 0 else None)

# Finalize formatting
fin = fin.replace(ASCII_SPACE_TOKEN, ' ')
Expand Down
36 changes: 18 additions & 18 deletions test/test_reading.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class TestMecab(unittest.TestCase):
# sentence should have readings
def testNormalSentence(self):
res = reading.mecab.reading("カリン、自分でまいた種は自分で刈り取れ")
self.assertEqual(res, "カリン、自分[じぶん]でまいた種[たね]は自分[じぶん]で刈[か]り取[と]れ")
self.assertEqual(res, "カリン、 自分[じぶん]でまいた 種[たね]は 自分[じぶん]で 刈[か]り 取[と]れ")

# kanji should have a reading
def testNormalKanji(self):
Expand All @@ -34,12 +34,12 @@ def testNormalKanji(self):
# punctuation should be ignored
def testWithPunctuation(self):
res = reading.mecab.reading("昨日、林檎を2個買った。")
self.assertEqual(res, "昨日[きのう]、林檎[りんご]を2個[こ]買[か]った。")
self.assertEqual(res, "昨日[きのう]、 林檎[りんご]を2 個[こ]買[か]った。")

# unicode characters should be ignored
def testUnicodeChar(self):
res = reading.mecab.reading("真莉、大好きだよん^^")
self.assertEqual(res, "真[ま]莉、大好[だいす]きだよん^^")
self.assertEqual(res, "真[ま]莉、 大好[だいす]きだよん^^")

# katakana should not be given furigana readings
def testKatakana(self):
Expand All @@ -49,61 +49,61 @@ def testKatakana(self):
# romaji numbers (Arabic numerals) should not have readings
def testRomanjiNumbers(self):
res = reading.mecab.reading("彼2000万も使った。")
self.assertEqual(res, "彼[かれ]2000万[まん]も使[つか]った。")
self.assertEqual(res, "彼[かれ]2000 万[まん]も 使[つか]った。")

# kanji numbers should not have readings
def testKanjiNumber(self):
res = reading.mecab.reading("彼二千三百六十円も使った。")
self.assertEqual(res, "彼[かれ]二千[せん]三百[ひゃく]六十円[えん]も使[つか]った。")
self.assertEqual(res, "彼[かれ]二 千[せん]三 百[ひゃく]六十 円[えん]も 使[つか]った。")

# ensure that verbs with okurigana don't produce furigana for the kana portions
def testOkurigana(self):
self.assertEqual(reading.mecab.reading("口走る"), "口走[くちばし]る")
self.assertEqual(reading.mecab.reading("テスト勉強の息抜きとか どうしてんの"), "テスト勉強[べんきょう]の息抜[いきぬ]きとか どうしてんの")
self.assertEqual(reading.mecab.reading("テスト勉強の息抜きとか どうしてんの"), "テスト 勉強[べんきょう]の 息抜[いきぬ]きとか どうしてんの")

# ensure that when a single word has plain kana appearing before the kanji,
# the kana does not get furigana attached
def testKanaPrefixes(self):
self.assertEqual(reading.mecab.reading("お前"), "お前[まえ]")
self.assertEqual(reading.mecab.reading("ローマ字"), "ローマ字[じ]")
self.assertEqual(reading.mecab.reading("ローマ帝国"), "ローマ帝国[ていこく]")
self.assertEqual(reading.mecab.reading("お前"), "お 前[まえ]")
self.assertEqual(reading.mecab.reading("ローマ字"), "ローマ 字[じ]")
self.assertEqual(reading.mecab.reading("ローマ帝国"), "ローマ 帝国[ていこく]")

# ensure that a single word that both begins AND ends with kana but contains
# kanji in the middle only generates furigana for the kanji portion, and not
# for the kana
def testKanaPrefixSuffix(self):
actual = reading.mecab.reading("みじん切り")
self.assertEqual(actual, "みじん切[ぎ]り")
self.assertEqual(actual, "みじん 切[ぎ]り")

# ensure that for words that have kana in between two kanji, only the
# kanji receive furigana readings and the kana does not
def testKanaBetweenKanji(self):
self.assertEqual(reading.mecab.reading("書き込む"), "書[か]き込[こ]む")
self.assertEqual(reading.mecab.reading("走り抜く"), "走[はし]り抜[ぬ]く")
self.assertEqual(reading.mecab.reading("走り回る"), "走[はし]り回[まわ]る")
self.assertEqual(reading.mecab.reading("書き込む"), "書[か]き 込[こ]む")
self.assertEqual(reading.mecab.reading("走り抜く"), "走[はし]り 抜[ぬ]く")
self.assertEqual(reading.mecab.reading("走り回る"), "走[はし]り 回[まわ]る")

# ensure that any regular ASCII space characters (0x20) that are in the original
# string are found in the resultant string as well
def testSpacesRetained(self):
self.assertEqual(reading.mecab.reading("この文に 空白が あります"), "この文[ぶん]に 空白[くうはく]が あります")
self.assertEqual(reading.mecab.reading("この文に 空白が あります"), "この 文[ぶん]に 空白[くうはく]が あります")
self.assertEqual(reading.mecab.reading("hello world"), "hello world")

# some kana characters will have different readings when used in readings
# (such as ヶ月 being read as かげつ). ensure that we can detect and handle these
def testKanaWithAdditionalReadings(self):
# Check that ヵ (small) stands in for か (large) in readings
# This should generate furigana for the small ヵ
self.assertEqual(reading.mecab.reading("彼はトルコを2ヵ月間訪問するつもりです"), "彼[かれ]はトルコを2ヵ[か]月[げつ]間[かん]訪問[ほうもん]するつもりです")
self.assertEqual(reading.mecab.reading("彼はトルコを2ヵ月間訪問するつもりです"), "彼[かれ]はトルコを2 ヵ[か]月[げつ]間[かん]訪問[ほうもん]するつもりです")

# Check that ヶ *also* stands in for か in readings
# This should generate furigana for the small ヶ
self.assertEqual(reading.mecab.reading("彼はトルコを2ヶ月間訪問するつもりです"), "彼[かれ]はトルコを2ヶ[か]月[げつ]間[かん]訪問[ほうもん]するつもりです")
self.assertEqual(reading.mecab.reading("彼はトルコを2ヶ月間訪問するつもりです"), "彼[かれ]はトルコを2 ヶ[か]月[げつ]間[かん]訪問[ほうもん]するつもりです")

# For the same sentence, also make sure that the full-sized か and カ
# are also recognized.
# However, neither of these should generate furigana.
self.assertEqual(reading.mecab.reading("彼はトルコを2か月間訪問するつもりです"), "彼[かれ]はトルコを2か月[げつ]間[かん]訪問[ほうもん]するつもりです")
self.assertEqual(reading.mecab.reading("彼はトルコを2カ月間訪問するつもりです"), "彼[かれ]はトルコを2カ月[げつ]間[かん]訪問[ほうもん]するつもりです")
self.assertEqual(reading.mecab.reading("彼はトルコを2か月間訪問するつもりです"), "彼[かれ]はトルコを2か 月[げつ]間[かん]訪問[ほうもん]するつもりです")
self.assertEqual(reading.mecab.reading("彼はトルコを2カ月間訪問するつもりです"), "彼[かれ]はトルコを2カ 月[げつ]間[かん]訪問[ほうもん]するつもりです")

# Finally, ensure that we're not just ALWAYS adding furigana to ヶ and ヵ
# whenever we encounter them
Expand Down
4 changes: 2 additions & 2 deletions test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ def testEmptyString(self):

# ensure that bracket notation is correctly removed
def testRemovesBrackets(self):
self.assertEqual(utils.removeFurigana("日本語[にほんご]を勉強[べんきょう]する"), "日本語を勉強する")
self.assertEqual(utils.removeFurigana("走[はし]り込[こ]む"), "走り込む")
self.assertEqual(utils.removeFurigana("日本語[にほんご]を 勉強[べんきょう]する"), "日本語を勉強する")
self.assertEqual(utils.removeFurigana("走[はし]り 込[こ]む"), "走り込む")

# ensure that ruby tags are correctly removed
def testRemovesRuby(self):
Expand Down
4 changes: 4 additions & 0 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ def removeFurigana(text: str):
stripped = stripped.replace("<ruby>" + ruby + "</ruby>", body)

# Next, remove the bracket notation
# remove spaces only if bracket notation was used
if "[" in stripped:
stripped = stripped.replace(" ", "")

stripped, _ = re.subn('\[[^\]]*\]', '', stripped)

# Return the final string
Expand Down

0 comments on commit 0f43334

Please sign in to comment.