add RI35 system for Japanese

cakimpei · Jul 20, 2022 · f7b1c3a · f7b1c3a
1 parent 55dd92d
commit f7b1c3a
Show file tree

Hide file tree

Showing 5 changed files with 180 additions and 26 deletions.
diff --git a/README.md b/README.md
@@ -6,9 +6,9 @@ Wunsen transliterates/transcribes from other languages into Thai.
 
 Currently supports:
 
-- Standard Chinese (from Hanyu Pinyin)
 - Japanese (from Hepburn romanization)
 - Korean (from Revised Romanization)
+- Mandarin (from Hanyu Pinyin)
 - Vietnamese (Latin script)
 
 Demo [here](https://wunsen.herokuapp.com/).
@@ -30,15 +30,6 @@ pip install wunsen
 ```python
 from wunsen import ThapSap
 
-# Chinese (Pinyin with tone diacritics is not supported yet.)
-thap_zh = ThapSap('zh', system='RI49')
-thap_zh.thap('ni3 hao3')
-# => 'หนี ห่าว'
-
-thap_zh = ThapSap('zh', system='THC43')
-thap_zh.thap('ni3 hao3')
-# => 'หนี เห่า'
-
 # Japanese
 thap_ja = ThapSap('ja')
 thap_ja.thap('ohayō')
@@ -54,6 +45,15 @@ thap_ko = ThapSap('ko')
 thap_ko.thap('annyeonghaseyo')
 # => 'อันนย็องฮาเซโย'
 
+# Mandarin (Pinyin with tone diacritics is not supported yet.)
+thap_zh = ThapSap('zh', system='RI49')
+thap_zh.thap('ni3 hao3')
+# => 'หนี ห่าว'
+
+thap_zh = ThapSap('zh', system='THC43')
+thap_zh.thap('ni3 hao3')
+# => 'หนี เห่า'
+
 # Vietnamese
 thap_vi = ThapSap('vi')
 thap_vi.thap('xin chào')
@@ -64,11 +64,13 @@ thap_vi.thap('xin chào')
 
 Wunsen's output may differ from the result intended by the actual transliteration system, so please review the results.
 
-- Chinese =>
+- Japanese =>
+    - หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น (สำนักงานราชบัณฑิตยสภา พ.ศ. 2561) 'ORS61'
+    - หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น (ราชบัณฑิตยสถาน พ.ศ. 2535) 'RI35'
+- Korean => หลักเกณฑ์การทับศัพท์ภาษาเกาหลี (ราชบัณฑิตยสถาน พ.ศ. 2555) 'RI55'
+- Mandarin =>
     - หลักเกณฑ์การทับศัพท์ภาษาจีน (ราชบัณฑิตยสถาน พ.ศ. 2549) 'RI49'
     - เกณฑ์การถ่ายทอดเสียงภาษาจีนแมนดารินด้วยอักขรวิธีไทย (คณะกรรมการสืบค้นประวัติศาสตร์ไทยในเอกสารภาษาจีน พ.ศ. 2543) 'THC43'
-- Japanese => หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น (สำนักงานราชบัณฑิตยสภา พ.ศ. 2561) 'ORS61'
-- Korean => หลักเกณฑ์การทับศัพท์ภาษาเกาหลี (ราชบัณฑิตยสถาน พ.ศ. 2555) 'RI55'
 - Vietnamese => หลักเกณฑ์การทับศัพท์ภาษาเวียดนาม (ราชบัณฑิตยสถาน พ.ศ. 2555) 'RI55'
 
 Wunsen is not affiliated with proposers of the above transliteration systems.

diff --git a/src/wunsen/japanese/ja_mapping.py b/src/wunsen/japanese/ja_mapping.py
@@ -58,8 +58,7 @@
     }
 }
 
-ORS61_NO_DIACRITIC = deepcopy(ORS61)
-ORS61_NO_DIACRITIC['vowel'].update({
+NO_DIACRITIC = {
     'aa': 'อา',
     'ee': 'เอ',
     'ii': 'อี',
@@ -70,4 +69,23 @@
     'yoo': 'เอียว',
     'you': 'เอียว',
     'yuu': 'อีว'
-})
+}
+
+ORS61_NO_DIACRITIC = deepcopy(ORS61)
+ORS61_NO_DIACRITIC['vowel'].update(NO_DIACRITIC)
+
+RI35 = deepcopy(ORS61)
+
+RI35['onset'].update({
+    'ch': ['ช', 'ช']
+})
+del RI35['vowel']['ai']
+RI35['vowel'].update({
+    'ya': 'เอียะ'
+})
+RI35['coda'].update({
+    's': 'ส'
+})
+
+RI35_NO_DIACRITIC = deepcopy(RI35)
+RI35_NO_DIACRITIC['vowel'].update(NO_DIACRITIC)
diff --git a/src/wunsen/japanese/ja_thapsap.py b/src/wunsen/japanese/ja_thapsap.py
@@ -6,9 +6,12 @@
 
 from wunsen.splitutils.splitter import SplitSyl, NotInDict
 from wunsen.splitutils.exception import NotAvailableSystem
-from .ja_mapping import ORS61, ORS61_NO_DIACRITIC
+from .ja_mapping import ORS61, ORS61_NO_DIACRITIC, RI35, RI35_NO_DIACRITIC
 
-class JaRs61:
+class JaOrs61:
+
+    coda_ng = ['g', 'k', 'h', 'f', 'w', 'y']
+    coda_m = ['b', 'm', 'p']
 
     def __init__(self, input: str) -> None:
         self.ja_dict = ORS61
@@ -51,6 +54,7 @@ def adapt_split(
                     and new_syl[2] == ''
                     and old_syl[index+1][0] == 'ts'):
                 new_syl[2] = 't'
+            # no coda for these vowels
             if (new_syl[1] in ['yo', 'yoo', 'you', 'yu', 'yuu', 'yō', 'yū']
                     and new_syl[2] != ''):
                 new_syl[2] = ''
@@ -77,11 +81,14 @@ def select_vowel(self, index: int, syl: List[str]) -> str:
                 vowel = 'อือ'
         else:
             vowel = self.ja_dict['vowel'][syl[1]]
-            if (syl[1] in ['a', 'e', 'o', 'u']
-                    and syl[2] == ''
-                    and index != self.last_syl_index):
-                self.pref.update({'vowel_length': 'long'})
+            self.shorten_vowel(index, syl)
         return vowel
+
+    def shorten_vowel(self, index: int, syl: List[str]) -> None:
+        if (syl[1] in ['a', 'e', 'o', 'u']
+                and syl[2] == ''
+                and index != self.last_syl_index):
+            self.pref.update({'vowel_length': 'long'})
 
     def select_coda(self, index: int, syl: List[str]) -> str:
         if syl[2] == '':
@@ -109,14 +116,30 @@ def select_coda_n(self, index: int, syl: List[str]) -> str:
         next_onset = self.split[index+1][0]
         if next_onset == '':
             coda = 'ง'
-        elif next_onset in ['g', 'k', 'h', 'f', 'w', 'y']:
+        elif next_onset in self.coda_ng:
             coda = 'ง'
-        elif next_onset in ['b', 'm', 'p']:
+        elif next_onset in self.coda_m:
             coda = 'ม'
         else:
             coda = self.ja_dict['coda'][syl[2]]
         return coda
 
+class JaRi35(JaOrs61):
+
+    coda_ng = ['g', 'k', 'h', 'f', 'w']
+
+    def __init__(self, input: str) -> None:
+        self.ja_dict = RI35
+        if input == 'Hepburn-no diacritic':
+            self.ja_dict = RI35_NO_DIACRITIC
+
+    def shorten_vowel(self, index: int, syl: List[str]) -> None:
+        pass
+
+    def select_coda_s(self, index: int, syl: List[str]) -> str:
+        coda = self.ja_dict['coda'][syl[2]]
+        return coda
+
 class ThapJa:
 
     def __init__(
@@ -127,6 +150,7 @@ def __init__(
         :param system: Select thapsap system.
             - 'ORS61' for the Office of the Royal Society (2018/2561)
             system
+            - 'RI35' for the Royal Institute (1992/2535) system
 
         :param input: Select input type.
             - 'Hepburn-macron' for Hepburn romanization with macron
@@ -135,7 +159,9 @@ def __init__(
             without diacritic (ex. arigatou)
         """
         if system == 'ORS61':
-            self.transcript = JaRs61(input)
+            self.transcript = JaOrs61(input)
+        elif system == 'RI35':
+            self.transcript = JaRi35(input)
         else:
             raise NotAvailableSystem
 

diff --git a/src/wunsen/main.py b/src/wunsen/main.py
@@ -17,7 +17,7 @@ def __init__(self, lang: str, **setting: Any) -> None:
         
         :key str system: Specify system of transcription/
                     transliteration to use. Available system:
-                    ja: 'ORS61' | ko: 'RI55' | vi: 'RI55' |
+                    ja: 'ORS61', 'RI35' | ko: 'RI55' | vi: 'RI55' |
                     zh: 'RI49', 'THC43'
         :key str input: Specify input type
                     ja: 'Hepburn-macron', 'Hepburn-no diacritic' |

diff --git a/tests/test_ja_thapsap.py b/tests/test_ja_thapsap.py
@@ -166,6 +166,104 @@
     # Ishii (Ishi-i)
 }
 
+RI35_EXAMPLE = {
+    'obi': 'โอะบิ',
+    'konbanwa': 'คมบังวะ',
+    'chīsai': 'ชีซะอิ',
+    'konnichiwa': 'คนนิชิวะ',
+    'denwa': 'เด็งวะ',
+    'Yamada': 'ยะมะดะ',
+    'Fujisan': 'ฟุจิซัง',
+    'fune': 'ฟุเนะ',
+    'ginkō': 'กิงโก',
+    'arigatō': 'อะริงะโต',
+    'hashi': 'ฮะชิ',
+    'Hiroshima': 'ฮิโระชิมะ',
+    'kaji': 'คะจิ',
+    'kao': 'คะโอะ',
+    'niku': 'นิกุ',
+    'gakkō': 'กักโก',
+    'mado': 'มะโดะ',
+    'Nagoya': 'นะโงะยะ',
+    'konnichiwa': 'คนนิชิวะ',
+    'pen': 'เพ็ง',
+    'tenpura': 'เท็มปุระ',
+    'Nippon': 'นิปปง',
+    'ringo': 'ริงโงะ',
+    'sakana': 'ซะกะนะ',
+    'sashimi': 'ซะชิมิ',
+    'kissaten': 'คิสซะเต็ง',
+    'zasshi': 'ซัสชิ',
+    'te': 'เทะ',
+    'migite': 'มิงิเตะ',
+    'itchi': 'อิตชิ',
+    'tsukue': 'สึกุเอะ',
+    'mittsu': 'มิตสึ',
+    'watashi': 'วะตะชิ',
+    'yama': 'ยะมะ',
+    'mizu': 'มิซุ',
+
+    'yama': 'ยะมะ',
+    'sakura': 'ซะกุระ',
+    'gakkō': 'กักโก',
+    'san': 'ซัง',
+    'okāsan': 'โอะกาซัง',
+    'obāsan': 'โอะบาซัง',
+    'ike': 'อิเกะ',
+    'fune': 'ฟุเนะ',
+    'denwa': 'เด็งวะ',
+    'sensei': 'เซ็นเซ',
+    'ē': 'เอ',
+    'onēsan': 'โอะเนซัง',
+    'sensei': 'เซ็นเซ',
+    'kin': 'คิง',
+    'kaki': 'คะกิ',
+    'hashi': 'ฮะชิ',
+    'onīsan': 'โอะนีซัง',
+    'oishī': 'โอะอิชี',
+    'ocha': 'โอะชะ',
+    'kome': 'โคะเมะ',
+    'Nippon': 'นิปปง',
+    'konnichiwa': 'คนนิชิวะ',
+    'otōsan': 'โอะโตซัง',
+    'sayōnara': 'ซะโยนะระ',
+    'shinbun': 'ชิมบุง',
+    'isu': 'อิซุ',
+    'Suzuki': 'ซุซุกิ',
+    'jūyō': 'จูโย',
+    'jūsho': 'จูโชะ',
+    'kyaku': 'เคียะกุ',
+    'hyaku': 'เฮียะกุ',
+    'nyānyā': 'เนียเนีย',
+    'ryokō': 'เรียวโก',
+    'byōin': 'เบียวอิง',
+    'ryōri': 'เรียวริ',
+    'kyu': 'คิว',
+    'kyūkō': 'คีวโก'
+}
+
+RI35_EX_NO_DIACRITIC = {
+    'chiisai': 'ชีซะอิ',
+    'ginkoo': 'กิงโก',
+    'arigatoo': 'อะริงะโต',
+    'gakkoo': 'กักโก',
+    'okaasan': 'โอะกาซัง',
+    'obaasan': 'โอะบาซัง',
+    'ee': 'เอ',
+    'oneesan': 'โอะเนซัง',
+    'oniisan': 'โอะนีซัง',
+    'oishii': 'โอะอิชี',
+    'otoosan': 'โอะโตซัง',
+    'sayoonara': 'ซะโยนะระ',
+    'juuyoo': 'จูโย',
+    'juusho': 'จูโชะ',
+    'nyaanyaa': 'เนียเนีย',
+    'ryokoo': 'เรียวโก',
+    'byooin': 'เบียวอิง',
+    'ryoori': 'เรียวริ',
+    'kyuukoo': 'คีวโก'
+}
+
 class TestSpellWord(unittest.TestCase):
 
     def test_general(self):
@@ -183,5 +281,15 @@ def test_ors61_ex_no_diacritic(self):
         for case in ORS61_EX_NO_DIACRITIC:
             self.assertEqual(thap_sap.thap(case), ORS61_EX_NO_DIACRITIC[case])
 
+    def test_ri35_example(self):
+        thap_sap = ThapSap('ja', system='RI35')
+        for case in RI35_EXAMPLE:
+            self.assertEqual(thap_sap.thap(case), RI35_EXAMPLE[case])
+
+    def test_ri35_ex_no_diacritic(self):
+        thap_sap = ThapSap('ja', system='RI35', input='Hepburn-no diacritic')
+        for case in RI35_EX_NO_DIACRITIC:
+            self.assertEqual(thap_sap.thap(case), RI35_EX_NO_DIACRITIC[case])
+
 if __name__ == '__main__':
     unittest.main()