Merge pull request #1 from cakimpei/Add-Mandarin

Add mandarin
cakimpei · Jul 12, 2022 · 5b6d4ea · 5b6d4ea
2 parents f5e0447 + acc6298
commit 5b6d4ea
Show file tree

Hide file tree

Showing 9 changed files with 896 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -4,6 +4,7 @@ Wunsen provides 'thai-ization' of different languages.
 
 Currently support:
 
+- Standard Chinese (from Hanyu Pinyin)
 - Japanese (from Hepburn romanization)
 - Korean (from Revised Romanization)
 - Vietnamese (Latin script)
@@ -27,6 +28,15 @@ pip install wunsen
 ```python
 from wunsen import ThapSap
 
+# Chinese (Pinyin with tone diacritics is not supported yet.)
+thap_zh = ThapSap('zh', system='RI49')
+thap_zh.thap('ni3 hao3')
+# => 'หนี ห่าว'
+
+thap_zh = ThapSap('zh', system='THC43')
+thap_zh.thap('ni3 hao3')
+# => 'หนี เห่า'
+
 # Japanese
 thap_ja = ThapSap('ja')
 thap_ja.thap('ohayō')
@@ -52,13 +62,18 @@ thap_vi.thap('xin chào')
 
 There might be some differences between Wunsen result and the intended result from the actual system, so please review the results.
 
-- Japanese => หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น (สำนักงานราชบัณฑิตยสภา พ.ศ. 2561)
-- Korean => หลักเกณฑ์การทับศัพท์ภาษาเกาหลี (ราชบัณฑิตยสถาน พ.ศ. 2555)
-- Vietnamese => หลักเกณฑ์การทับศัพท์ภาษาเวียดนาม (ราชบัณฑิตยสถาน พ.ศ. 2555)
+- Chinese =>
+    - หลักเกณฑ์การทับศัพท์ภาษาจีน (ราชบัณฑิตยสถาน พ.ศ. 2549) 'RI49'
+    - เกณฑ์การถ่ายทอดเสียงภาษาจีนแมนดารินด้วยอักขรวิธีไทย (คณะกรรมการสืบค้นประวัติศาสตร์ไทยในเอกสารภาษาจีน พ.ศ. 2543) 'THC43'
+- Japanese => หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น (สำนักงานราชบัณฑิตยสภา พ.ศ. 2561) 'ORS61'
+- Korean => หลักเกณฑ์การทับศัพท์ภาษาเกาหลี (ราชบัณฑิตยสถาน พ.ศ. 2555) 'RI55'
+- Vietnamese => หลักเกณฑ์การทับศัพท์ภาษาเวียดนาม (ราชบัณฑิตยสถาน พ.ศ. 2555) 'RI55'
 
 ### Notes
 
-Wunsen might break syllables in incorrect place:
+#### Syllabification Issues
+
+Wunsen might break syllables in the wrong place. You might have to add an apostrophe to mark the syllable boundary:
 
 ```python
 thap_ja.thap("honya | hon'ya")
@@ -67,3 +82,52 @@ thap_ja.thap("honya | hon'ya")
 thap_ko.thap("waengwaeng, maeum | waeng'waeng, ma'eum")
 # => "แว็นกแว็ง, แมอุม | แว็งแว็ง, มาอึม"
 ```
+
+#### Chinese Tone Sandhi
+
+For Standard Chinese, both Thai-ization systems specify that the third-tone sandhi rule should be applied to the Thai result. Wunsen applies it automatically, but you can turn it off.
+
+```python
+thap_zh_no_sandhi = ThapSap('zh', option={'sandhi': False})
+thap_zh_no_sandhi.thap('ni3 hao3')
+# => 'หนี่ เห่า' / ni3 hao3
+
+# if we turn it on
+thap_zh_with_sandhi = ThapSap('zh', option={'sandhi': True})
+thap_zh_with_sandhi.thap('ni3 hao3')
+# => 'หนี เห่า' / ni2 hao3
+
+thap_zh_with_sandhi.thap('ni3hao3')
+# => 'หนีเห่า' / ni2hao3
+
+# examples from wikipedia
+thap_zh_with_sandhi.thap('bao3guan3 hao3')
+# => 'เป๋าก๋วน เห่า' / bao2guan2 hao3
+
+thap_zh_with_sandhi.thap('lao3 bao3guan3')
+# => 'เหล่า เป๋าก่วน' / lao3 bao2guan3
+```
+
+Wunsen doesn't apply 不 (bù) and 一 (yī) tone rules as they are difficult to recognize in Pinyin.
+
+#### Japanese long vowels
+
+Two adjacent short vowels that come from different origins should be transcribed as two short vowels (not as one long vowel), but Wunsen cannot cover this case entirely.
+
+```python
+thap_ja.thap("公子 kōshi | 子牛 koushi | 石井 Ishii | ただいま tadaima")
+# => "公子 โคชิ | 子牛 โคอูชิ | 石井 อิชิอิ | ただいま ทาไดมะ"
+# kōshi, koushi, Ishii are fine but tadaima is ta-dai-ma in Wunsen instead of ta-da-i-ma
+
+thap_ja_no_macron.thap("公子 koushi | 子牛 koushi | 石井 Ishii | ただいま tadaima")
+# => "公子 โคชิ | 子牛 โคชิ | 石井 อิชี | ただいま ทาไดมะ"
+# they're transcribed as kou-shi | kou-shi | i-shii | ta-dai-ma so they're incorrect except 公子
+```
+
+#### Spacing in Thai
+
+To follow the actual transcription/transliteration system, in some cases the space between syllables or words might have to be deleted from the Thai result.
+
+For example, หลักเกณฑ์การทับศัพท์ภาษาเวียดนาม (ราชบัณฑิตยสถาน พ.ศ. 2555) (Vietnamese system) specifies that spaces in Vietnamese place names should be deleted, but in personal names the spaces should be kept as in Vietnamese.
+
+Because it depends on the situation, Wunsen will leave spacing as it is.
diff --git a/setup.cfg b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = wunsen
-version = 0.0.1
+version = 0.0.2
 author = cakimpei
 author_email = [email protected]
 description = Thai-ization tool

diff --git a/src/wunsen/main.py b/src/wunsen/main.py
@@ -3,29 +3,36 @@
 from .vietnamese import ThapVi
 from .japanese import ThapJa
 from .korean import ThapKo
+from .mandarin import ThapZh
 
 class ThapSap:
     def __init__(self, lang: str, **setting: Any) -> None:
         """Setting
         
         :param lang: Specify language to use ('ja' Japanese |
-                    'ko' Korean | 'vi' Vietnamese)
+                    'ko' Korean | 'vi' Vietnamese |
+                    'zh' Standard Chinese)
         
         The rest are not required.
         
-        :key system: Specify system of transcription/transliteration
-                    to use. Available system:
-                    ja: 'ORS61' | ko: 'RI55' | vi: 'RI55'
-        :key input: Specify input type
+        :key str system: Specify system of transcription/
+                    transliteration to use. Available system:
+                    ja: 'ORS61' | ko: 'RI55' | vi: 'RI55' |
+                    zh: 'RI49', 'THC43'
+        :key str input: Specify input type
                     ja: 'Hepburn-macron', 'Hepburn-no diacritic' |
-                    ko: 'RR' | vi: 'VA'
+                    ko: 'RR' | vi: 'VA' | zh: 'Pinyin-number'
+        :key dict option: (Standard Chinese only) Specify option
+                    zh: {'sandhi': True/False} (for third tone sandhi)
         """
         if lang == 'ja':
             self._transcriber = ThapJa(**setting)
         elif lang == 'ko':
             self._transcriber = ThapKo(**setting)
         elif lang == 'vi':
             self._transcriber = ThapVi(**setting)
+        elif lang == 'zh':
+            self._transcriber = ThapZh(**setting)
         else:
             raise ValueError('Language not found')
 

diff --git a/src/wunsen/mandarin/__init__.py b/src/wunsen/mandarin/__init__.py
@@ -0,0 +1 @@
+from .zh_thapsap import ThapZh
diff --git a/src/wunsen/mandarin/zh_mapping.py b/src/wunsen/mandarin/zh_mapping.py
@@ -0,0 +1,144 @@
+from copy import deepcopy
+
+RI49 = {  # mapping tables for the 'RI49' system (Pinyin -> Thai letters)
+    'onset': {  # Pinyin initial -> Thai consonant; '' is the entry for a syllable with no initial
+        'b': 'ป',
+        'c': 'ช',
+        'ch': 'ช',
+        'd': 'ต',
+        'f': 'ฟ',
+        'g': 'ก',
+        'h': 'ฮ',
+        'j': 'จ',
+        'k': 'ค',
+        'l': 'ล',
+        'm': 'ม',
+        'n': 'น',
+        'ng': 'ง',
+        'p': 'พ',
+        'q': 'ช',
+        'r': 'ร',
+        's': 'ซ',
+        'sh': 'ช',
+        't': 'ท',
+        'w': 'ว', ##
+        'x': 'ซ',
+        'y': 'ย', ##
+        'z': 'จ',
+        'zh': 'จ',
+        '': 'อ'
+    },
+    'onset_cond': {  # alternative onsets, used under the condition noted on each entry
+        'w': 'อ', # when followed by u
+        'y': 'อ' # when followed by i
+    },
+    'rime': {  # Pinyin final -> Thai rime description
+        # each value is a list: [vowel_onset, vowel, silent_before, coda, phinthu]
+        'a': ['อ', 'อา', '', '', False], ##
+        'ai': ['อ', 'อาย', '', '', False],
+        'an': ['อ', 'อา', '', 'น', False],
+        'ang': ['อ', 'อา', '', 'ง', False],
+        'ao': ['อ', 'อาว', '', '', False],
+        'e': ['อ', 'เออ', '', '', False], ##
+        'ei': ['อ', 'เอ', 'ย', '', False],
+        'en': ['อ', 'เออ', '', 'น', False],
+        'eng': ['อ', 'เออ', '', 'ง', False],
+        'er': ['อ', 'เออ', 'ร', '', False],
+        'i': ['อ', 'อี', '', '', False], ##
+        'ia': ['อ', 'เอีย', '', '', False],
+        'ian': ['อ', 'เอีย', '', 'น', False],
+        'iang': ['อ', 'เอีย', '', 'ง', False],
+        'iao': ['อ', 'เอียว', '', '', False],
+        'ie': ['อ', 'เอีย', '', '', False],
+        'in': ['อ', 'อิ', '', 'น', False],
+        'ing': ['อ', 'อิ', '', 'ง', False],
+        'iong': ['ย', 'โอะ', '', 'ง', True],
+        'iu': ['อ', 'อิว', '', '', False],
+        'o': ['อ', 'โอ', '', '', False], ##
+        'ong': ['อ', 'โอะ', '', 'ง', False],
+        'ou': ['อ', 'โอว', '', '', False],
+        'u': ['อ', 'อู', '', '', False], ##
+        'ü': ['ว', 'อี', '', '', True],
+        'ua': ['ว', 'อา', '', '', False],
+        'uai': ['ว', 'อาย', '', '', False],
+        'uan': ['ว', 'อะ', '', 'น', False], ##
+        'uang': ['ว', 'อา', '', 'ง', False],
+        'ue': ['ว', 'เอ', '', '', True], ##
+        'üe': ['ว', 'เอ', '', '', True],
+        'ui': ['อ', 'อุย', '', '', False],
+        'un': ['อ', 'อุ', '', 'น', False], ##
+        'ün': ['ว', 'อิ', '', 'น', True],
+        'uo': ['อ', 'อัว', '', '', False],
+
+        'v': ['ว', 'อี', '', '', True],  # 'v', 've', 'vn' carry the same values as 'ü', 'üe', 'ün'
+        've': ['ว', 'เอ', '', '', True],
+        'vn': ['ว', 'อิ', '', 'น', True]
+
+    },
+    'rime_cond': {  # alternative rimes; each entry's comment names the preceding onsets (same lists as 'cond_rime')
+        'e': ['อ', 'เอ', '', '', False], # after y
+        'i': ['อ', 'อือ', '', '', False], # after c, ch, r, s, sh, z, zh
+        'o': ['อ', 'อัว', '', '', False], # after b, f, m, p, w
+        'u': ['ว', 'อี', '', '', True], # after j, q, x, y
+        'uan': ['ว', 'เอีย', '', 'น', True], # after j, q, x, y
+        'un': ['ว', 'อิ', '', 'น', True] # after j, q, x, y
+    },
+    'cond_rime': {  # rime -> onsets that select the 'rime_cond' form (presumably; consumer code not shown here)
+        'e': ['y'],
+        'i': ['c', 'ch', 'r', 's', 'sh', 'z', 'zh'],
+        'o': ['b', 'f', 'm', 'p', 'w'],
+        'u': ['j', 'q', 'x', 'y'],
+        'uan': ['j', 'q', 'x', 'y'],
+        'un': ['j', 'q', 'x', 'y']
+    },
+    'rime_cond2': {  # NOTE(review): the '-'/'+' pattern syntax is interpreted elsewhere - confirm its meaning
+        # tone 0, 5
+        'a': '-+ะ',
+        'i': '-ึ+' # after c, ch, r, s, sh, z, zh
+    },
+    'tone': {  # keys look like Pinyin tone numbers (0-5); meaning of the values is set by the consumer - TODO confirm
+        0: -1,
+        1: 0,
+        2: 4,
+        3: 1,
+        4: 2,
+        5: -1
+    }
+}
+
+THC43 = deepcopy(RI49)  # independent copy of RI49, so the overrides below do not alter RI49
+THC43['onset'].update({  # onsets that differ from RI49
+    'sh': 'ซ',
+})
+THC43['rime'].update({  # rimes that differ from RI49; same [vowel_onset, vowel, silent_before, coda, phinthu] layout
+    'ai': ['อ', 'ไอ', '', '', False],
+    'ang': ['อ', 'อะ', '', 'ง', False],
+    'ao': ['อ', 'เอา', '', '', False],
+    'ia': ['ย', 'อา', '', '', False],
+    'iong': ['ย', 'โอะ', '', 'ง', False],
+    'ü': ['อ', 'อี', 'ว', '', False],
+    'uai': ['ว', 'ไอ', '', '', False],
+    'uan': ['อ', 'อัว', '', 'น', False],
+    'uang': ['อ', 'อัว', '', 'ง', False],
+    'ue': ['อ', 'เอ', 'ว', '', False],
+    'üe': ['อ', 'เอ', 'ว', '', False],
+    'ün': ['ว', 'อิ', '', 'น', False],
+
+    'v': ['อ', 'อี', 'ว', '', False],  # 'v', 've', 'vn' carry the same values as 'ü', 'üe', 'ün'
+    've': ['อ', 'เอ', 'ว', '', False],
+    'vn': ['ว', 'อิ', '', 'น', False]
+})
+THC43['rime_cond'].update({  # conditional rimes changed from RI49, plus 'an' and 'ue' which RI49 lacks
+    'an': ['อ', 'เอีย', '', 'น', False],
+    'e': ['อ', 'เอีย', '', '', False],
+    'u': ['อ', 'อี', 'ว', '', False],
+    'uan': ['ว', 'เอีย', '', 'น', False],
+    'ue': ['ว', 'เอีย', '', '', False],
+    'un': ['ว', 'อิ', '', 'น', False]
+})
+THC43['cond_rime'].update({  # triggering onsets for the conditional rimes above
+    'an': ['y'],
+    'e': ['y'],  # same value as in RI49
+    'uan': ['j', 'q', 'x'],  # RI49 also lists 'y' here
+    'ue': ['j', 'q', 'x']
+})