diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..9c90017 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 cakimpei, cakimpei@gmail.com + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..5b2cfed --- /dev/null +++ b/README.md @@ -0,0 +1,67 @@ +# Wunsen + +Wunsen provides 'thai-ization' of different languages. 
+ +Currently supported languages: + +- Japanese (from Hepburn romanization) +- Korean (from Revised Romanization) +- Vietnamese (Latin script) + +## Installation + +Requirements: + +Python >= 3.7 + +[khanaa](https://github.com/cakimpei/khanaa) + +``` +pip install wunsen +``` + +## Usage + +```python +from wunsen import ThapSap + +# Japanese +thap_ja = ThapSap('ja') +thap_ja.thap('ohayō') +# => 'โอฮาโย' + +# without macron +thap_ja_no_macron = ThapSap('ja', input='Hepburn-no diacritic') +thap_ja_no_macron.thap('ohayou') +# => 'โอฮาโย' + +# Korean +thap_ko = ThapSap('ko') +thap_ko.thap('annyeonghaseyo') +# => 'อันนย็องฮาเซโย' + +# Vietnamese +thap_vi = ThapSap('vi') +thap_vi.thap('xin chào') +# => 'ซีน จ่าว' +``` + +## Transcription/Transliteration Systems in Wunsen + +Wunsen's output may differ from what the official system intends, so please review the results. + +- Japanese => หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น (สำนักงานราชบัณฑิตยสภา พ.ศ. 2561) +- Korean => หลักเกณฑ์การทับศัพท์ภาษาเกาหลี (ราชบัณฑิตยสถาน พ.ศ. 2555) +- Vietnamese => หลักเกณฑ์การทับศัพท์ภาษาเวียดนาม (ราชบัณฑิตยสถาน พ.ศ. 
2555) + +### Notes + +Wunsen might break syllables in incorrect place: + +```python +thap_ja.thap("honya | hon'ya") +# => "โฮเนีย | ฮงยะ" + +thap_ko.thap("waengwaeng, maeum | waeng'waeng, ma'eum") +# => "แว็นกแว็ง, แมอุม | แว็งแว็ง, มาอึม" +``` \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..fa7093a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools>=42"] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..bfbd0fd --- /dev/null +++ b/setup.cfg @@ -0,0 +1,26 @@ +[metadata] +name = wunsen +version = 0.0.1 +author = cakimpei +author_email = cakimpei@gmail.com +description = Thai-ization tool +long_description = file: README.md +long_description_content_type = text/markdown +url = https://github.com/cakimpei/wunsen +project_urls = + Bug Tracker = https://github.com/cakimpei/wunsen/issues +classifiers = + Programming Language :: Python :: 3 + License :: OSI Approved :: MIT License + Operating System :: OS Independent + +[options] +package_dir = + = src +packages = find: +python_requires = >=3.7 +install_requires = + khanaa + +[options.packages.find] +where = src \ No newline at end of file diff --git a/src/wunsen/__init__.py b/src/wunsen/__init__.py new file mode 100644 index 0000000..9929f58 --- /dev/null +++ b/src/wunsen/__init__.py @@ -0,0 +1 @@ +from .main import ThapSap \ No newline at end of file diff --git a/src/wunsen/japanese/__init__.py b/src/wunsen/japanese/__init__.py new file mode 100644 index 0000000..325f798 --- /dev/null +++ b/src/wunsen/japanese/__init__.py @@ -0,0 +1 @@ +from .ja_thapsap import ThapJa \ No newline at end of file diff --git a/src/wunsen/japanese/ja_mapping.py b/src/wunsen/japanese/ja_mapping.py new file mode 100644 index 0000000..44eedd3 --- /dev/null +++ b/src/wunsen/japanese/ja_mapping.py @@ -0,0 +1,73 @@ +from copy import deepcopy + +ORS61 = { + + 
def __init__(self, input: str) -> None:
    """Select the romanization table that matches the input convention.

    :param input: Input type. 'Hepburn-no diacritic' selects
        ``ORS61_NO_DIACRITIC`` (the table that also knows doubled-vowel
        spellings such as ``aa`` / ``ou``); the underscore spelling
        'Hepburn-no_diacritic' is accepted as an alias. Any other value
        keeps the macron table ``ORS61``.
    """
    # Accept the underscore spelling too: a near-miss would otherwise
    # silently select the macron table and mis-transcribe long vowels.
    if input in ('Hepburn-no diacritic', 'Hepburn-no_diacritic'):
        self.ja_dict = ORS61_NO_DIACRITIC
    else:
        self.ja_dict = ORS61
@staticmethod
def adapt_split(
        old_syl: List[List[str]],
        last_syl_index: int) -> List[List[str]]:
    """Apply Japanese-specific fix-ups to ``[onset, vowel, coda]`` triples.

    The input is left untouched; a new list of new triples is returned.

    :param old_syl: Syllables as ``[onset, vowel, coda]`` lists.
    :param last_syl_index: Index of the final syllable in ``old_syl``.
    :return: Adjusted copy of ``old_syl``.
    """
    # Vowel spellings whose Thai form already ends the syllable, so a
    # following coda has to be dropped.
    coda_dropping_vowels = ('yo', 'yoo', 'you', 'yu', 'yuu', 'yō', 'yū')

    new_split = []
    for index, syl in enumerate(old_syl):
        # Copy so the caller's lists are never modified in place.
        new_syl = list(syl)
        # ya = ยา not เอีย, เยีย: with no consonant in front, the leading
        # "y" of the vowel spelling is the onset.
        if new_syl[0] == '' and new_syl[1].startswith('y'):
            new_syl = ['y', new_syl[1][1:], new_syl[2]]
        # mitsu = มิตสึ: an open syllable followed by "ts" gets a "t" coda.
        if (index != last_syl_index
                and new_syl[2] == ''
                and old_syl[index + 1][0] == 'ts'):
            new_syl[2] = 't'
        if new_syl[1] in coda_dropping_vowels and new_syl[2] != '':
            new_syl[2] = ''
        new_split.append(new_syl)
    return new_split
class ThapJa:
    """Transcribe romanized Japanese words found in free text into Thai."""

    def __init__(
            self, system: str = 'ORS61',
            input: str = 'Hepburn-macron') -> None:
        """Setting

        :param system: Select thapsap system.
            - 'ORS61' for the Office of the Royal Society (2018/2561)
            system

        :param input: Select input type.
            - 'Hepburn-macron' for Hepburn romanization with macron
            (ex. arigatō)
            - 'Hepburn-no diacritic' for Hepburn romanization
            without diacritic (ex. arigatou)

        :raises NotAvailableSystem: if ``system`` is not 'ORS61'.
        """
        if system == 'ORS61':
            self.transcript = JaRs61(input)
        else:
            raise NotAvailableSystem

    def thap(self, text: str) -> str:
        """Replace each romanized word in ``text`` with its transcription.

        A word is a run of Latin letters (plain or with macron); an
        apostrophe counts as part of the word only when it sits between
        two such characters (``hon'ya``). Everything else in ``text`` is
        returned unchanged.

        :param text: Arbitrary text containing romanized Japanese.
        :return: ``text`` with every matched word passed through
            ``self.transcript.thap_kham``.
        """
        def find_expression() -> Pattern[str]:
            # U+0304 (combining macron) keeps decomposed input such as
            # "o" + U+0304 inside the word, so the combining mark is not
            # stranded after the replacement. U+00AF (spacing macron) is
            # retained because earlier versions matched it.
            char_list = ''.join(['a-zA-ZāēīōūĀĒĪŌŪ', '\u00af', '\u0304'])
            exp = f"([{char_list}]|(?<=[{char_list}])'(?=[{char_list}]))+"
            return re.compile(exp)

        def replace(match):
            return self.transcript.thap_kham(match.group())

        return re.sub(find_expression(), replace, text)
class KoRi55:
    """Korean (Revised Romanization) to Thai, using the ``RI55`` table."""

    ko_dict = RI55

    def __init__(self) -> None:
        """Nothing to configure; the table is a class attribute."""

    def thap_kham(self, word: str) -> str:
        """Transcribe one romanized word.

        The word is returned unchanged when the syllable splitter raises
        ``NotInDict``.
        """
        try:
            self.split = SplitSyl(self.ko_dict).split_syl(word)
        except NotInDict:
            return word

        pieces = []
        for position, syllable in enumerate(self.split):
            # Reset per syllable; select_vowel may add 'vowel_length'.
            self.pref = {'clear_vowel_onset': 'all'}
            onset = self.select_onset(position, syllable)
            vowel = self.select_vowel(position, syllable)
            coda = self.select_coda(syllable)
            speller = SpellWord(**self.pref)
            pieces.append(
                speller.spell_out(onset=onset, vowel=vowel, coda=coda))
        return ''.join(pieces)

    def select_onset(self, index: int, syl: List[str]) -> str:
        """Return the Thai initial for ``syl`` (``[onset, vowel, coda]``).

        Onset entries hold two forms; the first is used when ``index`` is
        0, the second otherwise. Slot 0 of the vowel entry is appended to
        the consonant unless it is the placeholder 'อ'.
        """
        initial, nucleus = syl[0], syl[1]
        glide = self.ko_dict['vowel'][nucleus][0]
        if initial == '':
            return glide
        if initial == 's' and nucleus in ['i', 'wi']:
            consonant = 'ช'
        else:
            form = 0 if index == 0 else 1
            consonant = self.ko_dict['onset'][initial][form]
        if glide == 'อ':
            return consonant
        return ''.join([consonant, glide])

    def select_vowel(self, index: int, syl: List[str]) -> str:
        """Return the Thai vowel for ``syl`` and flag closed syllables.

        'ui' keeps its table value only in a word-initial syllable with no
        consonant onset; everywhere else it becomes 'อี'. A non-empty coda
        sets ``vowel_length`` to 'short' in ``self.pref``.
        """
        keeps_table_value = syl[0] == '' and index == 0
        if syl[1] == 'ui' and not keeps_table_value:
            vowel = 'อี'
        else:
            vowel = self.ko_dict['vowel'][syl[1]][1]
        if syl[2] != '':
            self.pref['vowel_length'] = 'short'
        return vowel

    def select_coda(self, syl: List[str]) -> str:
        """Return the Thai final for ``syl``, or '' for an open syllable."""
        final = syl[2]
        return '' if final == '' else self.ko_dict['coda'][final]
class ThapSap:
    """Single entry point that dispatches to a per-language transcriber."""

    def __init__(self, lang: str, **setting: Any) -> None:
        """Setting

        :param lang: Specify language to use ('ja' Japanese |
            'ko' Korean | 'vi' Vietnamese)

        The rest are not required.

        :key system: Specify system of transcription/transliteration
            to use. Available system:
            ja: 'ORS61' | ko: 'RI55' | vi: 'RI55'
        :key input: Specify input type
            ja: 'Hepburn-macron', 'Hepburn-no diacritic' |
            ko: 'RR' | vi: 'VA'

        :raises ValueError: if ``lang`` is not 'ja', 'ko' or 'vi'.
        """
        # Keyword settings are forwarded untouched to the language class.
        if lang == 'ja':
            self._transcriber = ThapJa(**setting)
        elif lang == 'ko':
            self._transcriber = ThapKo(**setting)
        elif lang == 'vi':
            self._transcriber = ThapVi(**setting)
        else:
            raise ValueError('Language not found')

    def thap(self, text: str) -> str:
        """Return ``text`` transcribed by the selected language class."""
        return self._transcriber.thap(text)
def find_coda_onset(self, con_split: List[str]) -> Tuple[str, str]:
    """Assign the consonant pieces between two vowels to coda and onset.

    :param con_split: One or two consonant strings (possibly empty).
    :return: ``(coda, onset)``; a single piece is treated as an onset
        when possible and as a coda otherwise.
    :raises NotInDict: if there are more than two pieces, or a piece is
        not a known coda/onset for the position it would occupy.
    """
    pieces = len(con_split)
    if pieces > 2:
        raise NotInDict

    codas = self.ts_dict['coda']
    onsets = self.ts_dict['onset']

    if pieces == 2:
        left, right = con_split[0], con_split[-1]
        bad_coda = left != '' and left not in codas
        bad_onset = right != '' and right not in onsets
        if bad_coda or bad_onset:
            raise NotInDict
        return left, right

    # Single piece: prefer reading it as the next syllable's onset.
    only = con_split[0]
    if only == '' or only in onsets:
        return '', only
    if only in codas:
        return only, ''
    raise NotInDict
@staticmethod
def check_first(syl_list: List[str]) -> None:
    """Ensure the flattened syllable list starts with an empty slot.

    ``split_syl`` calls this right before it discards element 0, so a
    non-empty first element would otherwise be dropped silently.

    Declared as a static method: it takes no ``self``, and without the
    decorator it could only be called on the class, never on an
    instance.

    :param syl_list: Flat list whose first element is checked.
    :raises NotInDict: if the first element is not ''.
    """
    if syl_list[0] != '':
        raise NotInDict
class SplitSyl:
    """Split a single Vietnamese syllable into onset, vowel, coda, tone.

    Vietnamese syllables are usually separated by a space or hyphen, so
    the input (after that first split) is assumed to be one syllable.
    """

    def __init__(self, system: Dict[str, Any]) -> None:
        """Keep the mapping; its 'onset', 'coda', 'vowel' and 'tone'
        keys are read by the methods below."""
        self.sys_dict = system

    def split_syl(self, syl) -> List[str]:
        """Return ``[onset, vowel, coda, tone]`` for ``syl``.

        The syllable is lower-cased and NFC-normalized first.

        :raises NotVietnamese: if the toneless vowel is not in the table.
        """
        self.syl = unicodedata.normalize('NFC', syl.lower())
        for step in (self.find_structure, self.find_tone, self.check_vowel):
            step()
        return [self.onset, self.vowel, self.coda, self.tone]

    def find_structure(self) -> None:
        """Cut ``self.syl`` at its onset (anchored at the start) and its
        coda (anchored at the end); what lies between is the vowel."""
        onset_alt = self.find_onset_pattern()
        coda_alt = self.find_coda_pattern()
        splitter = re.compile(f'({onset_alt}|{coda_alt})')
        parts = splitter.split(self.syl)

        self.onset = parts[1]
        self.vowel = parts[2]
        self.coda = parts[3]

    def find_onset_pattern(self) -> str:
        """Regex alternation of all onsets, longest first, each with '^'."""
        by_length = sorted(self.sys_dict['onset'], key=len, reverse=True)
        return '^' + '|^'.join(by_length)

    def find_coda_pattern(self) -> str:
        """Regex alternation of all codas, longest first, each with '$'."""
        by_length = sorted(self.sys_dict['coda'], key=len, reverse=True)
        return '$|'.join(by_length) + '$'

    def find_tone(self) -> None:
        """Move the tone mark out of ``self.vowel`` into ``self.tone``.

        The vowel is decomposed (NFD) so the tone is a separate combining
        character, then recomposed (NFC) once the mark is removed.
        """
        self.tone = ''
        decomposed = unicodedata.normalize('NFD', self.vowel)
        tone_re = re.compile('|'.join(self.sys_dict['tone']))
        hit = tone_re.search(decomposed)
        if hit:
            mark = hit.group(0)
            decomposed = decomposed.replace(mark, '')
            self.tone = mark
        self.vowel = unicodedata.normalize('NFC', decomposed)

    def check_vowel(self) -> None:
        """Reject the syllable when its vowel is not in the table."""
        if self.vowel in self.sys_dict['vowel']:
            return
        raise NotVietnamese
def select_onset(self) -> None:
    """Set ``self.onset`` to the Thai initial for the current syllable.

    Reads ``self.split`` (onset key at index 0, vowel key at index 1).
    Slot 0 of a vowel entry is a glide consonant ('ว' for spellings such
    as ``oa``/``uy``) or the placeholder 'อ'. A real glide replaces the
    placeholder onset of an onset-less syllable and is appended to any
    other onset.
    """
    onset_key, vowel_key = self.split[0], self.split[1]
    self.onset = self.vi_dict['onset'][onset_key]
    # May override the onset for "g" + "i" (see check_gi).
    self.check_gi(onset_key, vowel_key)
    vowel_onset = self.vi_dict['vowel'][vowel_key][0]
    if vowel_onset != 'อ':
        if onset_key == '':
            # Assignment, not comparison: the former ``==`` was a no-op
            # and left the placeholder 'อ' where the glide belongs.
            self.onset = vowel_onset
        else:
            self.onset = ''.join([self.onset, vowel_onset])
class ThapVi:
    """Transcribe Vietnamese syllables found in free text into Thai."""

    def __init__(self, system: str = 'RI55', input: str = 'VA') -> None:
        """Setting

        :param system: Select thapsap system.
            - 'RI55' for the Royal Institute (2012/2555) system

        :param input: Select input type.
            - 'VA' for Vietnamese Alphabet

        :raises NotAvailableSystem: for any other system/input pair.
        """
        supported = system == 'RI55' and input == 'VA'
        if not supported:
            raise NotAvailableSystem
        self._transcript = ViRi55()

    def thap(self, text: str) -> str:
        """Return ``text`` with every run of Vietnamese letters replaced
        by the result of ``thap_kham``; other characters pass through."""
        syllable_re = re.compile(_VI_ALL)
        return syllable_re.sub(
            lambda found: self._transcript.thap_kham(found.group()),
            text)
"ชินไซ", + "manshū": "มันชู", + "bon'odori": "บงโอโดริ", + "ichiban": "อิจิบัง", + "pen": "เพ็ง", + "tempura": "เท็มปูระ", + "Nippon": "นิปปง", + "renraku": "เร็นรากุ", + "Nara": "นาระ", + "sakana": "ซากานะ", + "kissaten": "คิซซาเต็ง", + "zasshi": "ซัชชิ", + "Shōwa": "โชวะ", + "sashimi": "ซาชิมิ", + "te": "เทะ", + "migite": "มิงิเตะ", + "matcha": "มัตจะ", + "kitte": "คิตเตะ", + "tsunami": "สึนามิ", + "mittsu": "มิตสึ", + "mitsu": "มิตสึ", + "tsūyaku": "ซือยากุ", + "ittsū": "อิตซือ", + "futsū": "ฟุตซือ", + "watashi": "วาตาชิ", + "Fujiwara": "ฟูจิวาระ", + "yama": "ยามะ", + "Yayoi": "ยาโยอิ", + "zō": "โซ", + "mizu": "มิซุ", + + "hyaku": "เฮียกุ", + "kyakkan": "เคียกกัง", + "kyā": "เคีย", + "ryokō": "เรียวโก", + "hyotto": "เฮียวโตะ", + "ryōri": "เรียวริ", + "byuffe": "บิวเฟะ", + "kyūkō": "คีวโก", + "Ryūkyū": "รีวกีว", + + "wasabi": "วาซาบิ", + "yama": "ยามะ", + "gakkō": "กักโก", + "okāsan": "โอกาซัง", + "haiku": "ไฮกุ", + + "eki": "เอกิ", + "fune": "ฟูเนะ", + "denwa": "เด็งวะ", + "onēsan": "โอเนซัง", + "sensei": "เซ็นเซ", + + "kaki": "คากิ", + "kin": "คิง", + "oishī": "โออิชี", + + "ocha": "โอจะ", + "oto": "โอโตะ", + "konnichiwa": "คนนิจิวะ", + "sayōnara": "ซาโยนาระ", + "Sōseki": "โซเซกิ", + "Ōno": "โอโนะ", + + "Kabuki": "คาบูกิ", + "isu": "อิซุ", + "shimbun": "ชิมบุง", + "jūyō": "จูโย", + + "Ichirō SUZUKI": "อิจิโร ซูซูกิ", + "Takuya KIMURA": "ทากูยะ คิมูระ" +} + +ORS61_EX_NO_DIACRITIC = { + 'okaasan': 'โอกาซัง', + 'oishii': 'โออิชี', + 'juuyoo': 'จูโย', + 'oneesan': 'โอเนซัง', + 'sensei': 'เซ็นเซ', + 'sayoonara': 'ซาโยนาระ', + 'koushi': 'โคชิ' + # koushi (ko-ushi) + # keito (ke-ito) + # Ishii (Ishi-i) +} + +class TestSpellWord(unittest.TestCase): + + def test_general(self): + thap_sap = ThapSap('ja') + for case in GENERAL: + self.assertEqual(thap_sap.thap(case), GENERAL[case]) + + def test_ors61_example(self): + thap_sap = ThapSap('ja') + for case in ORS61_EXAMPLE: + self.assertEqual(thap_sap.thap(case), ORS61_EXAMPLE[case]) + + def test_ors61_ex_no_diacritic(self): 
+ thap_sap = ThapSap('ja', input='Hepburn-no diacritic') + for case in ORS61_EX_NO_DIACRITIC: + self.assertEqual(thap_sap.thap(case), ORS61_EX_NO_DIACRITIC[case]) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_ko_thapsap.py b/tests/test_ko_thapsap.py new file mode 100644 index 0000000..5758fde --- /dev/null +++ b/tests/test_ko_thapsap.py @@ -0,0 +1,203 @@ +import unittest +from wunsen import ThapSap + +GENERAL = { + "한국말, han'gugeo, कोरियाई भाषा": '한국말, ฮันกูกอ, कोरियाई भाषा', + 'Koreanische Sprache': 'Koreanische Sprache' +} + +RI55_EXAMPLE = { + "baji": "พาจี", + "haengbok": "แฮ็งบก", + "bap": "พับ", + "cheonji": "ช็อนจี", + "Chungcheong": "ชุงช็อง", + "kkot": "กด", + "Daegu": "แทกู", + "daedap": "แทดับ", + "natgari": "นัดการี", + # "Gangwon": "คังว็อน", + "Hanguk": "ฮันกุก", + "goguk": "โคกุก", + "hobak": "โฮบัก", + "gonghang": "คงฮัง", + "jido": "ชีโด", + "jujeonja": "ชูจ็อนจา", + "bit": "พิด", + "jjari": "จารี", + "gajja": "คาจา", + "kal": "คัล", + "mankeum": "มันคึม", + "bueok": "พูอ็อก", + "chukha": "ชูคา", + "kkangtong": "กังทง", + "eokkae": "ออแก", + "bak": "พัก", + # "maeum": "มาอึม", + "nonmun": "นนมุน", + "gim": "คิม", + "na": "นา", + "nuna": "นูนา", + "Namdaemun": "นัมแดมุน", + "Gangneung": "คังนึง", + "madang": "มาดัง", + "pal": "พัล", + "apeuda": "อาพือดา", + "yeopsaram": "ย็อบซารัม", + "iphak": "อีพัก", + "ppallae": "ปัลแล", + "ippal": "อีปัล", + "ramyeon": "รามย็อน", + "haru": "ฮารู", + "mal": "มัล", + "Jeolla": "ช็อลลา", + "sada": "ซาดา", + "saengsan": "แซ็งซัน", + "sutso": "ซุดโซ", + "sijang": "ชีจัง", + "dosi": "โทชี", + "ssal": "ซัล", + "bulssuk": "พุลซุก", + # "ssaetta": "แซ็ดตา", + "Taebaek": "แทแบ็ก", + "gita": "คีทา", + "Hanbat": "ฮันบัด", + "mathyeong": "มาทย็อง", + "ttukkeong": "ตูก็อง", + "heoritti": "ฮอรีตี", + + "ai": "อาอี", + "nara": "นารา", + "ap": "อับ", + "bap": "พับ", + "aein": "แออิน", + "gae": "แค", + "aengmusae": "แอ็งมูแซ", + "naemsae": "แน็มแซ", + "e": "เอ", + "nemo": 
"เนโม", + "enganhada": "เอ็นกันฮาดา", + "sem": "เซ็ม", + "eomeoni": "ออมอนี", + "seoda": "ซอดา", + "eonni": "อ็อนนี", + "deoreopta": "ทอร็อบทา", + "euro": "อือโร", + "seuseuro": "ซือซือโร", + # "maeum": "มาอึม", + "ireum": "อีรึม", + "ima": "อีมา", + "si": "ชี", + # "Yongin": "ยงอิน", + "sil": "ชิล", + "o": "โอ", + "podo": "โพโด", + "ot": "อด", + "don": "ทน", + "ugi": "อูกี", + "dubu": "ทูบู", + "undong": "อุนดง", + "sul": "ซุล", + + "oeguk": "เวกุก", + "goeroum": "คเวโรอุม", + "oenson": "เว็นซน", + "hoengdan": "ฮเว็งดัน", + "uisa": "อึยซา", + "Yeouido": "ยออีโด", + "huin saek": "ฮีน แซ็ก", # "ฮีนแซ็ก" + "wa": "วา", + "gwaja": "ควาจา", + "wang": "วัง", + "Gwangju": "ควังจู", + "wae": "แว", + "dwaeji": "ทแวจี", + # "waengwaeng": "แว็งแว็ง", + "kkwaengnamu": "กแว็งนามู", + "weiteo": "เวอีทอ", + "gwebeom": "คเวบ็อม", + "wennil": "เว็นนิล", + "wi": "วี", + "gwi": "ควี", + "witsaram": "วิดซารัม", + "swin": "ชวิน", + "areumdawo!": "อารึมดาวอ!", # "อารึมดาวอ" + "jwo": "ชวอ", + "wollae": "ว็อลแล", + "yeogwon": "ยอกว็อน", + "yagu": "ยากู", + "chyawo": "ชยาวอ", + "yak": "ยัก", + "dalgyal": "ทัลกยัล", + "yaegi": "แยกี", + "gyae": "คแย", + "yejeol": "เยจ็อล", + "sigye": "ชีกเย", + "yennal": "เย็นนัล", + "gyetnal": "คเย็ดนัล", + "yeohaeng": "ยอแฮ็ง", + "byeo": "พยอ", + "yeonmal": "ย็อนมัล", + "ramyeon": "รามย็อน", + "yori": "โยรี", + "hakgyo": "ฮักกโย", + "yong": "ยง", + "gongryong": "คงรยง", + "uyu": "อูยู", + "hyuji": "ฮยูจี", + "yuk": "ยุก", + "gyul": "คยุล" +} + +COUNTRIES = { + """beurunai +kambodia +indonesia +raoseu +malleisia +miyanma +pillipin +singgaporeu +taeguk +beteunam + +dongtimoreu, papuanyugini + +jungguk, ilbon, daehanmin'guk +oseuteureillia, indo, nyujillaendeu, reosia, miguk""": +"""พือรูนาอี +คัมโบดีอา +อินโดเนชีอา +ราโอซือ +มัลเลอีชีอา +มียันมา +พิลลีพิน +ชิงกาโพรือ +แทกุก +เพทือนัม + +ทงทีโมรือ, พาพูอานยูกีนี + +ชุงกุก, อิลบน, แทฮันมินกุก +โอซือทือเรอิลลีอา, อินโด, นยูจิลแล็นดือ, รอชีอา, มีกุก""" +} + +class TestSpellWord(unittest.TestCase): + + def 
test_general(self): + thap_sap = ThapSap('ko') + for case in GENERAL: + self.assertEqual(thap_sap.thap(case), GENERAL[case]) + + def test_ri55_example(self): + thap_sap = ThapSap('ko') + for case in RI55_EXAMPLE: + self.assertEqual(thap_sap.thap(case), RI55_EXAMPLE[case]) + + def test_countries(self): + thap_sap = ThapSap('ko') + for case in COUNTRIES: + self.assertEqual(thap_sap.thap(case), COUNTRIES[case]) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_vi_thapsap.py b/tests/test_vi_thapsap.py new file mode 100644 index 0000000..f3a3f1d --- /dev/null +++ b/tests/test_vi_thapsap.py @@ -0,0 +1,218 @@ +import unittest +from wunsen import ThapSap + +GENERAL = { + # composed form + 'tiếng Việt': 'เตี๊ยง เหวียต', + # decomposed form + 'tiếng Việt': 'เตี๊ยง เหวียต', + 'ベトナム語': 'ベトナム語', + '越南语': '越南语', + 'ภาษาเวียดนาม': 'ภาษาเวียดนาม', + 'Từ Hán Việt詞漢越': 'ตื่อ ฮ้าน เหวียต詞漢越' +} + +RS55_EXAMPLE = { + # 'quốc ngữ': 'โกว๊กหงือ', => กว๊กหงือ + 'ngang': 'งาง', + 'ma': 'มา', + 'câm': 'เกิม', + 'cơm': 'เกิม', + 'huyền': 'ฮเหวี่ยน', + 'mà': 'หม่า', + 'bà': 'บ่า', + 'xã': 'สา', + 'rõ': 'สอ', + 'mả': 'หมา', + 'rẻ': 'แส', + 'má': 'ม้า', + 'mát': 'ม้าต', + 'mạ': 'หมะ', + 'chị': 'จิ', + 'mạn': 'หมั่น', + 'hữu': 'หืว', + 'hưu': 'ฮืว', + 'ngoan': 'งวาน', + 'ngoãn': 'งหวาน', + 'ngoen': 'แงวน', + 'ngoẻn': 'งแหวน', + 'Hội An': 'โห่ย อาน', # 'โห่ยอาน' + 'Gia Long': 'ซา ล็อง', # 'ซาล็อง' + 'Võ Nguyên Giáp': 'หวอ เงวียน ซ้าป', + 'Bà Rịa-Vũng Tàu': 'บ่า เสียะ-หวุง เต่า', # 'บ่าเสียะ-หวุงเต่า' + 'Cao Bá Quát': 'กาว บ๊า กว๊าต', # 'กาวบ๊า-กว๊าต' + 'khuây': 'เคว็ย', + 'khuấy': 'เคว้ย', + + 'bay': 'บัย', + 'các': 'ก๊าก', + 'bác': 'บ๊าก', + 'cha': 'จา', + 'thích': 'ทิก', + 'da': 'ซา', + 'dậy': 'เส่ย', + 'đình': 'ดิ่ญ', + 'gà': 'ก่า', + 'ghe': 'แก', + 'già': 'ส่า', + 'giá': 'ซ้า', + 'họ': 'เหาะ', + 'ho': 'ฮอ', + 'kể': 'เก๋', + 'khi': 'คี', + 'khỉ': 'ขี', + 'lo': 'ลอ', + 'mẹ': 'แหมะ', + 'cằm': 'กั่ม', + 'no': 'นอ', + 'ăn': 'อัน', + 'ngà': 'หง่า', 
+ 'ông': 'อง', + 'nghi': 'งี', + 'nhà': 'หญ่า', + 'sinh': 'ซิญ', + 'nhanh': 'ญัญ', + 'khép': 'แค้ป', + + 'Pháp': 'ฟ้าป', + 'phở': 'เฝอ', + 'quan': 'กวาน', + 'ra': 'ซา', + 'rổ': 'โส', + 'sư': 'ซือ', + 'sả': 'สา', + 'tôi': 'โตย', + 'bút': 'บู๊ต', + 'thu': 'ทู', + 'thả': 'ถา', + 'trà': 'จ่า', + 'vui': 'วูย', + 'xa': 'ซา', + 'xã': 'สา', + 'gì': 'สี่', + + 'màn': 'หม่าน', + 'nhanh': 'ญัญ', + 'ta': 'ตา', + 'mặn': 'หมั่น', + 'tân': 'เติน', + 'em': 'แอม', + 'mẹ': 'แหมะ', + 'lệnh': 'เหล่ญ', + 'bên': 'เบน', + 'tê': 'เต', + 'lịch': 'หลิก', + 'in': 'อีน', + 'đi': 'ดี', + 'Mỹ': 'หมี', + 'xong': 'ซ็อง', + 'con': 'กอน', + 'có': 'ก๊อ', + 'sông': 'ซง', + 'bốn': 'โบ๊น', + 'cô': 'โก', + 'lớn': 'เลิ้น', + 'mở': 'เหมอ', + 'chúc': 'จุ๊ก', + 'núp': 'นู้ป', + 'tủ': 'ตู๋', + 'nhưng': 'ญึง', + 'như': 'ญือ', + 'xoong': 'ซอง', + 'lôông tôông': 'โลง โตง', # 'โลงโตง' + + 'ai': 'อาย', + 'bài': 'บ่าย', + 'ao': 'อาว', + 'cao': 'กาว', + 'nhau': 'เญา', + 'sáu': 'เซ้า', + 'đâu': 'เดิว', + 'cay': 'กัย', + 'cây': 'เก็ย', + 'mèo': 'แหม่ว', + 'đều': 'เด่ว', + 'bia': 'เบีย', + 'tiếng': 'เตี๊ยง', + 'dìu': 'สี่ว', + 'dịu': 'สิ่ว', + 'hoa': 'ฮวา', + 'hoặc': 'ฮหวัก', + 'khoét': 'แคว้ต', + 'nói': 'น้อย', + 'tôi': 'โตย', + 'chơi': 'เจย', + 'mua': 'มัว', + 'đưa': 'เดือ', + 'xuân': 'ซวน', + 'thuê': 'เทว', + 'vui': 'วูย', + 'gửi': 'กื๋ย', + 'buồn': 'บ่วน', + 'thuở': 'ถัว', + 'đường': 'เดื่อง', + 'hưu': 'ฮืว', + 'quý': 'กวี๊', + 'kiểu': 'เกี๋ยว', + 'yếu': 'เอี๊ยว', + 'ngoài': 'งหว่าย', + 'ngoao': 'งวาว', + 'ngoáy': 'งวั้ย', + 'khuấy': 'เคว้ย', + 'khuây': 'เคว็ย', + 'muối': 'ม้วย', + 'tuổi': 'ต๋วย', + 'nuôi': 'นวย', + 'khuya': 'เควีย', + 'duyên': 'เซวียน', + 'khuỷu': 'ขวีว', + 'tươi': 'เตือย', + 'hươu': 'เฮือว', + 'rượu': 'เสื่อว' +} + +COUNTRIES = {"""Hiệp hội các quốc gia Đông Nam Á: + - Nhà nước Brunei Darussalam + - Vương quốc Campuchia + - Cộng hòa Indonesia + - Cộng hoà Dân chủ Nhân dân Lào + - Liên bang Malaysia + - Cộng hòa Liên bang Myanmar + - Cộng hòa Philippines + - Cộng hòa Singapore + - Vương quốc Thái 
Lan + - Cộng hòa Xã hội chủ nghĩa Việt Nam + """: + """เหียป โห่ย ก๊าก กว๊ก ซา ดง นาม อ๊า: + - หญ่า เนื้อก Brunei Darussalam + - เวือง กว๊ก Campuchia + - ก่ง ฮหว่า Indonesia + - ก่ง ฮหว่า เซิน จู๋ เญิน เซิน หล่าว + - เลียน บาง Malaysia + - ก่ง ฮหว่า เลียน บาง Myanmar + - ก่ง ฮหว่า Philippines + - ก่ง ฮหว่า Singapore + - เวือง กว๊ก ท้าย ลาน + - ก่ง ฮหว่า สา โห่ย จู๋ เหงีย เหวียต นาม + """ +} + +class TestSpellWord(unittest.TestCase): + + def test_general(self): + thap_sap = ThapSap('vi') + for case in GENERAL: + self.assertEqual(thap_sap.thap(case), GENERAL[case]) + + def test_rs55_example(self): + thap_sap = ThapSap('vi') + for case in RS55_EXAMPLE: + self.assertEqual(thap_sap.thap(case), RS55_EXAMPLE[case]) + + def test_countries(self): + thap_sap = ThapSap('vi') + for case in COUNTRIES: + self.assertEqual(thap_sap.thap(case), COUNTRIES[case]) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file