first commit

cakimpei · Jun 1, 2022 · 4988857 · 4988857
commit 4988857
Show file tree

Hide file tree

Showing 22 changed files with 1,520 additions and 0 deletions.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 cakimpei, [email protected]
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,67 @@
+# Wunsen
+
+Wunsen provides 'Thai-ization' (transcription into Thai script) of different languages.
+
+Currently supported:
+
+- Japanese (from Hepburn romanization)
+- Korean (from Revised Romanization)
+- Vietnamese (Latin script)
+
+## Installation
+
+Requirement:
+
+Python >= 3.7
+
+[khanaa](https://github.com/cakimpei/khanaa)
+
+```
+pip install wunsen
+```
+
+## Usage
+
+```python
+from wunsen import ThapSap
+
+# Japanese
+thap_ja = ThapSap('ja')
+thap_ja.thap('ohayō')
+# => 'โอฮาโย'
+
+# without macron
+thap_ja_no_macron = ThapSap('ja', input='Hepburn-no diacritic')
+thap_ja_no_macron.thap('ohayou')
+# => 'โอฮาโย'
+
+# Korean
+thap_ko = ThapSap('ko')
+thap_ko.thap('annyeonghaseyo')
+# => 'อันนย็องฮาเซโย'
+
+# Vietnamese
+thap_vi = ThapSap('vi')
+thap_vi.thap('xin chào')
+# => 'ซีน จ่าว'
+```
+
+## Transcription/Transliteration System in Wunsen
+
+Wunsen's output may differ from the result intended by the official system, so please review the results.
+
+- Japanese => หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น (สำนักงานราชบัณฑิตยสภา พ.ศ. 2561)
+- Korean => หลักเกณฑ์การทับศัพท์ภาษาเกาหลี (ราชบัณฑิตยสถาน พ.ศ. 2555)
+- Vietnamese => หลักเกณฑ์การทับศัพท์ภาษาเวียดนาม (ราชบัณฑิตยสถาน พ.ศ. 2555)
+
+### Notes
+
+Wunsen might break syllables in the wrong place; an apostrophe can be used to mark the intended syllable boundary:
+
+```python
+thap_ja.thap("honya | hon'ya")
+# => "โฮเนีย | ฮงยะ"
+
+thap_ko.thap("waengwaeng, maeum | waeng'waeng, ma'eum")
+# => "แว็นกแว็ง, แมอุม | แว็งแว็ง, มาอึม"
+```
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools>=42"]
+build-backend = "setuptools.build_meta"
diff --git a/setup.cfg b/setup.cfg
@@ -0,0 +1,26 @@
+[metadata]
+name = wunsen
+version = 0.0.1
+author = cakimpei
+author_email = [email protected]
+description = Thai-ization tool
+long_description = file: README.md
+long_description_content_type = text/markdown
+url = https://github.com/cakimpei/wunsen
+project_urls =
+    Bug Tracker = https://github.com/cakimpei/wunsen/issues
+classifiers =
+    Programming Language :: Python :: 3
+    License :: OSI Approved :: MIT License
+    Operating System :: OS Independent
+
+[options]
+package_dir =
+    = src
+packages = find:
+python_requires = >=3.7
+install_requires =
+    khanaa
+
+[options.packages.find]
+where = src
diff --git a/src/wunsen/__init__.py b/src/wunsen/__init__.py
@@ -0,0 +1 @@
+from .main import ThapSap
diff --git a/src/wunsen/japanese/__init__.py b/src/wunsen/japanese/__init__.py
@@ -0,0 +1 @@
+from .ja_thapsap import ThapJa
diff --git a/src/wunsen/japanese/ja_mapping.py b/src/wunsen/japanese/ja_mapping.py
@@ -0,0 +1,73 @@
+from copy import deepcopy
+
+# Mapping tables for the 'ORS61' Japanese system (Hepburn input with macrons).
+#   'onset': romanized consonant -> [Thai letter used in the first syllable
+#            of a word, Thai letter used in any later syllable]
+#   'coda':  romanized final consonant -> default Thai letter
+#   'vowel': romanized vowel -> Thai vowel form, spelled with 'อ' standing in
+#            the consonant position
+# The trailing comments record context-dependent variants that the
+# transcriber applies in code instead of through these tables.
+ORS61 = {
+
+    'onset': {
+        'b': ['บ', 'บ'],
+        'ch': ['ช', 'จ'],
+        'd': ['ด', 'ด'],
+        'f': ['ฟ', 'ฟ'],
+        'g': ['ก', 'ง'],
+        'h': ['ฮ', 'ฮ'],
+        'j': ['จ', 'จ'],
+        'k': ['ค', 'ก'],
+        'm': ['ม', 'ม'],
+        'n': ['น', 'น'],
+        'p': ['พ', 'ป'],
+        'r': ['ร', 'ร'],
+        's': ['ซ', 'ซ'],
+        'sh': ['ช', 'ช'],
+        't': ['ท', 'ต'],
+        'ts': ['ซ', 'ซ'], # 'ts' + short 'u' is special-cased in code (ส)
+        'v': ['ว', 'ว'], # not in the Royal Society instructions
+        'w': ['ว', 'ว'],
+        'y': ['ย', 'ย'],
+        'z': ['ซ', 'ซ']
+        # An empty onset has no entry here; the transcriber uses 'อ' for it.
+    },
+
+    'coda': {
+        'f': 'ฟ',
+        'k': 'ก',
+        'm': 'ม',
+        'n': 'น', # becomes ง or ม depending on what follows
+        'p': 'ป',
+        's': 'ซ', # becomes ช before an 'sh' onset
+        't': 'ต'
+    },
+
+    'vowel': {
+        'a': 'อะ', # อา when the syllable has no coda and is not the last one
+        'ā': 'อา',
+        'ai': 'ไอ',
+        'e': 'เอะ', # เอ under the same condition as 'a'
+        'ē': 'เอ',
+        'ei': 'เอ',
+        'i': 'อิ',
+        'ī': 'อี',
+        'o': 'โอะ', # โอ under the same condition as 'a'
+        'ō': 'โอ',
+        'u': 'อุ', # อู under the same condition as 'a'; อึ after 'ts'
+        'ū': 'อู', # อือ after 'ts'
+        'ya': 'เอีย',
+        'yā': 'เอีย',
+        'yo': 'เอียว', # any coda of this syllable is dropped
+        'yō': 'เอียว',
+        'yu': 'อิว', # any coda of this syllable is dropped
+        'yū': 'อีว'
+    }
+}
+
+# Variant of ORS61 for Hepburn input written without macrons. The deep copy
+# keeps ORS61 itself unchanged; long vowels spelled with doubled letters
+# (and 'ou') are added on top of the copied entries.
+ORS61_NO_DIACRITIC = deepcopy(ORS61)
+ORS61_NO_DIACRITIC['vowel'].update({
+    'aa': 'อา',
+    'ee': 'เอ',
+    'ii': 'อี',
+    'oo': 'โอ',
+    'ou': 'โอ',
+    'uu': 'อู', # อือ after 'ts'
+    'yaa': 'เอีย',
+    'yoo': 'เอียว',
+    'you': 'เอียว',
+    'yuu': 'อีว'
+})
diff --git a/src/wunsen/japanese/ja_thapsap.py b/src/wunsen/japanese/ja_thapsap.py
@@ -0,0 +1,149 @@
+import re
+from typing import List, Pattern
+import unicodedata
+
+from khanaa import SpellWord
+
+from wunsen.splitutils.splitter import SplitSyl, NotInDict
+from wunsen.splitutils.exception import NotAvailableSystem
+from .ja_mapping import ORS61, ORS61_NO_DIACRITIC
+
+class JaRs61:
+    """Transcribe single romanized Japanese words into Thai (ORS61 tables).
+
+    A word is split into ``[onset, vowel, coda]`` syllables by ``SplitSyl``.
+    Each syllable is adjusted for its neighbours, mapped through the
+    selected table and spelled out with ``khanaa.SpellWord``.
+    """
+
+    # Vowels whose syllable never keeps a coda.
+    _CODALESS_VOWELS = ('yo', 'yoo', 'you', 'yu', 'yuu', 'yō', 'yū')
+    # Short vowels written long in an open, non-final syllable.
+    _LENGTHENED_VOWELS = ('a', 'e', 'o', 'u')
+    # Following onsets that turn a coda 'n' into ง ('' = vowel-initial).
+    _N_TO_NG_ONSETS = ('', 'g', 'k', 'h', 'f', 'w', 'y')
+    # Following onsets that turn a coda 'n' into ม.
+    _N_TO_M_ONSETS = ('b', 'm', 'p')
+
+    def __init__(self, input: str) -> None:
+        """Select the mapping table for the given input style.
+
+        :param input: 'Hepburn-no diacritic' selects the table that also
+            accepts doubled vowels (ex. 'ou'); any other value selects
+            the macron table.
+        """
+        if input == 'Hepburn-no diacritic':
+            self.ja_dict = ORS61_NO_DIACRITIC
+        else:
+            self.ja_dict = ORS61
+
+    def thap_kham(self, word: str) -> str:
+        """Transcribe one romanized word.
+
+        :param word: The word to transcribe.
+        :return: The Thai spelling, or the NFC-normalized word unchanged
+            when the splitter raises ``NotInDict``.
+        """
+        # Compose 'o' + combining macron into 'ō' before the table lookup.
+        word = unicodedata.normalize('NFC', word)
+        try:
+            raw_split = SplitSyl(self.ja_dict).split_syl(word)
+        except NotInDict:
+            return word
+        self.last_syl_index = len(raw_split) - 1
+        self.split = self.adapt_split(raw_split, self.last_syl_index)
+        thai_transcript = []
+        for index, syl in enumerate(self.split):
+            # select_vowel() may put a 'vowel_length' preference in here,
+            # so it is reset for every syllable and read after the selects.
+            self.pref = {}
+            syl_info = {
+                'onset': self.select_onset(index, syl),
+                'vowel': self.select_vowel(index, syl),
+                'coda': self.select_coda(index, syl)
+                }
+            spell = SpellWord(**self.pref)
+            thai_transcript.append(spell.spell_out(**syl_info))
+        return ''.join(thai_transcript)
+
+    @staticmethod
+    def adapt_split(
+            old_syl: List[List[str]],
+            last_syl_index: int) -> List[List[str]]:
+        """Return context-adjusted copies of the split syllables.
+
+        The input lists are not modified.
+
+        :param old_syl: Syllables as ``[onset, vowel, coda]`` lists.
+        :param last_syl_index: Index of the last syllable in ``old_syl``.
+        :return: A new list of adjusted syllables.
+        """
+        new_split = []
+        for index, syl in enumerate(old_syl):
+            # Copy so that the caller's syllable lists stay untouched.
+            new_syl = list(syl)
+            # ya = ยา not เอีย, เยีย: without an onset the glide becomes
+            # the onset. startswith() also tolerates an empty vowel slot.
+            if new_syl[0] == '' and new_syl[1].startswith('y'):
+                new_syl = ['y', new_syl[1][1:], new_syl[2]]
+            # mitsu = มิตสึ: an open syllable before 'ts' gains a 't' coda.
+            if (index != last_syl_index
+                    and new_syl[2] == ''
+                    and old_syl[index+1][0] == 'ts'):
+                new_syl[2] = 't'
+            # These vowels never keep a coda.
+            if new_syl[1] in JaRs61._CODALESS_VOWELS:
+                new_syl[2] = ''
+            new_split.append(new_syl)
+        return new_split
+
+    def select_onset(self, index: int, syl: List[str]) -> str:
+        """Return the Thai onset for the syllable at ``index``."""
+        if syl[0] == '':
+            return 'อ'
+        if syl[0] == 'ts' and syl[1] == 'u':
+            return 'ส'
+        # Element 0 is the word-initial letter, element 1 the letter
+        # used in every later syllable.
+        position = 0 if index == 0 else 1
+        return self.ja_dict['onset'][syl[0]][position]
+
+    def select_vowel(self, index: int, syl: List[str]) -> str:
+        """Return the Thai vowel for the syllable at ``index``.
+
+        May set ``self.pref['vowel_length']`` to ``'long'`` as a side
+        effect.
+        """
+        if syl[0] == 'ts':
+            if syl[1] == 'u':
+                return 'อึ'
+            if syl[1] in ('uu', 'ū'):
+                return 'อือ'
+        vowel = self.ja_dict['vowel'][syl[1]]
+        if (syl[1] in self._LENGTHENED_VOWELS
+                and syl[2] == ''
+                and index != self.last_syl_index):
+            self.pref.update({'vowel_length': 'long'})
+        return vowel
+
+    def select_coda(self, index: int, syl: List[str]) -> str:
+        """Return the Thai coda for the syllable at ``index`` ('' if none)."""
+        if syl[2] == '':
+            return ''
+        if syl[2] == 's':
+            return self.select_coda_s(index, syl)
+        if syl[2] == 'n':
+            return self.select_coda_n(index, syl)
+        return self.ja_dict['coda'][syl[2]]
+
+    def select_coda_s(self, index: int, syl: List[str]) -> str:
+        """Return the Thai letter for a coda 's': ช before 'sh'."""
+        if (index != self.last_syl_index
+                and self.split[index+1][0] == 'sh'):
+            return 'ช'
+        return self.ja_dict['coda'][syl[2]]
+
+    def select_coda_n(self, index: int, syl: List[str]) -> str:
+        """Return the Thai letter for a coda 'n', chosen by what follows."""
+        if index == self.last_syl_index:
+            return 'ง'
+        next_onset = self.split[index+1][0]
+        if next_onset in self._N_TO_NG_ONSETS:
+            return 'ง'
+        if next_onset in self._N_TO_M_ONSETS:
+            return 'ม'
+        return self.ja_dict['coda'][syl[2]]
+
+class ThapJa:
+    """Transcribe the romanized Japanese words found in a text into Thai."""
+
+    # Characters that may form a romanized word: ASCII letters, macron
+    # vowels, the spacing macron (U+00AF, kept for backward compatibility)
+    # and the combining macron (U+0304), so that decomposed input such as
+    # 'o' + U+0304 reaches the transcriber as one word.
+    _WORD_CHARS = 'a-zA-ZāēīōūĀĒĪŌŪ\u00af\u0304'
+    # A word is a run of those characters; an apostrophe counts as part of
+    # the word only when it stands between two of them. Compiled once
+    # instead of on every thap() call.
+    _WORD_PATTERN: Pattern[str] = re.compile(
+        f"([{_WORD_CHARS}]|(?<=[{_WORD_CHARS}])'(?=[{_WORD_CHARS}]))+")
+
+    def __init__(
+            self, system: str = 'ORS61',
+            input: str = 'Hepburn-macron') -> None:
+        """Choose the thapsap system and the romanization style.
+
+        :param system: Select thapsap system.
+            - 'ORS61' for the Office of the Royal Society (2018/2561)
+            system
+
+        :param input: Select input type.
+            - 'Hepburn-macron' for Hepburn romanization with macron
+            (ex. arigatō)
+            - 'Hepburn-no diacritic' for Hepburn romanization
+            without diacritic (ex. arigatou)
+
+        :raises NotAvailableSystem: If ``system`` is not 'ORS61'.
+        """
+        if system != 'ORS61':
+            raise NotAvailableSystem
+        self.transcript = JaRs61(input)
+
+    def thap(self, text: str) -> str:
+        """Replace every romanized word in ``text`` with its transcription.
+
+        :param text: Text that may mix romanized words with anything else.
+        :return: ``text`` with each matched word passed through the
+            selected system; all other characters are kept as they are.
+        """
+        def replace(match):
+            return self.transcript.thap_kham(match.group())
+        return self._WORD_PATTERN.sub(replace, text)
diff --git a/src/wunsen/korean/__init__.py b/src/wunsen/korean/__init__.py
@@ -0,0 +1 @@
+from .ko_thapsap import ThapKo