Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
cakimpei committed Jun 1, 2022
0 parents commit 4988857
Show file tree
Hide file tree
Showing 22 changed files with 1,520 additions and 0 deletions.
21 changes: 21 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 cakimpei, [email protected]

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
67 changes: 67 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Wunsen

Wunsen provides 'thai-ization' of different languages.

Currently supported languages:

- Japanese (from Hepburn romanization)
- Korean (from Revised Romanization)
- Vietnamese (Latin script)

## Installation

Requirements:

Python >= 3.7

[khanaa](https://github.com/cakimpei/khanaa)

```
pip install wunsen
```

## Usage

```python
from wunsen import ThapSap

# Japanese
thap_ja = ThapSap('ja')
thap_ja.thap('ohayō')
# => 'โอฮาโย'

# without macron
thap_ja_no_macron = ThapSap('ja', input='Hepburn-no diacritic')
thap_ja_no_macron.thap('ohayou')
# => 'โอฮาโย'

# Korean
thap_ko = ThapSap('ko')
thap_ko.thap('annyeonghaseyo')
# => 'อันนย็องฮาเซโย'

# Vietnamese
thap_vi = ThapSap('vi')
thap_vi.thap('xin chào')
# => 'ซีน จ่าว'
```

## Transcription/Transliteration System in Wunsen

Wunsen's output may differ from what the official system prescribes, so please review the results.

- Japanese => หลักเกณฑ์การทับศัพท์ภาษาญี่ปุ่น (สำนักงานราชบัณฑิตยสภา พ.ศ. 2561)
- Korean => หลักเกณฑ์การทับศัพท์ภาษาเกาหลี (ราชบัณฑิตยสถาน พ.ศ. 2555)
- Vietnamese => หลักเกณฑ์การทับศัพท์ภาษาเวียดนาม (ราชบัณฑิตยสถาน พ.ศ. 2555)

### Notes

Wunsen might break syllables in the wrong place:

```python
thap_ja.thap("honya | hon'ya")
# => "โฮเนีย | ฮงยะ"

thap_ko.thap("waengwaeng, maeum | waeng'waeng, ma'eum")
# => "แว็นกแว็ง, แมอุม | แว็งแว็ง, มาอึม"
```
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[build-system]
requires = ["setuptools>=42"]
build-backend = "setuptools.build_meta"
26 changes: 26 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
[metadata]
name = wunsen
version = 0.0.1
author = cakimpei
author_email = [email protected]
description = Thai-ization tool
long_description = file: README.md
long_description_content_type = text/markdown
url = https://github.com/cakimpei/wunsen
project_urls =
Bug Tracker = https://github.com/cakimpei/wunsen/issues
classifiers =
Programming Language :: Python :: 3
License :: OSI Approved :: MIT License
Operating System :: OS Independent

[options]
package_dir =
= src
packages = find:
python_requires = >=3.7
install_requires =
khanaa

[options.packages.find]
where = src
1 change: 1 addition & 0 deletions src/wunsen/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .main import ThapSap
1 change: 1 addition & 0 deletions src/wunsen/japanese/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .ja_thapsap import ThapJa
73 changes: 73 additions & 0 deletions src/wunsen/japanese/ja_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from copy import deepcopy

# Japanese (Hepburn romanization with macrons) -> Thai mapping table.
# Top-level keys are the three syllable parts: 'onset', 'coda', 'vowel'.
ORS61 = {
    # Every onset maps to a two-item list:
    # [form used in the first syllable, form used in later syllables].
    'onset': {
        'b': ['บ', 'บ'],
        'ch': ['ช', 'จ'],
        'd': ['ด', 'ด'],
        'f': ['ฟ', 'ฟ'],
        'g': ['ก', 'ง'],
        'h': ['ฮ', 'ฮ'],
        'j': ['จ', 'จ'],
        'k': ['ค', 'ก'],
        'm': ['ม', 'ม'],
        'n': ['น', 'น'],
        'p': ['พ', 'ป'],
        'r': ['ร', 'ร'],
        's': ['ซ', 'ซ'],
        'sh': ['ช', 'ช'],
        't': ['ท', 'ต'],
        # 'ts' follows a different pattern when the vowel is short.
        'ts': ['ซ', 'ซ'],
        # 'v' is not part of the official rules.
        'v': ['ว', 'ว'],
        'w': ['ว', 'ว'],
        'y': ['ย', 'ย'],
        'z': ['ซ', 'ซ'],
        # The empty onset ('' -> ['อ', 'อ']) is deliberately left out of
        # this table.
    },

    'coda': {
        'f': 'ฟ',
        'k': 'ก',
        'm': 'ม',
        # 'n' may also be rendered as ง or ม.
        'n': 'น',
        'p': 'ป',
        # 's' may also be rendered as ช.
        's': 'ซ',
        't': 'ต',
    },

    'vowel': {
        # Becomes อา when not in the last syllable and there is no coda.
        'a': 'อะ',
        'ā': 'อา',
        'ai': 'ไอ',
        # Alternative form: เอ.
        'e': 'เอะ',
        'ē': 'เอ',
        'ei': 'เอ',
        'i': 'อิ',
        'ī': 'อี',
        # Alternative form: โอ.
        'o': 'โอะ',
        'ō': 'โอ',
        # Alternative form: อู; อึ is used for "tsu".
        'u': 'อุ',
        # อือ is used for "tsuu".
        'ū': 'อู',
        'ya': 'เอีย',
        'yā': 'เอีย',
        # Any coda is dropped after this vowel.
        'yo': 'เอียว',
        'yō': 'เอียว',
        # Any coda is dropped after this vowel.
        'yu': 'อิว',
        'yū': 'อีว',
    },
}

# Variant of ORS61 for Hepburn input written without macrons: long vowels
# are spelled with doubled letters (or "ou") instead. A deep copy is taken
# so that adding these spellings never touches ORS61 itself.
ORS61_NO_DIACRITIC = deepcopy(ORS61)
ORS61_NO_DIACRITIC['vowel'].update(
    aa='อา',
    ee='เอ',
    ii='อี',
    oo='โอ',
    ou='โอ',
    uu='อู',  # อือ is used for "tsuu"
    yaa='เอีย',
    yoo='เอียว',
    you='เอียว',
    yuu='อีว',
)
149 changes: 149 additions & 0 deletions src/wunsen/japanese/ja_thapsap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
import re
from typing import List, Pattern
import unicodedata

from khanaa import SpellWord

from wunsen.splitutils.splitter import SplitSyl, NotInDict
from wunsen.splitutils.exception import NotAvailableSystem
from .ja_mapping import ORS61, ORS61_NO_DIACRITIC

class JaRs61:
    """Transcribe single romanized Japanese words into Thai script.

    The word is split into ``[onset, vowel, coda]`` syllables with
    ``SplitSyl`` and each syllable is spelled out with khanaa's
    ``SpellWord``, using the ORS61 mapping tables.

    While a word is being processed the instance keeps working state in
    ``self.split`` (the adapted syllable list), ``self.last_syl_index``
    and ``self.pref`` (keyword arguments for ``SpellWord``).
    """

    def __init__(self, input: str) -> None:
        """Choose the mapping table that matches the input romanization.

        :param input: ``'Hepburn-no diacritic'`` selects
            ``ORS61_NO_DIACRITIC``; any other value selects ``ORS61``.
        """
        self.ja_dict = ORS61
        if input == 'Hepburn-no diacritic':
            self.ja_dict = ORS61_NO_DIACRITIC

    def thap_kham(self, word: str) -> str:
        """Return the Thai transcription of one romanized word.

        :param word: A single romanized word.
        :return: The Thai spelling, or the word itself (not transcribed)
            when the splitter raises ``NotInDict``.
        """
        try:
            splitter = SplitSyl(self.ja_dict)
            # Compose base letter + combining mark so that macron vowels
            # match the precomposed keys of the mapping table.
            word = unicodedata.normalize('NFC', word)
            self.split = splitter.split_syl(word)
        except NotInDict:
            return word
        self.last_syl_index = len(self.split) - 1
        self.split = self.adapt_split(self.split, self.last_syl_index)
        thai_transcript = []
        for index, syl in enumerate(self.split):
            # select_vowel() may add spelling preferences for this syllable,
            # so start from a clean slate every time.
            self.pref = {}
            syl_info = {
                'onset': self.select_onset(index, syl),
                'vowel': self.select_vowel(index, syl),
                'coda': self.select_coda(index, syl),
            }
            spell = SpellWord(**self.pref)
            thai_transcript.append(spell.spell_out(**syl_info))
        return ''.join(thai_transcript)

    @staticmethod
    def adapt_split(
            old_syl: List[List[str]],
            last_syl_index: int) -> List[List[str]]:
        """Return an adjusted copy of the syllable list.

        The input list and its syllables are left untouched.

        :param old_syl: Syllables as ``[onset, vowel, coda]`` lists.
        :param last_syl_index: Index of the last syllable in ``old_syl``.
        :return: A new list of new ``[onset, vowel, coda]`` lists.
        """
        new_split = []
        for index, syl in enumerate(old_syl):
            # Work on a copy: assigning into the caller's syllable would
            # silently change the list that was passed in.
            new_syl = list(syl)
            # A syllable-initial "y" vowel is a real onset "y" + plain
            # vowel: ya = ยา, not เอีย / เยีย.
            if new_syl[0] == '' and new_syl[1].startswith('y'):
                new_syl = ['y', new_syl[1][1:], new_syl[2]]
            # An open syllable before "ts" takes a "t" coda: mitsu = มิตสึ.
            if (index != last_syl_index
                    and new_syl[2] == ''
                    and old_syl[index + 1][0] == 'ts'):
                new_syl[2] = 't'
            # These vowels drop any coda.
            if (new_syl[1] in ('yo', 'yoo', 'you', 'yu', 'yuu', 'yō', 'yū')
                    and new_syl[2] != ''):
                new_syl[2] = ''
            new_split.append(new_syl)
        return new_split

    def select_onset(self, index: int, syl: List[str]) -> str:
        """Return the Thai initial consonant for a syllable.

        :param index: Position of the syllable in the word.
        :param syl: The ``[onset, vowel, coda]`` syllable.
        """
        if syl[0] == '':
            return 'อ'
        if syl[0] == 'ts' and syl[1] == 'u':
            return 'ส'
        # Each onset entry is [first-syllable form, later-syllable form].
        position = 0 if index == 0 else 1
        return self.ja_dict['onset'][syl[0]][position]

    def select_vowel(self, index: int, syl: List[str]) -> str:
        """Return the Thai vowel form for a syllable.

        May set ``self.pref['vowel_length'] = 'long'`` as a side effect.

        :param index: Position of the syllable in the word.
        :param syl: The ``[onset, vowel, coda]`` syllable.
        """
        if syl[0] == 'ts' and syl[1] in ('u', 'uu', 'ū'):
            # "tsu" and "tsū"/"tsuu" have fixed vowels of their own.
            return 'อึ' if syl[1] == 'u' else 'อือ'
        vowel = self.ja_dict['vowel'][syl[1]]
        # Short a/e/o/u in an open, non-final syllable is spelled long.
        if (syl[1] in ('a', 'e', 'o', 'u')
                and syl[2] == ''
                and index != self.last_syl_index):
            self.pref.update({'vowel_length': 'long'})
        return vowel

    def select_coda(self, index: int, syl: List[str]) -> str:
        """Return the Thai final consonant for a syllable ('' if none).

        :param index: Position of the syllable in the word.
        :param syl: The ``[onset, vowel, coda]`` syllable.
        """
        if syl[2] == '':
            return ''
        if syl[2] == 's':
            return self.select_coda_s(index, syl)
        if syl[2] == 'n':
            return self.select_coda_n(index, syl)
        return self.ja_dict['coda'][syl[2]]

    def select_coda_s(self, index: int, syl: List[str]) -> str:
        """Return the Thai form of an "s" coda.

        It becomes ช when the next syllable starts with "sh"; otherwise
        the table value is used.
        """
        if (index != self.last_syl_index
                and self.split[index + 1][0] == 'sh'):
            return 'ช'
        return self.ja_dict['coda'][syl[2]]

    def select_coda_n(self, index: int, syl: List[str]) -> str:
        """Return the Thai form of an "n" coda.

        The result depends on the onset of the following syllable; a
        word-final "n" is ง.
        """
        if index == self.last_syl_index:
            return 'ง'
        next_onset = self.split[index + 1][0]
        if next_onset in ('', 'g', 'k', 'h', 'f', 'w', 'y'):
            return 'ง'
        if next_onset in ('b', 'm', 'p'):
            return 'ม'
        return self.ja_dict['coda'][syl[2]]

class ThapJa:
    """Transcribe romanized Japanese words found in free text into Thai."""

    # Characters that can be part of a romanized word: ASCII letters,
    # precomposed macron vowels, the spacing macron (U+00AF) and the
    # combining macron (U+0304). The combining macron is needed so that
    # decomposed input ("o" + U+0304) stays in one match and can be
    # NFC-normalized later instead of being cut off after the base letter.
    _WORD_CHARS = 'a-zA-ZāēīōūĀĒĪŌŪ\u00af\u0304'
    # A word is a run of word characters; an apostrophe is allowed only
    # between two word characters (e.g. "hon'ya"). Compiled once here
    # rather than on every thap() call.
    _WORD_PATTERN = re.compile(
        f"([{_WORD_CHARS}]|(?<=[{_WORD_CHARS}])'(?=[{_WORD_CHARS}]))+")

    def __init__(
            self, system: str = 'ORS61',
            input: str = 'Hepburn-macron') -> None:
        """Setting

        :param system: Select thapsap system.
            - 'ORS61' for the Office of the Royal Society (2018/2561)
              system
        :param input: Select input type.
            - 'Hepburn-macron' for Hepburn romanization with macron
              (ex. arigatō)
            - 'Hepburn-no diacritic' for Hepburn romanization
              without diacritic (ex. arigatou)
        :raises NotAvailableSystem: If ``system`` is not a known system.
        """
        if system == 'ORS61':
            self.transcript = JaRs61(input)
        else:
            raise NotAvailableSystem

    def thap(self, text: str) -> str:
        """Return ``text`` with every romanized word transcribed.

        Each match of the word pattern is passed to
        ``self.transcript.thap_kham``; everything else (spaces, digits,
        punctuation, other scripts) is copied through unchanged.

        :param text: Text containing romanized Japanese words.
        :return: The text with each matched word replaced.
        """
        def replace(match):
            return self.transcript.thap_kham(match.group())
        return self._WORD_PATTERN.sub(replace, text)
1 change: 1 addition & 0 deletions src/wunsen/korean/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .ko_thapsap import ThapKo
Loading

0 comments on commit 4988857

Please sign in to comment.