Skip to content

Commit

Permalink
add exceptions
Browse files Browse the repository at this point in the history
  • Loading branch information
eroux committed Sep 18, 2022
1 parent 824d9c6 commit eb7c42d
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 19 deletions.
4 changes: 4 additions & 0 deletions TODO.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
## TODO

- option (MST 438) : /p/ between two vowels can be pronounced β

- how is nyon mongs pronounced? [ɲø:moŋ] or [ɲønmoŋ]? MST gives phonology nyönmong but doesn't really give a phonetic equivalent

- བཟས་སོང་། (passé) va être prononcé ཟ་སོང་། (mange passé)
- glottal stop + k-| ?
- study behavior for ambiguous syllables (probably list some as exceptions)
Expand Down
4 changes: 3 additions & 1 deletion bophono/PhonStateMST.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,9 @@ def __init__(self, options={}, pos=None, endOfSentence=False):
self.eatR = options['eatR'] if 'eatR' in options else False
self.eatL = options['eatL'] if 'eatL' in options else False
self.eatP = options['eatP'] if 'eatP' in options else True
self.eatK = options['eatK'] if 'eatK' in options else True
# the g-suffix handling below is rather complex (TODO: finish documenting gsuffixstrategy)
self.gsuffixstrategy = options['gsuffixstrategy'] if 'gsuffixstrategy' in options else "simple"
self.gsuffixchar = options['gsuffixchar'] if 'gsuffixchar' in options else "simple"
self.aiAffixchar = options['aiAffixchar'] if 'aiAffixchar' in options else 'ːɪ̯'
self.aiAffixmonochar = options['aiAffixmonochar'] if 'aiAffixmonochar' in options else self.syllablesepchar+'ɪ'
# does the འི affix in monosyllabic words change the vowel sound (a -> ä) or not (defaults to not)
Expand Down
22 changes: 16 additions & 6 deletions bophono/data/exceptions.csv
Original file line number Diff line number Diff line change
Expand Up @@ -19,24 +19,30 @@
ས་གདན,s+ap|tän
ཁ་གཅོད,kh+ap|cö'
དབ/Cb,w+
རྗེས་འཇུག,c-en|cu'
རྗེས་འཇུག,c-en|cuk
#
# numbers, from NT Annex 1, completed by Drupchen
# NT, spoken Tibetan only
#
བཅུ་གཅིག,c+uk|c+ik
བཅུ་གཉིས,c+uk|ny+i'
ཡོད་རེད,y-o:|re'
#
# numbers, from NT Annex 1 and p. 132, completed by Drupchen
#
#བཅུ་གཅིག,c+uk|c+ik # not indicated in NT, skipping this one and the next one
#བཅུ་གཉིས,c+uk|ny+i'
བཅུ་གསུམ,c+uk|s+um
བཅུ་བཞི/Ci,c+up|sh-
བཅོ་ལྔ/Ca,c+ö|ng+
བཅུ་བདུན,c+up|t-ün
བཅོ་བརྒྱད,c+op|ky-ä'
བཅུ་དགུ/Cu,c+ur|k-
རྩ་བཞི/Ci,ts+ap|sh-
རྩ་བདུན,ts+ap|t-ün
རྩ་བརྒྱད,ts+ap|ky-ä'
སོ་བརྒྱད,s+op|ky-ä'
བཞི་བཅུ/Cu,sh-ip|c+
ལྔ་བཅུ/Cu,ng+ap|c+
ང་བཞི/Ci,ng-ap|sh-
དགུ་བཅུ/Cu,k-u|c+
དགུ་བཅུ/Cu,k-up|c+
# possible according to accent, not always (source: Drupchen)
ལྔ་ཁྲི/Ci,ng+ap|thr+
གོ་བཞི/Ci,k-op|sh-
Expand All @@ -49,6 +55,8 @@
# Sanskrit (TODO: check)
པདྨ/Ca,p+e|m-
ཀརྨ/Ca,k+ar|m-
སེངྒེ/Ce,s+eng|kh-
བཛྲ/Ca,p-en|ts-# ?
#
# NT Annex 1, nasalizer
#
Expand Down Expand Up @@ -92,6 +100,8 @@
ཕྱག་མཛོད,ch+ang|~ts-ö'
ལྷ་མཛེས,lh+an|~ts-e'
ལོ་མཆོད,l-om|~ch+ö'
#མཆོད་འབུལ,ch+öm|p-ul # TODO: in NT p. 380 pul has a tone...
སྒྲ་སྙན,tr-ap|ny+än
#
# THL nasalizations
#
Expand Down Expand Up @@ -131,7 +141,7 @@
ཡ་བཞི/Ci,y-ap|sh-
ས་བཅད,s+ap|c+ä'
#ཡང་ལེ་བེར,y-ang|l-e|w-er
#མཆོད་རྟེན,ch+ör|t+en (only in some dialects)
#མཆོད་རྟེན,ch+ör|t+en (only in some dialects, but also in NT p. 380)
སྡེ་དགེ/Ce,t-er|k-
སློ་བཟང,l+op|s-ang
# not sure about these:
Expand Down
66 changes: 61 additions & 5 deletions demo.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,71 @@
import bophono
import sys

options = {
'aspirateLowTones': True,
# Converter options for a bare transcription: every marker string configured
# here (weak aspiration, high tone, low tone, nasal, syllable separator) is
# empty, and low-tone aspiration and unreleased stops are switched off.
# This dict is handed to bophono.UnicodeToApi for both the MST and the KVP
# converters built further down in this script.
options_fast = {
'weakAspirationChar': '',
'aspirateLowTones': False,
'prefixStrategy': 'always',
'aiAffixchar': 'ː',
'hightonechar':'',
'lowtonechar':'',
'nasalchar': '',
'stopSDMode': "eow",
'useUnreleasedStops': False,
# NOTE(review): PhonStateMST falls back to False for eatR/eatL when the keys
# are absent, so these two entries deliberately override that default.
'eatR': True,
'eatL': True,
'syllablesepchar': ''
}


# Converter options for a detailed transcription: weak aspiration is written
# as '3', low tones are aspirated, unreleased stops are enabled, and high/low
# tones are each marked with a combining character.
# This dict is handed to bophono.UnicodeToApi for the MST converter built
# further down in this script.
options_fastidious = {
'weakAspirationChar': '3',
'aspirateLowTones': True,
'prefixStrategy': 'always',
'aiAffixchar': 'ː',
'hightonechar':'̄',
'lowtonechar':'̱',
'nasalchar': '',
'stopSDMode': "eow",
# NOTE(review): PhonStateMST falls back to True for eatP/eatK when the keys
# are absent, so the two False entries here override that default.
'eatP': False,
'useUnreleasedStops': True,
'eatK': False,
'syllablesepchar': ''
}

def toEnglish(s, mode):
    """Rewrite a phonetic transcription with reader-friendly Latin letters.

    Args:
        s: transcription string to rewrite (anything exposing ``replace``).
        mode: ``"expert"`` keeps extra detail by tagging some symbols with
            digits ("ɔ" -> "o1", "ɣ" -> "g2", the two ring diacritics -> "1");
            any other value simplifies them ("o", "g", rings removed).

    Returns:
        The string after all substitutions have been applied in sequence.
    """
    if mode == "expert":
        mode_rules = (
            ("ɔ", "o1"),
            ("ɣ", "g2"),
            ("̊", "1"),
            ("̥", "1"),
        )
    else:
        mode_rules = (
            ("ɔ", "o"),
            ("ɣ", "g"),
            ("̊", ""),
            ("̥", ""),
        )

    shared_rules = (
        ("ɖ", "ḍ"),
        ("ʈ", "ṭ"),
        ("ɲ", "ny"),
        ("ø", "ö"),
        ("ɟ", "gy"),
        ("c", "ky"),
        ("j", "y"),
        ("ɛ", "è"),
        ("e", "é"),
        ("ŋ", "ṅ"),
        ("tɕ", "ch"),
        ("ɕ", "sh"),
        ("dʑ", "j"),
        ("dz", "z"),
    )

    # The rules are order-sensitive: "y" must be rewritten before any rule
    # that emits a "y", "c"/"j" before the rules that emit "ch"/"j", and the
    # two-character sequences "tɕ"/"dʑ" before or apart from their parts.
    for symbol, spelling in (("y", "ü"),) + mode_rules + shared_rules:
        s = s.replace(symbol, spelling)
    return s

filename = 'tests/demo.txt'
converter = bophono.UnicodeToApi(schema="MST", options = options) # try with CAT for Amdokä
converter_fastidious = bophono.UnicodeToApi(schema="MST", options = options_fastidious)
converter_fast = bophono.UnicodeToApi(schema="MST", options = options_fast)
converter_kvp = bophono.UnicodeToApi(schema="KVP", options = options_fast)
if (len(sys.argv) > 1):
filename = sys.argv[1]
with open(filename, 'r', encoding="utf8") as f:
Expand All @@ -24,12 +79,13 @@
words = line.split()
res = ""
for word in words:
res += converter.get_api(word)+' '
res += converter_kvp.get_api(word)+' '
#res = toEnglish(res, "expert")
print(res)

#Chinese transcription
sentence = "བཀྲ་ཤིས་"
api = converter.get_api(sentence)
api = converter_fast.get_api(sentence)
zh = bophono.apitochinese.api2chinese(api)
print("\n" + sentence + " -> " + api + \
" -> " + zh["zhuyin"] + " -> " + zh["chinese_trad"])
7 changes: 0 additions & 7 deletions doc/a-variations.md

This file was deleted.

0 comments on commit eb7c42d

Please sign in to comment.