finish PR

Esukhia · Oct 8, 2024 · 7f33a0e · 7f33a0e
1 parent 9967324
commit 7f33a0e
Show file tree

Hide file tree

Showing 6 changed files with 31 additions and 20 deletions.
diff --git a/bophono/PhonStateKVP.py b/bophono/PhonStateKVP.py
@@ -42,16 +42,16 @@ def doCombineCurEnd(self, endofword, nrc='', nextvowel=''): # nrc = next root co
         self.phon += self.end
 
 
-    def combineWithException(self, exception, tibetanSyllable):
+    def combineWithException(self, exception):
         syllables = exception.split('|')
         for syl in syllables:
             indexplusminus = syl.find('-')
             if indexplusminus == -1:
                 print("invalid exception syllable: "+syl)
                 continue
-            self.combineWith(syl[:indexplusminus], syl[indexplusminus+1:], tibetanSyllable)
+            self.combineWith(syl[:indexplusminus], syl[indexplusminus+1:])
 
-    def combineWith(self, nextroot, nextend, tibetanSyllable):
+    def combineWith(self, nextroot, nextend):
         nextrootconsonant = nextroot
         nextvowel = ''
         self.doCombineCurEnd(False, nextrootconsonant, nextvowel)
@@ -60,8 +60,18 @@ def combineWith(self, nextroot, nextend, tibetanSyllable):
             self.phon += ""
         elif nextrootconsonant.startswith("dz") and self.position > 1:
             self.phon += "z"
-        elif "གྲ" in tibetanSyllable and nextrootconsonant.startswith("tr") and self.position == 2:
-            self.phon += "dr"
+        elif nextrootconsonant.startswith("tdr"):
+            # The KVP rules have the rather puzzling convention of applying different rules
+            # to syllables that have the exact same phonology in Tibetan. They have:
+            # བྲ -> always dra
+            # གྲ -> dra in second position, tra in first position
+            # which doesn't make sense, as Tibetans make no difference between བྲ and གྲ.
+            # We thus have to artificially differentiate them at the phonological level recorded
+            # in roots.csv, by using "tdr" as the root for གྲ.
+            if self.position == 1:
+                self.phon += "tr"
+            else:
+                self.phon += "dr"
         else:
             self.phon += nextrootconsonant
         # decompose multi-syllable ends:
@@ -70,7 +80,7 @@ def combineWith(self, nextroot, nextend, tibetanSyllable):
             self.end = ends[0]
             for endsyl in ends[1:]:
                 # we suppose that roots are always null
-                self.combineWith(endsyl[:1], endsyl[1:], tibetanSyllable)
+                self.combineWith(endsyl[:1], endsyl[1:])
         else:
             self.end = nextend
 

diff --git a/bophono/UnicodeToApi.py b/bophono/UnicodeToApi.py
@@ -70,7 +70,7 @@ def __combine_next_syll_phon(self, tibstr, bindex, state, eindex):
             return -1
         if endinfo['i'] < eindex and self.__is_tib_letter(tibstr[endinfo['i']]) and (tibstr[endinfo['i']] not in self.ignored_chars):
             return -1
-        state.combineWith(rootinfo['d'], endinfo['d'], tibstr[bindex:eindex])
+        state.combineWith(rootinfo['d'], endinfo['d'])
         assert(endinfo['i']>bindex)
         return endinfo['i']
 
@@ -97,7 +97,7 @@ def get_api(self, tibstr, bindex=0, eindex=-1, pos=None, endOfSentence=False):
                 # if it starts with '2:' and we're in the first syllable, we ignore it:
                 if exceptioninfo['d'].startswith('2:'):
                     exceptioninfo['d'] = exceptioninfo['d'][2:]
-                state.combineWithException(exceptioninfo['d'], tibstr[bindex:eindex])
+                state.combineWithException(exceptioninfo['d'])
                 nextidx = self.__get_next_letter_index(tibstr, exceptioninfo['i']+1, eindex)
                 if nextidx == -1:
                     nextidx = eindex

diff --git a/bophono/data/exceptions-kvp.csv b/bophono/data/exceptions-kvp.csv
@@ -35,8 +35,10 @@
 ཨ་གསར,ags-ar
 ས་གདན,sabd-en
 ཁ་གཅོད,khabch-ö
-# dba becomes wa except if exactly dba (no vowel, no suffix)
-དབ*/Cb,w-
+# dba doesn't become wa if it has the vowel i, e or u
+དབེ/Ce,-
+དབུ/Cu,-
+དབི/Ci,-
 # numbers, from NT Annex 1, completed by Drupchen
 བཅུ་གཅིག,chugch-ik
 བཅུ་གཉིས,chugny-i
@@ -159,4 +161,4 @@
 འཕྲོ་འདུ/Cu,tront-
 སྤྲོ་བསྡུ/Co,tront-
 ན་བཟ/Cb,namz-
-མ་འགགས,mank-ak
+མ་འགགས,mank-ak
diff --git a/bophono/data/roots.csv b/bophono/data/roots.csv
@@ -31,7 +31,7 @@
 འཁྲ,~thr+,[']tr+,tr,tr
 ག,kh-,k,g,g
 གྱ,khy-,c,gy,gy
-གྲ,thr-,tr,tr,tr
+གྲ,thr-,tr,tdr,tr
 གླ,l+,l,l,l
 དག*,k-,[r]g,g,g
 དགྱ,ky-,[r]j,gy,gy
@@ -136,7 +136,7 @@
 བྱ,ch-,sh,j,j
 བྲ,thr-,tr,dr,dr
 བླ,l+,l,l,l
-དབ*,+,R,-,-
+དབ*,+,R,w,-
 དབྱ,y+,[r]y,y,y
 དབྲ,r+,,r,r
 འབ*,~p-,[']b,b,b
@@ -233,4 +233,4 @@
 སྟྭ,t+,[s]t,t,t
 སྭ,s+,s+,s,s
 བསྭ,s+,s+,s,s
-ཧྭ,h+,h,h,h
+ཧྭ,h+,h,h,h
diff --git a/tests/test_KVP_corrections.py b/tests/test_KVP_corrections.py
@@ -189,14 +189,14 @@ def test_ratas():
 
 def test_dao_wa():
     assert_equal_phonetics("KVP", "དབ", "dab")
-    assert_equal_phonetics("KVP", "དབོ", "o")
-    assert_equal_phonetics("KVP", "དབོས", "ö")
     assert_equal_phonetics("KVP", "དབུ", "u")
     assert_equal_phonetics("KVP", "དབུས", "ü")
     assert_equal_phonetics("KVP", "དབི", "i")
     assert_equal_phonetics("KVP", "དབེ", "e")
-    assert_equal_phonetics("KVP", "དབང", "ang")
     assert_equal_phonetics("KVP", "དབྱང", "yang")
+    assert_equal_phonetics("KVP", "དབོ", "wo")
+    assert_equal_phonetics("KVP", "དབོས", "wö")
+    assert_equal_phonetics("KVP", "དབང", "wang")
 
 ### Additional Phonetics Instructions:
 
@@ -277,7 +277,6 @@ def test_specific_cases():
     assert_equal_phonetics("KVP", "བར་ཆད", "barche")
     assert_equal_phonetics("KVP", "བར་དོ", "bardo")
 
-
 ### Checking that things work as expected in KVP_corrections.csv
 
 def load_corrections():
@@ -291,4 +290,4 @@ def load_corrections():
 
 @pytest.mark.parametrize("tibetan, expected", corrections)
 def test_phonetics_tool_corrections(tibetan, expected):
-    assert_equal_phonetics("KVP", tibetan, expected)
+    assert_equal_phonetics("KVP", tibetan, expected)
diff --git a/tests/test_KVP_wasur.py b/tests/test_KVP_wasur.py
@@ -22,7 +22,7 @@ def load_wasur_cases():
 def test_cases_without_wasur():
     assert_equal_phonetics("KVP", "མངས", "nge")
     assert_equal_phonetics("KVP", "མགས", "ge")
-    assert_equal_phonetics("KVP", "དབས", "e")
+    assert_equal_phonetics("KVP", "དབས", "we")
     assert_equal_phonetics("KVP", "དངས", "nge")
     assert_equal_phonetics("KVP", "དགས", "ge")
     assert_equal_phonetics("KVP", "དམས", "me")