add better chonjuk translation

OpenPecha · Dec 24, 2024 · a935122 · a935122
1 parent d36897d
commit a935122
Show file tree

Hide file tree

Showing 22 changed files with 45,885 additions and 1,638 deletions.
diff --git a/data/align.py b/data/align.py
@@ -0,0 +1,86 @@
+from pathlib import Path
+
+import claudette
+from tqdm import tqdm
+
+data_path = Path(__file__).parent / "chonjuk"
+commentary_fn = data_path / "chonjuk_commentary.csv"
+root_en = data_path / "new_chonjuk-en-root.txt"
+algin_fn = data_path / "chonjuk_trans_com.tsv"
+
+segment_pairs = commentary_fn.read_text().strip().split("\n")[1:]
+verses_en = [
+    verse.replace("\n", " ") for verse in root_en.read_text().strip().split("\n\n")
+]
+
+
+model_name = claudette.models[1]
+
+
+def claud_sonet_chat(prompt):
+    claud_sonet = claudette.Chat(model_name)
+    response = claud_sonet(prompt)
+    return "".join([textblock.text for textblock in response.content])
+
+
+auto_align_prompt = """
+You are a translation expert. Post-correct the Target English text by removing parts which is not the translation of the source Tibetan text. Also greate a glossary terms for the following text.
+
+Do not change the original meaning of the source text.
+Do not split the tibetan text into multiple parts.
+
+Input format:
+[Source Tibetan text]\t[Poor translation Target English text]
+
+Output format:
+[Same Source Tibetan text]\tt[Post-corrected target English text]
+Glossary:
+[Term in Tibetan]\t[Term in English]\t[Definition for English term]
+(multiple lines)
+
+Input:
+{}
+"""
+
+
+def get_rough_alignments():
+    def get_en_verse(verses_en, i):
+        context_len = 1
+        if i < context_len:
+            left_i = 0
+        else:
+            left_i = i - context_len
+
+        return " ".join(verses_en[left_i : i + context_len + 1])
+
+    lines = []
+    for i, segment_pair in enumerate(segment_pairs):
+        root, cmt = segment_pair.split(",")
+        if cmt.strip() == "":
+            continue
+        root_en_verse = get_en_verse(verses_en, i)
+        lines.append(f"{root}\t{root_en_verse}")
+    return lines
+
+
+def parse_output(output):
+    output_lines = output.strip().split("\n")
+    for line in output_lines:
+        if "\t" not in line:
+            continue
+        yield line
+
+
+def align():
+    rough_aligned_lines = get_rough_alignments()
+    with algin_fn.open("w") as f:
+        for i in tqdm(range(len(rough_aligned_lines))):
+            # if i == 3:
+            #     break
+            output = claud_sonet_chat(auto_align_prompt.format(rough_aligned_lines[i]))
+            f.write(output + "\n")
+            f.write("-" * 100 + "\n")
+
+
+if __name__ == "__main__":
+    align()
diff --git a/data/align_line_by_line.py b/data/align_line_by_line.py
@@ -0,0 +1,34 @@
+from pathlib import Path
+
+import claudette
+
+# Data directory that sits next to this script.
+data_path = Path(__file__).parent / "chonjuk"
+# Path of the translation TSV; nothing in the lines shown here reads or
+# writes it yet.
+alignment_fn = data_path / "chonjuk_translation.tsv"
+
+# NOTE(review): the model is picked by its position in claudette's model list,
+# so the selection may change if that list is reordered - confirm intent.
+model_name = claudette.models[1]
+
+
+def claud_sonet_chat(prompt):
+    claud_sonet = claudette.Chat(model_name)
+    response = claud_sonet(prompt)
+    return "".join([textblock.text for textblock in response.content])
+
+
+auto_align_prompt = """
+You are a translation expert. Analyze the following parallel text pairs and realign them to create better translation units while maintaining semantic completeness. Each realigned pair should:
+
+Represent complete thoughts/sentences
+Maintain proper context between source and target
+Have similar length and complexity
+Preserve the original meaning
+
+Input format:
+[Source text in Tibetan]\t[Target text in English]
+(multiple lines)
+Output format:
+[Realigned source text]\t[Realigned target text]
+(one aligned pair per line)"
+
+Input:
+{}
+"""
diff --git a/data/chonjuk/TM0876-bo.txt b/data/chonjuk/TM0876-bo.txt
diff --git a/data/chonjuk/TM0876-en.txt b/data/chonjuk/TM0876-en.txt
diff --git a/data/TM0876/TM0876-bo.txt → data/chonjuk/chonjuk-bo.txt b/data/TM0876/TM0876-bo.txt → data/chonjuk/chonjuk-bo.txt
@@ -1,4 +1,5 @@
-༄༅།། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པ་ཞེས་བྱ་བ་བཞུགས་སོ།། སངས་རྒྱས་དང་བྱང་ཆུབ་སེམས་དཔའ་ཐམས་ཅད་ལ་ཕྱག་འཚལ་ལོ།།
+བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པ་ཞེས་བྱ་བ་བཞུགས་སོ།།
+སངས་རྒྱས་དང་བྱང་ཆུབ་སེམས་དཔའ་ཐམས་ཅད་ལ་ཕྱག་འཚལ་ལོ།།
 བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང་།། ཕྱག་འོས་ཀུན་ལའང་གུས་པས་ཕྱག་འཚལ་ཏེ།།
 བདེ་གཤེགས་སྲས་ཀྱི་སྡོམ་ལ་འཇུག་པ་ནི།། ལུང་བཞིན་མདོར་བསྡུས་ནས་ནི་བརྗོད་པར་བྱ།།
 སྔོན་ཆད་མ་བྱུང་བ་ཡང་འདིར་བརྗོད་མེད།། སྡེབ་སྦྱོར་མཁས་པའང་བདག་ལ་ཡོད་མིན་ཏེ།།
@@ -38,9 +39,8 @@
 ཕན་པར་བསམས་པ་ཙམ་གྱིས་ཀྱང་།། སངས་རྒྱས་མཆོད་ལས་ཁྱད་འཕགས་ན།། སེམས་ཅན་མ་ལུས་ཐམས་ཅད་ཀྱི།། བདེ་དོན་བརྩོན་པ་སྨོས་ཅི་དགོས།།
 སྡུག་བསྔལ་འདོར་འདོད་སེམས་ཡོད་ཀྱང་།། སྡུག་བསྔལ་ཉིད་ལ་མངོན་པར་རྒྱུག།
 བདེ་བ་འདོད་ཀྱང་གཏི་མུག་པས།། རང་གི་བདེ་བ་དགྲ་ལྟར་འཇོམས།།
-གང་ཞིག་བདེ་བས་ཕོངས་པ་དང་།། སྡུག་བསྔལ་མང་ལྡན་དེ་དག་ལ།། བདེ་བ་ཀུན་གྱིས་ཚིམ་པ་དང་།། སྡུག་བསྔལ་ཐམས་ཅད་གཅོད་བྱེད་ཅིང་།། གཏི་མུག་ཀྱང་ནི་སེལ་བྱེད་པ།།
-དེ་དང་དགེ་མཚུངས་ག་ལ་ཡོད།། དེ་འདྲའི་བཤེས་ཀྱང་ག་ལ་ཡོད།།
-བསོད་ནམས་དེ་འདྲའང་ག་ལ་ཡོད།།
+གང་ཞིག་བདེ་བས་ཕོངས་པ་དང་།། སྡུག་བསྔལ་མང་ལྡན་དེ་དག་ལ།། བདེ་བ་ཀུན་གྱིས་ཚིམ་པ་དང་།། སྡུག་བསྔལ་ཐམས་ཅད་གཅོད་བྱེད་ཅིང་།།
+གཏི་མུག་ཀྱང་ནི་སེལ་བྱེད་པ།། དེ་དང་དགེ་མཚུངས་ག་ལ་ཡོད།། དེ་འདྲའི་བཤེས་ཀྱང་ག་ལ་ཡོད།། བསོད་ནམས་དེ་འདྲའང་ག་ལ་ཡོད།།
 ཕན་བཏགས་ལན་ལྡོན་གང་ཡིན་པ།། དེ་ཡང་རེ་ཞིག་བསྔགས་འོས་ན།། མ་བཅོལ་ལེགས་པར་བྱེད་པ་ཡི།། བྱང་ཆུབ་སེམས་དཔའ་སྨོས་ཅི་དགོས།།
 འགྲོ་བ་ཉུང་ཟད་ནར་མའི་ཟས་སྦྱོར་བ།། སྐད་ཅིག་ཟས་ཙམ་སྦྱིན་པར་བྱེད་པ་དང་།། བརྙས་བཅས་ཉིན་ཕྱེད་འགྲངས་པར་བྱེད་པ་ཡང་།། དགེ་བ་བྱེད་པ་ཡིན་ཞེས་སྐྱེ་བོས་བཀུར།།
 སེམས་ཅན་གྲངས་མཐའ་ཡས་ལ་དུས་རིང་དུ།། བདེ་བར་གཤེགས་ཀྱི་བདེ་བ་བླ་ན་མེད།། ཡིད་ལ་བསམ་པ་མཐའ་དག་རྫོགས་བྱེད་པའི།། རྟག་ཏུ་སྦྱིན་པ་ལྟ་ཞིག་སྨོས་ཅི་དགོས།།
@@ -1597,10 +1597,4 @@
 བྱང་ཆུབ་སེམས་དཔའི་དགེ་འདུན་གྱིས།། འགྲོ་བ་བདེ་ལ་སྤྱོད་པར་ཤོག།
 འགྲོ་བའི་སྡུག་བསྔལ་སྨན་གཅིག་པུ།། བདེ་བ་ཐམས་ཅད་འབྱུང་བའི་གནས།། བསྟན་པ་རྙེད་དང་བཀུར་སྟི་དང་།། བཅས་ཏེ་ཡུན་རིང་གནས་གྱུར་ཅིག།
 གང་གི་དྲིན་གྱིས་དགེ་བློ་འབྱུང་།། འཇམ་པའི་དབྱངས་ལ་ཕྱག་འཚལ་ལོ།།
-གང་གི་དྲིན་གྱིས་བདག་དར་བ།། དགེ་བའི་བཤེས་ལའང་བདག་ཕྱག་འཚལ།།
-བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པ་ལས།
-བསྔོ་བ་ཞེས་བྱ་བའི་ལེའུ་སྟེ་བཅུ་པ་འོ།།།།
-བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པ་སློབ་དཔོན་ཤཱནྟི་དེ་བས་མཛད་པ་རྫོགས་སོ།།
-༈རྒྱ་གར་གྱི་མཁན་པོ་སརྦ་ཛྙཱ་དེ་བ་དང་ཞུ་ཆེན་གྱི་ལོ་ཙཱ་བ་བནྡེ་དཔལ་བརྩེགས་ཀྱིས་ཁ་ཆེའི་དཔེ་ལས་བསྒྱུར་ཅིང་ཞུས་ཏེ་གཏན་ལ་ཕབ་པ་ལས།
-སླད་ཀྱི་རྒྱ་གར་གྱི་མཁན་པོ་དྷརྨ་ཤྲཱི་བྷ་དྲ་དང་། ཞུ་ཆེན་གྱི་ལོ་ཙཱ་བ་བནྡེ་རིན་ཆེན་བཟང་པོ་དང་། ཤཱཀྱ་བློ་གྲོས་ཀྱིས་ཡུལ་དབུས་ཀྱི་དཔེ་དང་འགྲེལ་པ་དང་མཐུན་པར་བཅོས་ཤིང་བསྒྱུར་ཏེ་གཏན་ལ་ཕབ་པའོ།།།།
-ཡང་དུས་ཕྱིས་རྒྱ་གར་གྱི་མཁན་པོ་སུ་མ་ཏི་ཀཱིརྟི་དང་ཞུ་ཆེན་གྱི་ལོ་ཙཱ་བ་དགེ་སློང་བློ་ལྡན་ཤེས་རབ་ཀྱིས་དག་པར་བཅོས་ཤིང་བསྒྱུར་ཏེ་གཏན་ལ་ཕབ་པའོ།།།།
+གང་གི་དྲིན་གྱིས་བདག་དར་བ།། དགེ་བའི་བཤེས་ལའང་བདག་ཕྱག་འཚལ།།
diff --git a/data/TM0876/TM0876-en.txt → data/chonjuk/chonjuk-en.txt b/data/TM0876/TM0876-en.txt → data/chonjuk/chonjuk-en.txt
@@ -1,4 +1,5 @@
-The Way of the Bodhisattva The Excellence of Bodhichitta Homage to all Buddhas and Bodhisattvas.
+The Way of the Bodhisattva The Excellence of Bodhichitta
+Homage to all Buddhas and Bodhisattvas.
 To those who go in bliss, the dharmakāya they possess, and all their heirs, To all those worthy of respect, I reverently bow.
 According to the scriptures, I shall now in brief describe The practice of the Bodhisattva discipline.
 Here I shall say nothing that has not been said before, And in the art of prosody I have no skill.
@@ -38,9 +39,8 @@ This pain-dispelling draft, This cause of joy for those who wander through the w
 If the simple thought to be of help to others Exceeds in worth the worship of the Buddhas, What need is there to speak of actual deeds That bring about the weal and benefit of beings?
 For beings long to free themselves from misery, But misery itself they follow and pursue.
 They long for joy, but in their ignorance Destroy it, as they would their foe.
-But those who fill with bliss, All beings destitute of joy, Who cut all pain and suffering away, From those weighed down with misery, Who drive away the darkness of their ignorance—
-What virtue could be matched with theirs? What friend could be compared to them?
-What merit is there similar to this?
+But those who fill with bliss, All beings destitute of joy, Who cut all pain and suffering away, From those weighed down with misery,
+Who drive away the darkness of their ignorance— What virtue could be matched with theirs? What friend could be compared to them? What merit is there similar to this?
 If someone who returns a favor Is deserving of some praise, Why need we speak of Bodhisattvas, Those who do good even unsolicited?
 People praise as virtuous donors Those who with contempt support A few with plain and ordinary food: A moment’s gift that feeds for only half a day.
 What need is there to speak of those Who long bestow on countless multitudes The peerless joy of blissful Buddhahood The ultimate fulfillment of their hopes?
@@ -1597,10 +1597,4 @@ The pains and sorrows of all wandering beings— May they ripen wholly on myself
 And may the virtuous company of Bodhisattvas Always bring about the happiness of beings.
 May the Doctrine, only cure for sorrow, Source of every bliss and happiness, Be blessed with wealth, upheld with veneration, And throughout a vast continuance of time, endure!
 And now to Mañjughoṣha I prostrate, Whose kindness is the wellspring of my good intent.
-And to my virtuous friends I also bow Whose inspiration gave me strength to grow.
-
-
-This completes the Bodhisattvacharyavatara, The Way of the Bodhisattva, which was composed by the master Shāntideva.
-The text was translated, edited, and finalized in Tibetan on the basis of a manuscript from Kashmir by the Indian scholar Sarvajñādeva and the monk, translator, and editor Kawa Peltsek.
-At a later time, this version was revised and finalized in accordance with the version from Magadha, together with its commentary, by the Indian scholar Dharmashrībhadra and the Tibetan monks, translators and editors, Rinchen Zangpo and Shākya Lodrö.
-Still later, it was again revised and finalized by the Indian scholar Sumatikīrti and the monk, translator, and editor Ngok Loden Sherab.
+And to my virtuous friends I also bow Whose inspiration gave me strength to grow.