Ran experiments 1 and 2

OpenPecha · Dec 25, 2024 · 870a563
1 parent 6ee077d
commit 870a563
Show file tree

Hide file tree

Showing 9 changed files with 10,166 additions and 17,314 deletions.
diff --git a/data/.DS_Store b/data/.DS_Store
diff --git a/data/chonjuk/chonjuk_trans_align_better_with_commentary.tsv b/data/chonjuk/chonjuk_trans_align_better_with_commentary.tsv
diff --git a/experiments/01_zero_shot_translation.py b/experiments/01_zero_shot_translation.py
@@ -8,67 +8,20 @@
 # Define the translation prompt
 def create_translation_prompt(tibetan_text):
     return f"""
-# Efficient Tibetan Translation Prompt
-
-You are an expert Tibetan Buddhist text translator. Provide literal English translations following these guidelines:
+Translate the following Buddhist Tibetan passage into English: {tibetan_text} English:
 
 ## Core Instructions
-1. For each Tibetan line:
-   - Break down key terms and particles
-   - Note essential grammatical structures
    - Enclose final English translation in <t> tags
-   - Add only critical technical notes
-
-2. Required elements:
-   - Preserve Sanskrit terms (e.g., dharmakāya)
-   - Use [brackets] for implied words
-   - Keep consistent term translations
-   - Maintain technical precision
-
-## Example Format:
-
-Input:
-བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང་།།
-
-Key terms:
-བདེ་གཤེགས་(Well-gone One), ཆོས་ཀྱི་སྐུ་(dharmakāya), སྲས་(heirs)
-
-<t>The Well-gone Ones who possess the dharmakāya, together with [their] heirs, and</t>
-
-## Multiple Line Example:
-
-བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང་།།
-ཕྱག་འོས་ཀུན་ལའང་གུས་པས་ཕྱག་འཚལ་ཏེ།།
-
-Key terms:
-- Line 1: བདེ་གཤེགས་(Well-gone One), ཆོས་ཀྱི་སྐུ་(dharmakāya)
-- Line 2: ཕྱག་འཚལ་(bow), གུས་པས་(respectfully)
-
-<t>The Well-gone Ones who possess the dharmakāya, together with [their] heirs, and</t>
-<t>To all those worthy of respect, I reverently bow.</t>
-
-Remember:
-- Each translation must use <t> tags
-- Include only essential analysis
-- Focus on accuracy over style
-- Note only crucial technical terms
-
-Input:
-{tibetan_text}
 """
 
 
 # Example usage
 if __name__ == "__main__":
     arg_parser = argparse.ArgumentParser()
-    arg_parser.add_argument("--debug", action="store_true")
-    arg_parser.add_argument("--testing", action="store_true")
+    arg_parser.add_argument("--test", action="store_true")
     args = arg_parser.parse_args()
 
     exp_name = Path(__file__).stem
-    result_fn = Path(__file__).parent / "results.json"
 
-    exp = Experiment(
-        exp_name, claud_sonet_chat, create_translation_prompt, str(result_fn)
-    )
-    exp.run_experiment(debug=args.debug, testing=args.testing)
+    exp = Experiment(exp_name, claud_sonet_chat, create_translation_prompt)
+    exp.run_experiment(test=args.test)
diff --git a/experiments/02_few_shot_translation_basic.py b/experiments/02_few_shot_translation_basic.py
@@ -0,0 +1,57 @@
+import argparse
+from pathlib import Path
+
+from experiment import Experiment
+from llm import claud_sonet_chat
+
+
+# Define the translation prompt
+def create_translation_prompt(tibetan_text):
+    return f"""
+# Efficient Tibetan Translation Prompt
+
+You are an expert Tibetan Buddhist text translator. Provide literal English translations following these guidelines:
+
+## Core Instructions
+1. For each Tibetan line:
+   - Break down key terms and particles
+   - Note essential grammatical structures
+   - Enclose final English translation in <t> tags
+   - Add only critical technical notes
+
+2. Required elements:
+   - Preserve Sanskrit terms (e.g., dharmakāya)
+   - Keep consistent term translations
+   - Maintain technical precision
+
+## Example Format:
+
+Input:
+བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང་། །ཕྱག་འོས་ཀུན་ལའང་གུས་པར་ཕྱག་འཚལ་ཏེ། །བདེ་གཤེགས་སྲས་ཀྱི་སྡོམ་ལ་འཇུག་པ་ནི། །ལུང་བཞིན་མདོར་བསྡུས་ནས་ནི་བརྗོད་པར་བྱ། །
+
+Key terms:
+བདེ་གཤེགས་(Sugata), ཆོས་ཀྱི་སྐུ་(dharmakāya), སྲས་(Offspring), ཕྱག་འཚལ་(Prostrate)
+
+<t>
+I prostrate with respect to the sugatas, Who have the dharmakaya, and their offspring, And also to all worthy of veneration. I'll teach in brief, according to the scriptures, The way to enter the bodhisattva's vows.</t>
+
+Input:
+{tibetan_text}
+"""
+
+
+# Example usage
+if __name__ == "__main__":
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument("--test", action="store_true")
+    args = arg_parser.parse_args()
+
+    exp_name = Path(__file__).stem
+    result_fn = Path(__file__).parent / "results.json"
+
+    exp = Experiment(
+        exp_name,
+        claud_sonet_chat,
+        create_translation_prompt,
+    )
+    exp.run_experiment(test=args.test)
diff --git a/experiments/02_simple_zero_shot_translation.py b/experiments/02_simple_zero_shot_translation.py
diff --git a/experiments/experiment.py b/experiments/experiment.py
@@ -1,59 +1,39 @@
+import argparse
 import json
 import re
+from pathlib import Path
 from typing import List
 
 from tqdm import tqdm
 
 
 def parse_translations(text: str) -> List[str]:
-    """
-    Extract translations enclosed in <t> tags from text.
-
-    Args:
-        text (str): Input text containing translations in <t> tags
-
-    Returns:
-        List[str]: List of extracted translations
-
-    Example:
-        >>> text = '''
-        Some text here
-        <t>First translation</t>
-        More text
-        <t>Second translation</t>
-        '''
-        >>> parse_translations(text)
-        ['First translation', 'Second translation']
-    """
-    # Pattern matches anything between <t> and </t>, non-greedy
     pattern = r"<t>(.*?)</t>"
-
-    # Find all matches in the text
     translations = re.findall(pattern, text, re.DOTALL)
-
-    # Strip whitespace from each translation
     translations = [t.strip() for t in translations]
-
     return " ".join(translations)
 
 
 class Experiment:
-    def __init__(self, exp_name, llm, prompt_generator, result_fn):
+    def __init__(self, exp_name, llm, prompt_generator):
         self.llm = llm
         self.exp_name = exp_name
         self.prompt_generator = prompt_generator
-        self.result_fn = result_fn
+        self.result_fn = Path(__file__).parent / "results.json"
+
+        assert self.result_fn.exists(), f"Result file {self.result_fn} does not exist."
 
     def get_source_texts(self):
         results = json.load(open(self.result_fn, "r"))
         for text_id, data in tqdm(results.items()):
             yield text_id, data["source"]
 
-    def save_result(self, text_id, response):
+    def save_result(self, text_id, prompt, response):
         results = json.load(open(self.result_fn, "r"))
         if "target_pred" not in results[text_id]:
             results[text_id]["target_pred"] = {}
         results[text_id]["target_pred"][self.exp_name] = {
+            "prompt": prompt,
             "output": response,
             "translation": parse_translations(response),
         }
@@ -66,14 +46,15 @@ def is_translated(self, text_id):
             and self.exp_name in results[text_id]["target_pred"]
         )
 
-    def run_experiment(self, debug=False, testing=False):
+    def run_experiment(self, test=False):
         for source_text_id, source_text in self.get_source_texts():
             if self.is_translated(source_text_id):
                 continue
             prompt = self.prompt_generator(source_text)
             response = self.llm(prompt)
-            self.save_result(source_text_id, response)
-            if debug:
+            self.save_result(source_text_id, prompt, response)
+
+            if test:
                 print(f"Source text ID: {source_text_id}")
                 print(f"Source text: {source_text}")
                 print("-" * 100)
@@ -82,6 +63,4 @@ def run_experiment(self, debug=False, testing=False):
                 print(f"Response: {response}")
                 print("-" * 100)
                 print(f"Translations: {parse_translations(response)}")
-
-            if testing:
                 break
diff --git a/experiments/prompts/zero_shot_translation_prompt.txt b/experiments/prompts/zero_shot_translation_prompt.txt
diff --git a/experiments/results.json b/experiments/results.json
diff --git a/results.py b/results.py
@@ -1,18 +1,33 @@
 import json
 from pathlib import Path
 
-data_path = Path("data") / "TM0876"
-output_fn = data_path / "TM0876.tsv"
+data_path = Path("data") / "chonjuk"
+align_fn = data_path / "chonjuk_trans_align_better_with_commentary.tsv"
 result_fn = "results.json"
 
-assert output_fn.exists(), f"{output_fn} does not exist"
+assert align_fn.exists()
 
 results = {}
 
-with output_fn.open("r") as f:
-    for i, segment_pair in enumerate(f.readlines()):
-        print(segment_pair)
-        bo_line, en_line = segment_pair.strip().split("\t")
-        results[i] = {"source": bo_line, "target_gt": en_line}
+for i, segment_pair in enumerate(align_fn.read_text().splitlines()):
+    # skip header
+    if i == 0:
+        continue
+
+    parts = segment_pair.split("\t")
+    if len(parts) == 2:
+        bo_line, en_line = parts
+        commentry_1, commentry_2 = "", ""
+    elif len(parts) == 3:
+        bo_line, en_line, commentry_1 = parts
+        commentry_2 = ""
+    else:
+        bo_line, en_line, commentry_1, commentry_2 = parts
+    results[i] = {
+        "source": bo_line,
+        "target_gt": en_line,
+        "commentary_1": commentry_1,
+        "commentary_2": commentry_2,
+    }
 
 json.dump(results, open(result_fn, "w"), ensure_ascii=False, indent=2)