Skip to content

Commit

Permalink
ran exp 1 and 2
Browse files Browse the repository at this point in the history
  • Loading branch information
10zinten committed Dec 25, 2024
1 parent 6ee077d commit 870a563
Show file tree
Hide file tree
Showing 9 changed files with 10,166 additions and 17,314 deletions.
Binary file added data/.DS_Store
Binary file not shown.
7 changes: 3 additions & 4 deletions data/chonjuk/chonjuk_trans_align_better_with_commentary.tsv

Large diffs are not rendered by default.

55 changes: 4 additions & 51 deletions experiments/01_zero_shot_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,67 +8,20 @@
# Define the translation prompt
def create_translation_prompt(tibetan_text):
return f"""
# Efficient Tibetan Translation Prompt
You are an expert Tibetan Buddhist text translator. Provide literal English translations following these guidelines:
Translate the following Buddhist Tibetan passage into English: {tibetan_text} English:
## Core Instructions
1. For each Tibetan line:
- Break down key terms and particles
- Note essential grammatical structures
- Enclose final English translation in <t> tags
- Add only critical technical notes
2. Required elements:
- Preserve Sanskrit terms (e.g., dharmakāya)
- Use [brackets] for implied words
- Keep consistent term translations
- Maintain technical precision
## Example Format:
Input:
བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང་།།
Key terms:
བདེ་གཤེགས་(Well-gone One), ཆོས་ཀྱི་སྐུ་(dharmakāya), སྲས་(heirs)
<t>The Well-gone Ones who possess the dharmakāya, together with [their] heirs, and</t>
## Multiple Line Example:
བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང་།།
ཕྱག་འོས་ཀུན་ལའང་གུས་པས་ཕྱག་འཚལ་ཏེ།།
Key terms:
- Line 1: བདེ་གཤེགས་(Well-gone One), ཆོས་ཀྱི་སྐུ་(dharmakāya)
- Line 2: ཕྱག་འཚལ་(bow), གུས་པས་(respectfully)
<t>The Well-gone Ones who possess the dharmakāya, together with [their] heirs, and</t>
<t>To all those worthy of respect, I reverently bow.</t>
Remember:
- Each translation must use <t> tags
- Include only essential analysis
- Focus on accuracy over style
- Note only crucial technical terms
Input:
{tibetan_text}
"""


# Example usage
if __name__ == "__main__":
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument("--debug", action="store_true")
arg_parser.add_argument("--testing", action="store_true")
arg_parser.add_argument("--test", action="store_true")
args = arg_parser.parse_args()

exp_name = Path(__file__).stem
result_fn = Path(__file__).parent / "results.json"

exp = Experiment(
exp_name, claud_sonet_chat, create_translation_prompt, str(result_fn)
)
exp.run_experiment(debug=args.debug, testing=args.testing)
exp = Experiment(exp_name, claud_sonet_chat, create_translation_prompt)
exp.run_experiment(test=args.test)
57 changes: 57 additions & 0 deletions experiments/02_few_shot_translation_basic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import argparse
from pathlib import Path

from experiment import Experiment
from llm import claud_sonet_chat


# Define the translation prompt
def create_translation_prompt(tibetan_text) -> str:
    """Build the few-shot translation prompt for one Tibetan passage.

    The prompt tells the model to act as a Tibetan Buddhist translator, to
    list key terms, and to wrap the final English translation in <t> tags.
    It contains a single worked example (input verse, key terms, tagged
    translation) and ends with a fresh "Input:" section holding the passage
    to translate.

    Args:
        tibetan_text: The passage to translate. It is interpolated verbatim
            into the final "Input:" section of the prompt.

    Returns:
        The complete prompt text.
    """
    return f"""
# Efficient Tibetan Translation Prompt
You are an expert Tibetan Buddhist text translator. Provide literal English translations following these guidelines:
## Core Instructions
1. For each Tibetan line:
- Break down key terms and particles
- Note essential grammatical structures
- Enclose final English translation in <t> tags
- Add only critical technical notes
2. Required elements:
- Preserve Sanskrit terms (e.g., dharmakāya)
- Keep consistent term translations
- Maintain technical precision
## Example Format:
Input:
བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང་། །ཕྱག་འོས་ཀུན་ལའང་གུས་པར་ཕྱག་འཚལ་ཏེ། །བདེ་གཤེགས་སྲས་ཀྱི་སྡོམ་ལ་འཇུག་པ་ནི། །ལུང་བཞིན་མདོར་བསྡུས་ནས་ནི་བརྗོད་པར་བྱ། །
Key terms:
བདེ་གཤེགས་(Sugata), ཆོས་ཀྱི་སྐུ་(dharmakāya), སྲས་(Offspring), ཕྱག་འཚལ་(Prostrate)
<t>
I prostrate with respect to the sugatas, Who have the dharmakaya, and their offspring, And also to all worthy of veneration. I'll teach in brief, according to the scriptures, The way to enter the bodhisattva's vows.</t>
Input:
{tibetan_text}
"""


# Script entry point: run this experiment's prompt through the LLM.
if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser()
    # --test is forwarded unchanged to Experiment.run_experiment(test=...).
    arg_parser.add_argument("--test", action="store_true")
    args = arg_parser.parse_args()

    # The experiment is named after this file (its stem, without ".py").
    exp_name = Path(__file__).stem

    # Experiment takes no results path argument, so none is computed here.
    exp = Experiment(
        exp_name,
        claud_sonet_chat,
        create_translation_prompt,
    )
    exp.run_experiment(test=args.test)
31 changes: 0 additions & 31 deletions experiments/02_simple_zero_shot_translation.py

This file was deleted.

45 changes: 12 additions & 33 deletions experiments/experiment.py
Original file line number Diff line number Diff line change
@@ -1,59 +1,39 @@
import argparse
import json
import re
from pathlib import Path
from typing import List

from tqdm import tqdm


def parse_translations(text: str) -> str:
    """
    Extract the translations enclosed in <t> tags and join them into one string.

    Args:
        text (str): Input text containing translations in <t> tags

    Returns:
        str: The whitespace-stripped contents of every <t>...</t> pair, in
            order of appearance, joined by single spaces. An empty string
            when the text contains no complete <t>...</t> pair.

    Example:
        >>> text = '''
        ... Some text here
        ... <t>First translation</t>
        ... More text
        ... <t>Second translation</t>
        ... '''
        >>> parse_translations(text)
        'First translation Second translation'
    """
    # Non-greedy so that each <t>...</t> pair is matched on its own; DOTALL
    # lets a single translation span several lines.
    pattern = r"<t>(.*?)</t>"

    # Find all matches in the text
    translations = re.findall(pattern, text, re.DOTALL)

    # Strip surrounding whitespace from each translation, then merge them
    # into the single string that callers store as the "translation" value.
    return " ".join(t.strip() for t in translations)


class Experiment:
def __init__(self, exp_name, llm, prompt_generator, result_fn):
def __init__(self, exp_name, llm, prompt_generator):
self.llm = llm
self.exp_name = exp_name
self.prompt_generator = prompt_generator
self.result_fn = result_fn
self.result_fn = Path(__file__).parent / "results.json"

assert self.result_fn.exists(), f"Result file {self.result_fn} does not exist."

def get_source_texts(self):
    """Yield ``(text_id, source_text)`` for every entry in the results file.

    The results file at ``self.result_fn`` is read once, up front. Each
    entry's ``"source"`` value is then yielded together with its key, and
    the iteration is wrapped in ``tqdm`` for a progress bar.

    Yields:
        tuple: ``(text_id, source_text)`` pairs in file order.
    """
    # Read inside a context manager so the handle is closed before the
    # generator starts yielding. The encoding is explicit because the file
    # holds non-ASCII (Tibetan) text.
    with open(self.result_fn, "r", encoding="utf-8") as f:
        results = json.load(f)
    for text_id, data in tqdm(results.items()):
        yield text_id, data["source"]

def save_result(self, text_id, response):
def save_result(self, text_id, prompt, response):
results = json.load(open(self.result_fn, "r"))
if "target_pred" not in results[text_id]:
results[text_id]["target_pred"] = {}
results[text_id]["target_pred"][self.exp_name] = {
"prompt": prompt,
"output": response,
"translation": parse_translations(response),
}
Expand All @@ -66,14 +46,15 @@ def is_translated(self, text_id):
and self.exp_name in results[text_id]["target_pred"]
)

def run_experiment(self, debug=False, testing=False):
def run_experiment(self, test=False):
for source_text_id, source_text in self.get_source_texts():
if self.is_translated(source_text_id):
continue
prompt = self.prompt_generator(source_text)
response = self.llm(prompt)
self.save_result(source_text_id, response)
if debug:
self.save_result(source_text_id, prompt, response)

if test:
print(f"Source text ID: {source_text_id}")
print(f"Source text: {source_text}")
print("-" * 100)
Expand All @@ -82,6 +63,4 @@ def run_experiment(self, debug=False, testing=False):
print(f"Response: {response}")
print("-" * 100)
print(f"Translations: {parse_translations(response)}")

if testing:
break
4 changes: 0 additions & 4 deletions experiments/prompts/zero_shot_translation_prompt.txt

This file was deleted.

27,250 changes: 10,067 additions & 17,183 deletions experiments/results.json

Large diffs are not rendered by default.

31 changes: 23 additions & 8 deletions results.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,33 @@
import json
from pathlib import Path

data_path = Path("data") / "TM0876"
output_fn = data_path / "TM0876.tsv"
data_path = Path("data") / "chonjuk"
align_fn = data_path / "chonjuk_trans_align_better_with_commentary.tsv"
result_fn = "results.json"

assert output_fn.exists(), f"{output_fn} does not exist"
assert align_fn.exists()

results = {}

with output_fn.open("r") as f:
for i, segment_pair in enumerate(f.readlines()):
print(segment_pair)
bo_line, en_line = segment_pair.strip().split("\t")
results[i] = {"source": bo_line, "target_gt": en_line}
# Each row holds: Tibetan source, English translation, and up to two optional
# commentary columns. Decode explicitly as UTF-8 so the Tibetan text does not
# depend on the platform's locale encoding.
for i, segment_pair in enumerate(align_fn.read_text(encoding="utf-8").splitlines()):
    # skip header
    if i == 0:
        continue

    parts = segment_pair.split("\t")
    if not 2 <= len(parts) <= 4:
        # Fail with the offending line instead of an opaque unpacking error.
        raise ValueError(
            f"{align_fn}: line {i + 1} has {len(parts)} tab-separated "
            "column(s), expected 2 to 4"
        )

    # Pad missing commentary columns with empty strings.
    bo_line, en_line, commentary_1, commentary_2 = parts + [""] * (4 - len(parts))
    results[i] = {
        "source": bo_line,
        "target_gt": en_line,
        "commentary_1": commentary_1,
        "commentary_2": commentary_2,
    }

# ensure_ascii=False writes raw Tibetan characters, so the output file must be
# opened as UTF-8; the context manager guarantees it is flushed and closed.
with open(result_fn, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

0 comments on commit 870a563

Please sign in to comment.