generated from OpenPecha/new-repo-template
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
10,166 additions
and
17,314 deletions.
There are no files selected for viewing
Binary file not shown.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import argparse | ||
from pathlib import Path | ||
|
||
from experiment import Experiment | ||
from llm import claud_sonet_chat | ||
|
||
|
||
# Define the translation prompt
def create_translation_prompt(tibetan_text):
    """Build the Tibetan-to-English translation prompt for one passage.

    The prompt consists of fixed instructions, one worked example whose
    English translation is wrapped in <t> tags, and a final "Input:" section
    holding the passage to translate.

    Args:
        tibetan_text: Text interpolated verbatim after the final "Input:" line.

    Returns:
        The complete prompt as a single string.
    """
    # The literal's lines start at column 0 on purpose: any indentation inside
    # the triple-quoted string would become part of the prompt.
    return f"""
# Efficient Tibetan Translation Prompt
You are an expert Tibetan Buddhist text translator. Provide literal English translations following these guidelines:
## Core Instructions
1. For each Tibetan line:
- Break down key terms and particles
- Note essential grammatical structures
- Enclose final English translation in <t> tags
- Add only critical technical notes
2. Required elements:
- Preserve Sanskrit terms (e.g., dharmakāya)
- Keep consistent term translations
- Maintain technical precision
## Example Format:
Input:
བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང་། །ཕྱག་འོས་ཀུན་ལའང་གུས་པར་ཕྱག་འཚལ་ཏེ། །བདེ་གཤེགས་སྲས་ཀྱི་སྡོམ་ལ་འཇུག་པ་ནི། །ལུང་བཞིན་མདོར་བསྡུས་ནས་ནི་བརྗོད་པར་བྱ། །
Key terms:
བདེ་གཤེགས་(Sugata), ཆོས་ཀྱི་སྐུ་(dharmakāya), སྲས་(Offspring), ཕྱག་འཚལ་(Prostrate)
<t>
I prostrate with respect to the sugatas, Who have the dharmakaya, and their offspring, And also to all worthy of veneration. I'll teach in brief, according to the scriptures, The way to enter the bodhisattva's vows.</t>
Input:
{tibetan_text}
"""
|
||
|
||
# Example usage
if __name__ == "__main__":
    # Script entry point: run the experiment named after this file.
    # The --test switch is forwarded unchanged as run_experiment's `test` flag.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--test", action="store_true")
    args = arg_parser.parse_args()

    # The experiment is named after this script file (without extension).
    exp_name = Path(__file__).stem
    # NOTE(review): result_fn is computed but not used anywhere in this
    # section -- confirm whether Experiment is expected to receive it.
    result_fn = Path(__file__).parent / "results.json"

    exp = Experiment(
        exp_name,
        claud_sonet_chat,
        create_translation_prompt,
    )
    exp.run_experiment(test=args.test)
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,33 @@ | ||
import json | ||
from pathlib import Path | ||
|
||
# Convert the aligned Tibetan/English TSV (with up to two optional commentary
# columns) into results.json, keyed by the row's 0-based line index.
data_path = Path("data") / "chonjuk"
align_fn = data_path / "chonjuk_trans_align_better_with_commentary.tsv"
result_fn = "results.json"

# Explicit check instead of `assert`, which is stripped under `python -O`.
if not align_fn.exists():
    raise FileNotFoundError(f"{align_fn} does not exist")

results = {}

# Read as UTF-8 explicitly: the file holds Tibetan text and the platform
# default encoding is not guaranteed to be UTF-8.
for i, segment_pair in enumerate(align_fn.read_text(encoding="utf-8").splitlines()):
    # skip header
    if i == 0:
        continue

    parts = segment_pair.split("\t")
    # A row needs source + target and may carry one or two commentary columns.
    if not 2 <= len(parts) <= 4:
        raise ValueError(
            f"{align_fn}: line {i + 1} has {len(parts)} tab-separated "
            "column(s); expected 2 to 4"
        )

    # Missing trailing commentary columns default to empty strings.
    bo_line, en_line, commentary_1, commentary_2 = parts + [""] * (4 - len(parts))
    results[i] = {
        "source": bo_line,
        "target_gt": en_line,
        "commentary_1": commentary_1,
        "commentary_2": commentary_2,
    }

# The context manager closes the file; UTF-8 is required because
# ensure_ascii=False writes the Tibetan characters unescaped.
with open(result_fn, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)