Skip to content

Commit

Permalink
extract glossary from translation output and analyse them
Browse files Browse the repository at this point in the history
  • Loading branch information
10zinten committed Dec 27, 2024
1 parent 6c5a073 commit 7fe3523
Show file tree
Hide file tree
Showing 4 changed files with 29,213 additions and 4 deletions.
42 changes: 42 additions & 0 deletions analyse_translation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import json
from collections import defaultdict

import config

# Load the results file once at import time. The context manager closes the
# handle deterministically, and the explicit UTF-8 encoding keeps the load
# independent of the platform's default encoding.
with open(config.results_fn, "r", encoding="utf-8") as results_file:
    results = json.load(results_file)


def if_en_translation_exists(en_word, label_data):
    """Report whether ``en_word`` is already recorded in ``label_data``.

    Args:
        en_word: English translation to look for.
        label_data: Iterable of dicts, each with a ``"translation"`` key.

    Returns:
        True if some entry's ``"translation"`` equals ``en_word``, else False.
    """
    return any(en_word == entry["translation"] for entry in label_data)


# Tibetan word -> label -> list of distinct English renderings. Each rendering
# records the text id and line where it was first encountered.
word_translations = defaultdict(lambda: defaultdict(list))
for text_id, data in results.items():
    if "glossary" in data:
        for label, glossary in data["glossary"].items():
            for line, line_glossary in glossary.items():
                for bo_word, en_word in line_glossary.items():
                    known = word_translations[bo_word][label]
                    # Keep only the first occurrence of each rendering.
                    if not if_en_translation_exists(en_word, known):
                        known.append(
                            {
                                "translation": en_word,
                                "source": text_id,
                                "line": line,
                            }
                        )


# Print every Tibetan word that has more than one distinct English rendering
# under the chosen label.
label = "target_gt"
for word, by_label in word_translations.items():
    entries = by_label[label]
    if len(entries) <= 1:
        continue
    print(f" {label}:")
    for item in entries:
        print(
            f" - {word} {item['translation']} ({item['source']}, {item['line']})"
        )
3 changes: 3 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from pathlib import Path

# Path of results.json, located in the same directory as this module.
results_fn = Path(__file__).with_name("results.json")
126 changes: 126 additions & 0 deletions extract_glossary.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,131 @@
import json
from pathlib import Path

from tqdm import tqdm

from experiments.llm import claud_sonet_chat

# Path of results.json, located in the same directory as this script.
results_fn = Path(__file__).parent / "results.json"
# Load the results once at import time. read_text opens and closes the file
# itself, and the explicit UTF-8 encoding keeps the load independent of the
# platform's default encoding.
results = json.loads(results_fn.read_text(encoding="utf-8"))


def get_source_and_target(text_id):
    """Return the ``(source, target_gt)`` pair stored for ``text_id``.

    Raises:
        KeyError: If ``text_id`` or either field is missing from ``results``.
    """
    entry = results[text_id]
    return entry["source"], entry["target_gt"]


def get_experiments_translation(text_id):
    """Yield ``(exp_name, translation)`` for each prediction of ``text_id``.

    Predictions are read from ``results[text_id]["target_pred"]``; each value
    must carry a ``"translation"`` key.
    """
    predictions = results[text_id]["target_pred"]
    for exp_name in predictions:
        yield exp_name, predictions[exp_name]["translation"]


def generate_prompt(source, translation):
    """Build the glossary-extraction prompt for one source/translation pair.

    Args:
        source: Tibetan source text.
        translation: English translation of ``source``.

    Returns:
        The prompt string, containing the instructions and an example of the
        expected response format.
    """
    # Every example glossary line uses the same shape
    # ("N.1 Glossary: term (translation), ...", with no trailing comma) so
    # that a response imitating the example stays uniform and parseable.
    return f"""
# Glossary Extraction Prompt
Extract Tibetan to English Glossary from the following translation:
Tibetan: {source} English: {translation}
## Core Instructions
- Find the English term used to each Tibetan term from the English translation
- Ignore Tibetan terms that are not translated to English
- DO NOT create a new translation for missing translation of Tibetan term
- Each Tibetan term should only have one English translation term in the Glossary
- Create a set of glossary for each Tibetan line separately
- Follow the example response format
- Do not include any additional information
## Example Response Format:
1 བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང་།
1.1 Glossary: བདེ་གཤེགས་ (Sugata), ཆོས་ཀྱི་སྐུ་ (dharmakāya), མངའ་ (possess), སྲས་ (offspring)
2 །ཕྱག་འོས་ཀུན་ལའང་གུས་པར་ཕྱག་འཚལ་ཏེ།
2.1 Glossary: ཕྱག་འོས་ (worthy of veneration)
3 །བདེ་གཤེགས་སྲས་ཀྱི་སྡོམ་ལ་འཇུག་པ་ནི།
3.1 Glossary: བདེ་གཤེགས་ (Sugata), སྲས་ (offspring), སྡོམ་ (vows), འཇུག་པ་ (enter)
"""


def _iter_glossary_pairs(items_text):
    """Yield ``(tibetan, english)`` pairs from ``term (translation), ...`` text."""
    pos = 0
    while True:
        open_idx = items_text.find("(", pos)
        if open_idx == -1:
            return
        close_idx = items_text.find(")", open_idx + 1)
        if close_idx == -1:
            # Unterminated parenthesis: treat the rest of the line as the
            # translation instead of failing.
            close_idx = len(items_text)
        # The term is whatever follows the last comma before "(", which drops
        # separators and any preceding term that had no translation.
        tibetan = items_text[pos:open_idx].rsplit(",", 1)[-1].strip()
        english = items_text[open_idx + 1:close_idx].strip()
        if tibetan and english:
            yield tibetan, english
        pos = close_idx + 1


def parse_glossary(text):
    """
    Parse numbered Tibetan lines and their glossaries into a dictionary.

    A line whose leading marker is a plain number ("1", "2") is a text line.
    A line whose marker contains an inner dot ("1.1") is the glossary of the
    most recent text line, with or without a leading "Glossary:" label.
    Malformed input (number-only lines, items without a parenthesised
    translation, trailing commas, glossary lines before any text line) is
    skipped rather than raising.

    Args:
        text (str): Multi-line string containing numbered Tibetan text and glossary
    Returns:
        dict: Dictionary with Tibetan text as keys and glossary dictionaries as values
    """
    result = {}
    current_text = None

    for raw_line in text.splitlines():
        parts = raw_line.split(None, 1)
        if not parts:
            continue  # blank line
        marker = parts[0]
        body = parts[1].strip() if len(parts) > 1 else ""

        if not marker[0].isdigit():
            continue  # not a numbered line

        # Only the marker decides the line type, so a "." inside the text
        # itself cannot misclassify a text line. A trailing dot ("1.") is
        # still a plain number.
        if "." not in marker.rstrip("."):
            if body:
                current_text = body
                result[current_text] = {}
            continue

        if current_text is None:
            continue  # glossary line with no text line to attach to

        # Drop an optional label such as "Glossary:". A colon appearing after
        # the first "(" belongs to a translation, not to a label.
        colon_idx = body.find(":")
        paren_idx = body.find("(")
        if colon_idx != -1 and (paren_idx == -1 or colon_idx < paren_idx):
            body = body[colon_idx + 1:]

        for tibetan, english in _iter_glossary_pairs(body):
            result[current_text][tibetan] = english

    return result


def save_glossary(text_id, glossary, exp_name=None):
    """Store ``glossary`` for ``text_id`` in the in-memory ``results``.

    Args:
        text_id: Key of the text in ``results``.
        glossary: Parsed glossary to store.
        exp_name: Name to store the glossary under. When falsy, the glossary
            is stored under ``"target_gt"``.
    """
    # setdefault creates the "glossary" container on first use for both the
    # ground-truth and the experiment case, so neither raises KeyError.
    glossaries = results[text_id].setdefault("glossary", {})
    glossaries[exp_name if exp_name else "target_gt"] = glossary


def is_glossary_extracted(text_id, exp_name=None):
    """Return whether a glossary is already stored for ``text_id``.

    Args:
        text_id: Key of the text in ``results``.
        exp_name: Name to check. When falsy, the ``"target_gt"`` glossary is
            checked instead.

    Returns:
        True if the glossary exists, False otherwise. A text with no
        ``"glossary"`` entry at all counts as not extracted.
    """
    key = exp_name if exp_name else "target_gt"
    # .get keeps the experiment check from raising KeyError when the text has
    # no "glossary" entry yet.
    return key in results[text_id].get("glossary", {})


def extract_glossary():
    """Extract any missing glossaries for every text and persist the results.

    For each text in ``results``, a glossary is extracted for the ground-truth
    translation and for each entry yielded by ``get_experiments_translation``,
    skipping those already stored. Each glossary is obtained by passing the
    generated prompt to ``claud_sonet_chat`` and parsing its output.

    The results file is rewritten after each text that gained a glossary, so
    an interrupted run keeps the work already done.
    """
    for text_id in tqdm(results):
        source, target = get_source_and_target(text_id)
        changed = False

        if not is_glossary_extracted(text_id):
            output = claud_sonet_chat(generate_prompt(source, target))
            save_glossary(text_id, parse_glossary(output))
            changed = True

        # Checked independently of the ground-truth glossary, so experiments
        # missing a glossary are still processed for already-handled texts.
        for exp_name, llm_translation in get_experiments_translation(text_id):
            if is_glossary_extracted(text_id, exp_name=exp_name):
                continue
            output = claud_sonet_chat(generate_prompt(source, llm_translation))
            save_glossary(text_id, parse_glossary(output), exp_name=exp_name)
            changed = True

        if changed:
            # ensure_ascii=False writes raw non-ASCII text, so the file is
            # opened as UTF-8 explicitly; the context manager closes it.
            with open(results_fn, "w", encoding="utf-8") as results_file:
                json.dump(results, results_file, indent=2, ensure_ascii=False)


if __name__ == "__main__":
extract_glossary()
Loading

0 comments on commit 7fe3523

Please sign in to comment.