add translation similarity analysis reports

OpenPecha · Jan 3, 2025 · ca8c18e · ca8c18e
1 parent d5b9848
commit ca8c18e
Show file tree

Hide file tree

Showing 12 changed files with 9,472 additions and 13,290 deletions.
diff --git a/analyse/analysis.ipynb b/analyse/analysis.ipynb
@@ -0,0 +1,298 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "import csv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "translations_fn = Path.cwd().parent / \"reports\" / \"translations_comparison.csv\"\n",
+    "assert translations_fn.exists(), translations_fn"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The CSV holds Tibetan text: decode it as UTF-8 rather than the platform default\n",
+    "with open(translations_fn, 'r', newline='', encoding='utf-8') as file:\n",
+    "    translations = list(csv.DictReader(file))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(839,\n",
+       " {'Source': 'བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང་། །ཕྱག་འོས་ཀུན་ལའང་གུས་པར་ཕྱག་འཚལ་ཏེ། །བདེ་གཤེགས་སྲས་ཀྱི་སྡོམ་ལ་འཇུག་པ་ནི། །ལུང་བཞིན་མདོར་བསྡུས་ནས་ནི་བརྗོད་པར་བྱ། །',\n",
+       "  'target_gt': \"I prostrate with respect to the sugatas, Who have the dharmakaya, and their offspring, And also to all worthy of veneration. I'll teach in brief, according to the scriptures, The way to enter the bodhisattva's vows.\",\n",
+       "  '01_zero_shot': \"I respectfully bow to the Dharmakaya of the Sugatas and their spiritual heirs,\\nAnd to all who are worthy of veneration.\\nI will now briefly explain, in accordance with scripture,\\nHow to enter the vows of the Sugatas' heirs.\",\n",
+       "  '03_few_shot_advance': \"I respectfully prostrate to the Sugatas who possess the dharmakāya along with their offspring,\\nAnd also to all who are worthy of veneration.\\nI shall explain in brief, according to the scriptures,\\nThe way to enter the vows of the Sugatas' offspring.\",\n",
+       "  '02_few_shot': \"I respectfully prostrate to the Sugatas who possess the dharmakaya, along with their offspring,\\nAnd to all who are worthy of veneration.\\nI will now briefly explain, in accordance with scripture,\\nHow to enter into the vows of the Sugatas' offspring.\",\n",
+       "  '04_commentary_assisted': 'I respectfully bow to the Sugatas who possess the Dharma body, together with their Sons,\\nAnd to all who are worthy of veneration.\\nI shall explain how to engage in the vows of the Sons of the Sugatas,\\nIn brief, according to the scriptures.',\n",
+       "  '06_glossary_assisted': \"I respectfully prostrate to the Sugatas possessing the Dharmakaya along with their spiritual heirs,\\nAnd to all who are worthy of reverence.\\nI will briefly explain, in accordance with scripture,\\nHow to enter into the vows of the Sugatas' spiritual heirs.\",\n",
+       "  '07_commentary_and_glossary_assisted': \"To Sugatas with Dharmakaya, their offspring, and all\\nWorthy of homage, I bow down with deep respect.\\nThe vows of the Sugatas' heirs, how to enter them,\\nI'll explain briefly, in accord with the scriptures.\"})"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(translations), translations[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Reference: The quick brown fox jumps over the lazy dog.\n",
+      "\n",
+      "Top 3 Most Similar Examples:\n",
+      "\n",
+      "1. Text: The quick brown fox jumps over the lazy dog.\n",
+      "   Scores: BLEU=1.000, chrF++=1.000, TER=0.000\n",
+      "\n",
+      "2. Text: The quick brown fox jumps over a lazy dog.\n",
+      "   Scores: BLEU=0.658, chrF++=0.842, TER=0.111\n",
+      "\n",
+      "3. Text: The agile brown fox jumps past the lazy dog.\n",
+      "   Scores: BLEU=0.393, chrF++=0.620, TER=0.222\n",
+      "\n",
+      "Top 3 Moderate Similarity Examples:\n",
+      "\n",
+      "1. Text: The swift fox leaped above the drowsy hound.\n",
+      "   Scores: BLEU=0.060, chrF++=0.198, TER=0.667\n",
+      "\n",
+      "2. Text: A brown fox and a lazy dog in the park.\n",
+      "   Scores: BLEU=0.117, chrF++=0.370, TER=0.889\n",
+      "\n",
+      "3. Text: The rapid brown fox hops across the sleepy dog.\n",
+      "   Scores: BLEU=0.131, chrF++=0.355, TER=0.444\n",
+      "\n",
+      "Top 3 Most Dissimilar Examples:\n",
+      "\n",
+      "1. Text: A cat chases a mouse.\n",
+      "   Scores: BLEU=0.042, chrF++=0.056, TER=1.000\n",
+      "\n",
+      "2. Text: Animals run in the field.\n",
+      "   Scores: BLEU=0.050, chrF++=0.100, TER=0.889\n",
+      "\n",
+      "3. Text: Birds fly in the blue sky.\n",
+      "   Scores: BLEU=0.051, chrF++=0.099, TER=0.889\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sacrebleu\n",
+    "from sacrebleu.metrics import BLEU, CHRF, TER\n",
+    "from typing import List, Tuple, Dict\n",
+    "\n",
+    "class TextSimilarityAnalyzer:\n",
+    "    \"\"\"\n",
+    "    Rank candidate texts against a reference with sacrebleu's BLEU, chrF++ and TER.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self):\n",
+    "        self.bleu = BLEU()\n",
+    "        self.chrf = CHRF(word_order=2)  # chrF++ uses word_order=2\n",
+    "        self.ter = TER()\n",
+    "\n",
+    "    def calculate_metrics(self, reference: str, hypothesis: str) -> Dict[str, float]:\n",
+    "        \"\"\"\n",
+    "        Calculate BLEU, chrF++, and TER scores for a single pair of texts.\n",
+    "\n",
+    "        Args:\n",
+    "            reference: Reference text\n",
+    "            hypothesis: Hypothesis text to compare against reference\n",
+    "\n",
+    "        Returns:\n",
+    "            Dictionary with 'bleu', 'chrf' and 'ter', each the sacrebleu score\n",
+    "            divided by 100. BLEU and chrF++ lie in 0-1; TER is an edit rate\n",
+    "            (edits / reference length) and is therefore not capped at 1.\n",
+    "        \"\"\"\n",
+    "        # sacrebleu takes a list of hypotheses and a list of reference streams\n",
+    "        hyps = [hypothesis]\n",
+    "        refs = [[reference]]\n",
+    "\n",
+    "        return {\n",
+    "            'bleu': self.bleu.corpus_score(hyps, refs).score / 100,\n",
+    "            'chrf': self.chrf.corpus_score(hyps, refs).score / 100,\n",
+    "            'ter': self.ter.corpus_score(hyps, refs).score / 100\n",
+    "        }\n",
+    "\n",
+    "    def find_n_examples_by_similarity(self,\n",
+    "                                    reference: str,\n",
+    "                                    candidates: List[str],\n",
+    "                                    n: int = 3,\n",
+    "                                    weights: Dict[str, float] = None) -> Dict:\n",
+    "        \"\"\"\n",
+    "        Find n most similar, n most dissimilar, and n moderate similarity candidates.\n",
+    "\n",
+    "        Args:\n",
+    "            reference: Reference text\n",
+    "            candidates: List of candidate texts to compare\n",
+    "            n: Number of examples to return for each category; capped at\n",
+    "                len(candidates) // 3 so the three categories never overlap\n",
+    "            weights: Optional dictionary with a weight for each of 'bleu', 'chrf'\n",
+    "                and 'ter' (default: equal weights)\n",
+    "\n",
+    "        Returns:\n",
+    "            Dictionary containing most_similar, moderate, and most_dissimilar lists.\n",
+    "            Each entry holds 'text', 'weighted_score' and the raw 'metrics';\n",
+    "            most_similar is ordered best first, the other two lowest score first.\n",
+    "\n",
+    "        Raises:\n",
+    "            KeyError: Once a candidate is scored, if weights lacks 'bleu', 'chrf' or 'ter'\n",
+    "        \"\"\"\n",
+    "        if weights is None:\n",
+    "            weights = {'bleu': 1/3, 'chrf': 1/3, 'ter': 1/3}\n",
+    "\n",
+    "        # Ensure n doesn't exceed one-third of the candidates and is never negative\n",
+    "        n = max(0, min(n, len(candidates) // 3))\n",
+    "\n",
+    "        ranked = []\n",
+    "        for candidate in candidates:\n",
+    "            metrics = self.calculate_metrics(reference, candidate)\n",
+    "\n",
+    "            # TER has no upper bound, so clamp its inverted value at 0; otherwise a\n",
+    "            # hypothesis with TER > 1 would drag the weighted score below zero.\n",
+    "            ter_similarity = max(0.0, 1 - metrics['ter'])\n",
+    "            weighted_score = (\n",
+    "                weights['bleu'] * metrics['bleu'] +\n",
+    "                weights['chrf'] * metrics['chrf'] +\n",
+    "                weights['ter'] * ter_similarity\n",
+    "            )\n",
+    "\n",
+    "            ranked.append({\n",
+    "                'text': candidate,\n",
+    "                'weighted_score': weighted_score,\n",
+    "                'metrics': metrics\n",
+    "            })\n",
+    "\n",
+    "        # Ascending by weighted score; the sort is stable, so ties keep input order\n",
+    "        ranked.sort(key=lambda entry: entry['weighted_score'])\n",
+    "\n",
+    "        total = len(ranked)\n",
+    "        # Start of the n-entry window centred in the sorted list\n",
+    "        middle_start = (total - n) // 2\n",
+    "\n",
+    "        return {\n",
+    "            # Highest scores, best first (total - n avoids the ranked[-0:] pitfall)\n",
+    "            'most_similar': ranked[total - n:][::-1],\n",
+    "            'moderate': ranked[middle_start:middle_start + n],\n",
+    "            # Lowest scores, worst first\n",
+    "            'most_dissimilar': ranked[:n]\n",
+    "        }\n",
+    "\n",
+    "# Example usage\n",
+    "if __name__ == \"__main__\":\n",
+    "    # Reference sentence plus a pool of candidates of varying closeness to it\n",
+    "    reference = \"The quick brown fox jumps over the lazy dog.\"\n",
+    "    candidates = [\n",
+    "        \"The quick brown fox jumps over the lazy dog.\",  # Identical\n",
+    "        \"The fast brown fox leaps over the tired dog.\",  # Similar\n",
+    "        \"A dog sleeps on the ground.\",  # Very different\n",
+    "        \"The quick brown fox jumps over a lazy dog.\",  # Minor difference\n",
+    "        \"The rapid brown fox hops across the sleepy dog.\",  # Somewhat similar\n",
+    "        \"A cat chases a mouse.\",  # Very different\n",
+    "        \"The brown fox quickly jumped over lazy dogs.\",  # Moderately similar\n",
+    "        \"Dogs are lazy animals.\",  # Very different\n",
+    "        \"A fox and a dog play in the garden.\",  # Moderate\n",
+    "        \"The agile brown fox jumps past the lazy dog.\",  # Similar\n",
+    "        \"Birds fly in the blue sky.\",  # Very different\n",
+    "        \"The quick brown fox jumped over lazy dogs.\",  # Similar\n",
+    "        \"Animals run in the field.\",  # Very different\n",
+    "        \"A brown fox and a lazy dog in the park.\",  # Moderate\n",
+    "        \"The swift fox leaped above the drowsy hound.\",  # Moderate\n",
+    "    ]\n",
+    "\n",
+    "    # Optional: custom weights for metrics\n",
+    "    weights = {\n",
+    "        'bleu': 0.4,\n",
+    "        'chrf': 0.4,\n",
+    "        'ter': 0.2,\n",
+    "    }\n",
+    "\n",
+    "    # Get 3 examples for each category\n",
+    "    analyzer = TextSimilarityAnalyzer()\n",
+    "    results = analyzer.find_n_examples_by_similarity(\n",
+    "        reference,\n",
+    "        candidates,\n",
+    "        n=3,\n",
+    "        weights=weights,\n",
+    "    )\n",
+    "\n",
+    "    # (heading, result key) pairs in display order; every heading after the\n",
+    "    # first starts with a newline so a blank line separates the sections.\n",
+    "    sections = [\n",
+    "        (\"Top 3 Most Similar Examples:\", 'most_similar'),\n",
+    "        (\"\\nTop 3 Moderate Similarity Examples:\", 'moderate'),\n",
+    "        (\"\\nTop 3 Most Dissimilar Examples:\", 'most_dissimilar'),\n",
+    "    ]\n",
+    "\n",
+    "    print(f\"Reference: {reference}\\n\")\n",
+    "\n",
+    "    # Each section: its heading, then one numbered entry per selected candidate\n",
+    "    for heading, key in sections:\n",
+    "        print(heading)\n",
+    "        for rank, entry in enumerate(results[key], 1):\n",
+    "            scores = entry['metrics']\n",
+    "            print(f\"\\n{rank}. Text: {entry['text']}\")\n",
+    "            print(f\"   Scores: BLEU={scores['bleu']:.3f}, \"\n",
+    "                  f\"chrF++={scores['chrf']:.3f}, \"\n",
+    "                  f\"TER={scores['ter']:.3f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}