Feat save csv #17

Open
wants to merge 14 commits into main
1 change: 1 addition & 0 deletions pyproject.toml
@@ -17,6 +17,7 @@ classifiers = [
"Operating System :: OS Independent",
]
dependencies = [
"boto3<=1.36.2",
"anthropic<=0.42.0",
"evaluate<=0.4.3",
"jiwer<=3.0.5",
86 changes: 70 additions & 16 deletions src/stt_data_with_llm/LLM_post_corrector.py
@@ -4,10 +4,7 @@
import anthropic
from dotenv import load_dotenv

from stt_data_with_llm.util import setup_logging

load_dotenv()
setup_logging("llm_corrector.log")


def get_LLM_corrected_text(inference_text, is_valid, reference_text=None):
@@ -26,23 +23,80 @@ def get_LLM_corrected_text(inference_text, is_valid, reference_text=None):

if is_valid and reference_text is not None:
prompt = f"""
I have two sentences: a colloquial sentence and a reference sentence.
Your task is to EXACTLY match the spellings from the reference sentence.
Do not make any corrections beyond matching the reference sentence exactly, even if you think a word is misspelled. # noqa
If a word appears the same way in both sentences, do not change it.
Colloquial sentence: {inference_text}
Reference sentence: {reference_text}
Give me only the corrected sentence that exactly matches the reference, without any explanation
"""
Your task is to correct Colloquial Tibetan sentences by comparing them with Reference sentences. Here are your specific responsibilities:
Main Task:
- Correct spelling and grammar mistakes in the Colloquial sentence by carefully comparing with the Reference sentence
- Preserve all Chinese-derived terms (regardless of their spelling), the original structure of compound terms, basic terms, and the sentence structure, unless there's a clear mistake
- Convert written numbers to Tibetan numerals (e.g., བདུན་ཅུ་ to ༧༠)
- DO NOT add or remove particles (like དུ་, ནི་, etc.) that aren't present in the original colloquial text
- DO NOT modify word order or syntax from the original colloquial text
- Correct particles that appear differently in the Reference sentence (e.g., ཀྱི་ should be ཀྱིས if the Reference shows ཀྱིས)


Guidelines:
1. DO CHANGE:
- Incorrect spellings of pure Tibetan words (e.g., ཚོས་ should be ཆོས་)
- Basic Tibetan particle spelling mistakes (like ཀི་ to ཀྱི་)
- Particles that appear differently in the Reference sentence (e.g., ཀྱི་ should be ཀྱིས if Reference shows ཀྱིས)
- Obviously incorrect syllable formation in pure Tibetan words
- Number formats (convert to Tibetan numerals)
- Clear grammatical errors in Tibetan particles
- Words that appear in the Reference sentence with different spelling (e.g., ཐབས་ should be འཐབ་ if Reference shows འཐབ་)
2. DO NOT CHANGE:
- Any Chinese words written in Tibetan script (MOST IMPORTANT)
- ANY term that might be derived from Chinese
- DO NOT add or remove particles (like དུ་, ནི་, etc.) that aren't present in the original colloquial text


3. PRESERVE EXACTLY:
- All Chinese-derived terms (regardless of their spelling)
- The original structure of compound terms

Example:
Colloquial: ཚོས་ལུགས་ལས་དོན་གཅོས་ཀྱི་ཐབས་གཅོག་གཅིག་ྒྱུར་ལས་དོན་འབྲེལ་ཡོད་ལས་བྱེད་དགོན་པ་ཁག་ཏུ་ཉུས་ཞིབ་ཀི་དགོན་སྡེ་ཁག
Reference: འཐབ་ཕྱོགས་གཅིག་གྱུར་ལས་དོན་དང་འབྲེལ་ཡོད་ལས་བྱེད་དགོན་པ་ཁག་ཏུ་ཉུལ་ཞིབ་ཀྱིས
Corrected: ཆོས་ལུགས་ལས་དོན་གཅོས་ཀྱི་འཐབ་གཅོག་གཅིག་གྱུར་ལས་དོན་འབྲེལ་ཡོད་ལས་བྱེད་དགོན་པ་ཁག་ཏུ་ཉུལ་ཞིབ་ཀྱི་དགོན་སྡེ་ཁག

Format:
Colloquial: {inference_text}
Reference: {reference_text}
Output: Return only the corrected sentence without any explanation or additional text
Note:
- Output ONLY the corrected sentence with no additional text or explanations
- If you notice spelling mistakes in the Reference sentence, rely on standard Tibetan orthography rather than the Reference sentence
- Don't add or delete words from the Colloquial sentence unless there's a clear mistake
- Cross-reference spellings with the Reference sentence when available
- Use the Reference sentence for both spelling verification and correct word forms
""" # noqa
else:
prompt = f"""
You are a Tibetan Language Expert. I want you to look for any spelling and grammar mistakes in the following Tibetan
sentence. Make sure that you don't change the terms or the sentence if it is not grammatically incorrect.
You are a Tibetan Language Expert specializing in modern Chinese-Tibetan terminology. Analyze the following Tibetan sentence with these STRICT requirements:

1. DO CHANGE:
- Incorrect spellings of pure Tibetan words (e.g., ཚོས་ should be ཆོས་)
- Basic Tibetan particle spelling mistakes (like ཀི་ to ཀྱི་)
- Obviously incorrect syllable formation in pure Tibetan words
- Number formats (convert to Tibetan numerals)
- Clear grammatical errors in Tibetan particles

2. DO NOT CHANGE:
- Any Chinese words written in Tibetan script (MOST IMPORTANT)
- ANY term that might be derived from Chinese

3. PRESERVE EXACTLY:
- All Chinese-derived terms (regardless of their spelling)
- The original structure of compound terms

Tibetan sentence: {inference_text}
Output: output should be only the corrected sentence.
Give me only the corrected sentence without any explanation
"""

Output: Return only the corrected sentence without any explanation or additional text
CRITICAL:
- Preserve ALL Chinese-derived terms exactly as written
- DO correct misspelled pure Tibetan words (like ཚོས་ to ཆོས་)
- Only correct obvious Tibetan grammar particles and numbering

Remember: When in doubt about whether a term is Chinese-derived, preserve the original spelling.
""" # noqa
try:
# Make API call to Claude
response = client.messages.create(
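The hunk is collapsed right after the call to client.messages.create(; a minimal sketch of how such an Anthropic SDK call is typically completed (the model name and token limit here are assumptions, not values from this PR):

# Hypothetical completion of the collapsed call; model and max_tokens are assumed.
response = client.messages.create(
    model="claude-3-5-sonnet-20241022",
    max_tokens=1024,
    messages=[{"role": "user", "content": prompt}],
)
corrected_text = response.content[0].text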
118 changes: 28 additions & 90 deletions src/stt_data_with_llm/audio_parser.py
@@ -15,14 +15,12 @@
AUDIO_SEG_UPPER_LIMIT,
HYPER_PARAMETERS,
)
from stt_data_with_llm.util import setup_logging

# load the environment variable
load_dotenv()

USE_AUTH_TOKEN = os.getenv("use_auth_token")
# Call the setup_logging function at the beginning of your script
setup_logging("audio_parser.log")


def sec_to_millis(seconds):
@@ -86,24 +84,6 @@ def initialize_vad_pipeline():
return vad_pipeline


def save_segment(segment, folder, prefix, id, start_ms, end_ms):
"""Saves an audio segment to WAV file with standardized naming.

Args:
segment (AudioSegment): Audio segment to save
folder (str): Output directory path
prefix (str): Filename prefix
id (int): Segment Identifier
start_ms (float): Segment start time in milliseconds
end_ms (float): Segment end time in milliseconds
"""
segment.export(
f"{folder}/{prefix}_{id:04}_{int(start_ms)}_to_{int(end_ms)}.wav", # noqa: E231
format="wav",
parameters=["-ac", "1", "-ar", "16000"],
)


def convert_to_16K(audio_data):
"""Converts audio data to 16kHz mono WAV format.

@@ -167,7 +147,6 @@ def chop_long_segment_duration(
sampling_rate,
full_audio_id,
split_audio,
output_folder,
counter,
):
"""Splits an audio segment into smaller chunks if its duration exceeds the specified upper limit.
@@ -192,35 +171,21 @@
while chop_length > upper_limit:
chop_length = chop_length / 2
for chop_index in range(int(segment_split_duration / chop_length)):
segment_split_chop = original_audio_segment[
sec_to_millis(
vad_span.start
+ frame_to_sec(split_start, sampling_rate)
+ chop_length * chop_index
) : sec_to_millis( # noqa: E203
vad_span.start
+ frame_to_sec(split_start, sampling_rate)
+ chop_length * (chop_index + 1)
)
]
segment_key = f"{full_audio_id}_{counter:04}" # noqa: E231
split_audio[segment_key] = segment_split_chop.raw_data
save_segment(
segment=segment_split_chop,
folder=output_folder,
prefix=full_audio_id,
id=counter,
start_ms=sec_to_millis(
vad_span.start
+ frame_to_sec(split_start, sampling_rate)
+ chop_length * chop_index
),
end_ms=sec_to_millis(
vad_span.start
+ frame_to_sec(split_start, sampling_rate)
+ chop_length * (chop_index + 1)
),
start_ms = sec_to_millis(
vad_span.start
+ frame_to_sec(split_start, sampling_rate)
+ chop_length * chop_index
)
end_ms = sec_to_millis( # noqa: E203
vad_span.start
+ frame_to_sec(split_start, sampling_rate)
+ chop_length * (chop_index + 1)
)
segment_split_chop = original_audio_segment[start_ms:end_ms]
segment_key = (
f"{full_audio_id}_{counter:04}_{int(start_ms)}_to_{int(end_ms)}" # noqa
)
split_audio[segment_key] = segment_split_chop
counter += 1
return counter
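
The halving loop above is what guarantees each chop fits under the upper limit; a self-contained sketch of the same arithmetic with illustrative numbers:

# Illustrative only: split a 37 s span under a 10 s upper limit.
segment_split_duration = 37.0  # seconds
upper_limit = 10.0

chop_length = segment_split_duration
while chop_length > upper_limit:
    chop_length = chop_length / 2  # 37.0 -> 18.5 -> 9.25

# Four chops of 9.25 s each cover the original span.
for chop_index in range(int(segment_split_duration / chop_length)):
    start = chop_length * chop_index
    end = chop_length * (chop_index + 1)
    print(f"chop {chop_index}: {start:.2f}s to {end:.2f}s")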

@@ -233,7 +198,6 @@ def process_non_mute_segments(
lower_limit,
upper_limit,
full_audio_id,
output_folder,
counter,
split_audio,
):
@@ -255,31 +219,19 @@
int: The updated counter after processing the non-mute segments.
""" # noqa: E501
for split_start, split_end in non_mute_segment_splits:
segment_split = original_audio_segment[
sec_to_millis(
vad_span.start + frame_to_sec(split_start, sampling_rate)
) : sec_to_millis( # noqa: E203
vad_span.start + frame_to_sec(split_end, sampling_rate)
)
]
start_ms = sec_to_millis(
vad_span.start + frame_to_sec(split_start, sampling_rate)
)
end_ms = sec_to_millis( # noqa: E203
vad_span.start + frame_to_sec(split_end, sampling_rate)
)
segment_split = original_audio_segment[start_ms:end_ms]
segment_split_duration = (
vad_span.start + frame_to_sec(split_end, sampling_rate)
) - (vad_span.start + frame_to_sec(split_start, sampling_rate))
if lower_limit <= segment_split_duration <= upper_limit:
segment_key = f"{full_audio_id}_{counter:04}" # noqa: E231
split_audio[segment_key] = segment_split.raw_data
save_segment(
segment=segment_split,
folder=output_folder,
prefix=full_audio_id,
id=counter,
start_ms=sec_to_millis(
vad_span.start + frame_to_sec(split_start, sampling_rate)
),
end_ms=sec_to_millis(
vad_span.start + frame_to_sec(split_end, sampling_rate)
),
)
segment_key = f"{full_audio_id}_{counter:04}_{int(start_ms)}_to_{int(end_ms)}" # noqa: E231
split_audio[segment_key] = segment_split
counter += 1
elif segment_split_duration > upper_limit:
counter = chop_long_segment_duration(
@@ -291,7 +243,6 @@
sampling_rate,
full_audio_id,
split_audio,
output_folder,
counter,
)
return counter
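
Since segment keys now embed timing (<full_audio_id>_<counter>_<start_ms>_to_<end_ms>), downstream code can recover timestamps straight from the key. A hedged sketch of such a parser (the key layout is read off the f-strings above; the helper itself is not part of this PR):

import re

# Matches keys like "STT_NW_0001_1500_to_9800"; the audio ID is a made-up example.
KEY_PATTERN = re.compile(
    r"^(?P<audio_id>.+)_(?P<counter>\d{4})_(?P<start_ms>\d+)_to_(?P<end_ms>\d+)$"
)


def parse_segment_key(key):
    """Split a segment key into (audio_id, counter, start_ms, end_ms)."""
    match = KEY_PATTERN.match(key)
    if match is None:
        raise ValueError(f"Unrecognized segment key: {key}")
    return (
        match["audio_id"],
        int(match["counter"]),
        int(match["start_ms"]),
        int(match["end_ms"]),
    )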
@@ -321,10 +272,6 @@ def get_split_audio(
with open(temp_audio_file, "wb") as f:
f.write(audio_data)

output_folder = f"data/split_audio/{full_audio_id}"

if not os.path.exists(output_folder):
os.makedirs(output_folder)
# initialize vad pipeline
pipeline = initialize_vad_pipeline()
vad = pipeline(temp_audio_file)
@@ -335,21 +282,13 @@

counter = 1
for vad_span in vad.get_timeline().support():
vad_segment = original_audio_segment[
sec_to_millis(vad_span.start) : sec_to_millis(vad_span.end) # noqa: E203
]
start_ms = sec_to_millis(vad_span.start)
end_ms = sec_to_millis(vad_span.end)
vad_segment = original_audio_segment[start_ms:end_ms]
vad_span_length = vad_span.end - vad_span.start
if lower_limit <= vad_span_length <= upper_limit:
segment_key = f"{full_audio_id}_{counter:04}" # noqa: E231
split_audio[segment_key] = vad_segment.raw_data
save_segment(
segment=vad_segment,
folder=output_folder,
prefix=full_audio_id,
id=counter,
start_ms=sec_to_millis(vad_span.start),
end_ms=sec_to_millis(vad_span.end),
)
segment_key = f"{full_audio_id}_{counter:04}_{int(start_ms)}_to_{int(end_ms)}" # noqa: E231
split_audio[segment_key] = vad_segment
counter += 1
elif vad_span_length > upper_limit:
non_mute_segment_splits = librosa.effects.split(
@@ -370,7 +309,6 @@
lower_limit,
upper_limit,
full_audio_id,
output_folder,
counter,
split_audio,
)
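
With save_segment gone, this module no longer writes WAV files; segments live only in the returned split_audio dict, now as pydub AudioSegment objects rather than raw bytes. If a caller still needs files on disk, a minimal sketch mirroring the deleted export call (the output folder is an assumption):

import os


def export_segments(split_audio, output_folder="data/split_audio"):
    """Write each in-memory segment as 16 kHz mono WAV, one file per key."""
    os.makedirs(output_folder, exist_ok=True)
    for key, segment in split_audio.items():
        segment.export(
            f"{output_folder}/{key}.wav",
            format="wav",
            parameters=["-ac", "1", "-ar", "16000"],
        )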
20 changes: 14 additions & 6 deletions src/stt_data_with_llm/catalog_parser.py
@@ -2,10 +2,7 @@

import pandas as pd

from stt_data_with_llm.util import setup_logging

# Call the setup_logging function at the beginning of your script
setup_logging("catalog_parse.log")


def read_spreadsheet(sheet_id):
@@ -34,22 +31,33 @@ def read_spreadsheet(sheet_id):
return pd.DataFrame()


def parse_catalog(google_sheet_id):
def parse_catalog(google_sheet_id, start_sr_no=None, end_sr_no=None):
"""
Parses an audio transcription catalog from a Google Spreadsheet.
Parses an audio transcription catalog from a Google Spreadsheet within a specified range of Sr.no values.

Args:
audio_url (str): The URL of the Google Spreadsheet containing the audio transcription catalog.
google_sheet_id (str): The ID of the Google Spreadsheet containing the audio transcription catalog.
start_sr_no (int, optional): The starting Sr.no to process. If None, starts from the beginning.
end_sr_no (int, optional): The ending Sr.no to process. If None, processes until the end.

Returns:
dict: A dictionary where keys are unique IDs (e.g., "full_audio_id") and values are dictionaries of audio data.
"""

catalog_df = read_spreadsheet(google_sheet_id)

# Check if the catalog DataFrame is empty
if catalog_df.empty:
logging.warning("Catalog DataFrame is empty.")
return {}
# Convert Sr.no column to numeric, replacing any non-numeric values with NaN
catalog_df["Sr.no"] = pd.to_numeric(catalog_df["Sr.no"], errors="coerce")

# Filter the DataFrame based on the specified range
if start_sr_no is not None:
catalog_df = catalog_df[catalog_df["Sr.no"] >= start_sr_no]
if end_sr_no is not None:
catalog_df = catalog_df[catalog_df["Sr.no"] <= end_sr_no]

audio_transcription_catalog = {}

Expand Down
2 changes: 1 addition & 1 deletion src/stt_data_with_llm/config.py
@@ -45,4 +45,4 @@
API_URL = "https://wpgzw4at8o6876h0.us-east-1.aws.endpoints.huggingface.cloud"

# Validation
CER_THRESHOLD = 0.4
CER_THRESHOLD = 0.8
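
Raising the threshold from 0.4 to 0.8 loosens the validation gate, so more inference/reference pairs count as valid. The validation call site is not part of this diff, but since jiwer is already a dependency, a sketch of how such a CER threshold is typically applied (the helper below is illustrative, not from this repo):

from jiwer import cer

from stt_data_with_llm.config import CER_THRESHOLD


def is_valid_transcript(reference_text, inference_text):
    """A segment passes validation when its character error rate is within the threshold."""
    return cer(reference_text, inference_text) <= CER_THRESHOLD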