diff --git a/pyproject.toml b/pyproject.toml
index c124016..3e7bee0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,6 +17,7 @@ classifiers = [
     "Operating System :: OS Independent",
 ]
 dependencies = [
+  "boto3<=1.36.2",
   "anthropic<=0.42.0",
   "evaluate<=0.4.3",
   "jiwer<=3.0.5",
diff --git a/src/stt_data_with_llm/LLM_post_corrector.py b/src/stt_data_with_llm/LLM_post_corrector.py
index 0cf19e2..111555c 100644
--- a/src/stt_data_with_llm/LLM_post_corrector.py
+++ b/src/stt_data_with_llm/LLM_post_corrector.py
@@ -4,10 +4,7 @@
 import anthropic
 from dotenv import load_dotenv
 
-from stt_data_with_llm.util import setup_logging
-
 load_dotenv()
-setup_logging("llm_corrector.log")
 
 
 def get_LLM_corrected_text(inference_text, is_valid, reference_text=None):
@@ -26,23 +23,80 @@ def get_LLM_corrected_text(inference_text, is_valid, reference_text=None):
 
     if is_valid and reference_text is not None:
         prompt = f"""
-            I have two sentences: a colloquial sentence and a reference sentence.
-            Your task is to EXACTLY match the spellings from the reference sentence.
-            Do not make any corrections beyond matching the reference sentence exactly, even if you think a word is misspelled.   # noqa
-            If a word appears the same way in both sentences, do not change it.
-            Colloquial sentence: {inference_text}
-            Reference sentence: {reference_text}
-            Give me only the corrected sentence that exactly matches the reference, without any explanation
-            """
+            Your task is to correct Colloquial Tibetan sentences by comparing them with Reference sentences. Here are your specific responsibilities:
+            Main Task:
+            - Correct spelling and grammar mistakes in the Colloquial sentence by carefully comparing with the Reference sentence
+            - Preserve All Chinese-derived terms (regardless of their spelling),The original structure of compound terms basic terms and sentence structure unless there's a clear mistake
+            - Convert written numbers to Tibetan numerals (e.g., བདུན་ཅུ་ to ༧༠)
+            - DO NOT add or remove particles (like དུ་, ནི་, etc.) that aren't present in the original colloquial text
+            - DO NOT modify word order or syntax from the original colloquial text
+            - Particles that appear differently in the Reference sentence (e.g., ཀྱི་ should be ཀྱིས if Reference shows ཀྱིས)
+
+
+            Guidelines:
+            1. DO CHANGE:
+            - Incorrect spellings of pure Tibetan words (e.g., ཚོས་ should be ཆོས་)
+            - Basic Tibetan particle spelling mistakes (like ཀི་ to ཀྱི་)
+            - Particles that appear differently in the Reference sentence (e.g., ཀྱི་ should be ཀྱིས if Reference shows ཀྱིས)
+            - Obviously incorrect syllable formation in pure Tibetan words
+            - Number formats (convert to Tibetan numerals)
+            - Clear grammatical errors in Tibetan particles
+            - Words that appear in the Reference sentence with different spelling (e.g., ཐབས་ should be འཐབ་ if Reference shows འཐབ་)
+            2. DO NOT CHANGE:
+            - Any Chinese words written in Tibetan script (MOST IMPORTANT)
+            - ANY term that might be derived from Chinese
+            - DO NOT add or remove particles (like དུ་, ནི་, etc.) that aren't present in the original colloquial text
+
+
+            3. PRESERVE EXACTLY:
+            - All Chinese-derived terms (regardless of their spelling)
+            - The original structure of compound terms
+
+            Example:
+            Colloquial: ཚོས་ལུགས་ལས་དོན་གཅོས་ཀྱི་ཐབས་གཅོག་གཅིག་ྒྱུར་ལས་དོན་འབྲེལ་ཡོད་ལས་བྱེད་དགོན་པ་ཁག་ཏུ་ཉུས་ཞིབ་ཀི་དགོན་སྡེ་ཁག
+            Reference: འཐབ་ཕྱོགས་གཅིག་གྱུར་ལས་དོན་དང་འབྲེལ་ཡོད་ལས་བྱེད་དགོན་པ་ཁག་ཏུ་ཉུལ་ཞིབ་ཀྱིས
+            Corrected: ཆོས་ལུགས་ལས་དོན་གཅོས་ཀྱི་འཐབ་གཅོག་གཅིག་གྱུར་ལས་དོན་འབྲེལ་ཡོད་ལས་བྱེད་དགོན་པ་ཁག་ཏུ་ཉུལ་ཞིབ་ཀྱི་དགོན་སྡེ་ཁག
+
+            Format:
+            Colloquial: {inference_text}
+            Reference: {reference_text}
+            Output: Return only the corrected sentence without any explanation or additional
+            Note:
+            - Output ONLY the corrected sentence with no additional text or explanations
+            - If you notice spelling mistakes in the Reference sentence, rely on standard Tibetan orthography rather than the Reference sentence
+            - Don't add or delete words from the Colloquial sentence unless there's a clear mistake
+            - Cross-reference spellings with the Reference sentence when available
+            - Use the Reference sentence for both spelling verification and correct word forms
+            """  # noqa
     else:
         prompt = f"""
-            You are a Tibetan Language Expert. I want you to look for any spelling and grammar mistakes in the following Tibetan
-            sentence. Make sure that you don't change the terms and sentence if its not grammatically incorrect.
+            You are a Tibetan Language Expert specializing in modern Chinese-Tibetan terminology. Analyze the following Tibetan sentence with these STRICT requirements:
+
+            1. DO CHANGE:
+            - Incorrect spellings of pure Tibetan words (e.g., ཚོས་ should be ཆོས་)
+            - Basic Tibetan particle spelling mistakes (like ཀི་ to ཀྱི་)
+            - Obviously incorrect syllable formation in pure Tibetan words
+            - Number formats (convert to Tibetan numerals)
+            - Clear grammatical errors in Tibetan particles
+
+            2. DO NOT CHANGE:
+            - Any Chinese words written in Tibetan script (MOST IMPORTANT)
+            - ANY term that might be derived from Chinese
+
+            3. PRESERVE EXACTLY:
+            - All Chinese-derived terms (regardless of their spelling)
+            - The original structure of compound terms
+
             Tibetan sentence: {inference_text}
-            Output: output should be only the corrected sentence.
-            Give me only the corrected sentence without any explanation
-            """
 
+            Output: Return only the corrected sentence without any explanation or additional
+            CRITICAL:
+            - Preserve ALL Chinese-derived terms exactly as written
+            - DO correct misspelled pure Tibetan words (like ཚོས་ to ཆོས་)
+            - Only correct obvious Tibetan grammar particles and numbering
+
+            Remember: When in doubt about whether a term is Chinese-derived, preserve the original spelling.
+      """  # noqa
     try:
         # Make API call to Claude
         response = client.messages.create(
diff --git a/src/stt_data_with_llm/audio_parser.py b/src/stt_data_with_llm/audio_parser.py
index 091dc4c..91b30b1 100644
--- a/src/stt_data_with_llm/audio_parser.py
+++ b/src/stt_data_with_llm/audio_parser.py
@@ -15,14 +15,12 @@
     AUDIO_SEG_UPPER_LIMIT,
     HYPER_PARAMETERS,
 )
-from stt_data_with_llm.util import setup_logging
 
 # load the evnironment variable
 load_dotenv()
 
 USE_AUTH_TOKEN = os.getenv("use_auth_token")
 # Call the setup_logging function at the beginning of your script
-setup_logging("audio_parser.log")
 
 
 def sec_to_millis(seconds):
@@ -86,24 +84,6 @@ def initialize_vad_pipeline():
     return vad_pipeline
 
 
-def save_segment(segment, folder, prefix, id, start_ms, end_ms):
-    """Saves an audio segment to WAV file with standardized naming.
-
-    Args:
-        segment (AudioSegment): Audio segment to save
-        folder (str): Output directory path
-        prefix (str): Filename prefix
-        id (int): Segment Identifier
-        start_ms (float): Segment start time in milliseconds
-        end_ms (float): Segment end time in milliseconds
-    """
-    segment.export(
-        f"{folder}/{prefix}_{id:04}_{int(start_ms)}_to_{int(end_ms)}.wav",  # noqa: E231
-        format="wav",
-        parameters=["-ac", "1", "-ar", "16000"],
-    )
-
-
 def convert_to_16K(audio_data):
     """Converts audio data to 16kHz mono WAV format.
 
@@ -167,7 +147,6 @@ def chop_long_segment_duration(
     sampling_rate,
     full_audio_id,
     split_audio,
-    output_folder,
     counter,
 ):
     """Splits an audio segment into smaller chunks if its duration exceeds the specified upper limit.
@@ -192,35 +171,21 @@ def chop_long_segment_duration(
     while chop_length > upper_limit:
         chop_length = chop_length / 2
     for chop_index in range(int(segment_split_duration / chop_length)):
-        segment_split_chop = original_audio_segment[
-            sec_to_millis(
-                vad_span.start
-                + frame_to_sec(split_start, sampling_rate)
-                + chop_length * chop_index
-            ) : sec_to_millis(  # noqa: E203
-                vad_span.start
-                + frame_to_sec(split_start, sampling_rate)
-                + chop_length * (chop_index + 1)
-            )
-        ]
-        segment_key = f"{full_audio_id}_{counter:04}"  # noqa: E231
-        split_audio[segment_key] = segment_split_chop.raw_data
-        save_segment(
-            segment=segment_split_chop,
-            folder=output_folder,
-            prefix=full_audio_id,
-            id=counter,
-            start_ms=sec_to_millis(
-                vad_span.start
-                + frame_to_sec(split_start, sampling_rate)
-                + chop_length * chop_index
-            ),
-            end_ms=sec_to_millis(
-                vad_span.start
-                + frame_to_sec(split_start, sampling_rate)
-                + chop_length * (chop_index + 1)
-            ),
+        start_ms = sec_to_millis(
+            vad_span.start
+            + frame_to_sec(split_start, sampling_rate)
+            + chop_length * chop_index
+        )
+        end_ms = sec_to_millis(  # noqa: E203
+            vad_span.start
+            + frame_to_sec(split_start, sampling_rate)
+            + chop_length * (chop_index + 1)
         )
+        segment_split_chop = original_audio_segment[start_ms:end_ms]
+        segment_key = (
+            f"{full_audio_id}_{counter:04}_{int(start_ms)}_to_{int(end_ms)}"  # noqa
+        )
+        split_audio[segment_key] = segment_split_chop
         counter += 1
     return counter
 
@@ -233,7 +198,6 @@ def process_non_mute_segments(
     lower_limit,
     upper_limit,
     full_audio_id,
-    output_folder,
     counter,
     split_audio,
 ):
@@ -255,31 +219,19 @@ def process_non_mute_segments(
         int: The updated counter after processing the non-mute segments.
     """  # noqa: E501
     for split_start, split_end in non_mute_segment_splits:
-        segment_split = original_audio_segment[
-            sec_to_millis(
-                vad_span.start + frame_to_sec(split_start, sampling_rate)
-            ) : sec_to_millis(  # noqa: E203
-                vad_span.start + frame_to_sec(split_end, sampling_rate)
-            )
-        ]
+        start_ms = sec_to_millis(
+            vad_span.start + frame_to_sec(split_start, sampling_rate)
+        )
+        end_ms = sec_to_millis(  # noqa: E203
+            vad_span.start + frame_to_sec(split_end, sampling_rate)
+        )
+        segment_split = original_audio_segment[start_ms:end_ms]
         segment_split_duration = (
             vad_span.start + frame_to_sec(split_end, sampling_rate)
         ) - (vad_span.start + frame_to_sec(split_start, sampling_rate))
         if lower_limit <= segment_split_duration <= upper_limit:
-            segment_key = f"{full_audio_id}_{counter:04}"  # noqa: E231
-            split_audio[segment_key] = segment_split.raw_data
-            save_segment(
-                segment=segment_split,
-                folder=output_folder,
-                prefix=full_audio_id,
-                id=counter,
-                start_ms=sec_to_millis(
-                    vad_span.start + frame_to_sec(split_start, sampling_rate)
-                ),
-                end_ms=sec_to_millis(
-                    vad_span.start + frame_to_sec(split_end, sampling_rate)
-                ),
-            )
+            segment_key = f"{full_audio_id}_{counter:04}_{int(start_ms)}_to_{int(end_ms)}"  # noqa: E231
+            split_audio[segment_key] = segment_split
             counter += 1
         elif segment_split_duration > upper_limit:
             counter = chop_long_segment_duration(
@@ -291,7 +243,6 @@ def process_non_mute_segments(
                 sampling_rate,
                 full_audio_id,
                 split_audio,
-                output_folder,
                 counter,
             )
     return counter
@@ -321,10 +272,6 @@ def get_split_audio(
     with open(temp_audio_file, "wb") as f:
         f.write(audio_data)
 
-    output_folder = f"data/split_audio/{full_audio_id}"
-
-    if not os.path.exists(output_folder):
-        os.makedirs(output_folder)
     # initialize vad pipeline
     pipeline = initialize_vad_pipeline()
     vad = pipeline(temp_audio_file)
@@ -335,21 +282,13 @@ def get_split_audio(
 
     counter = 1
     for vad_span in vad.get_timeline().support():
-        vad_segment = original_audio_segment[
-            sec_to_millis(vad_span.start) : sec_to_millis(vad_span.end)  # noqa: E203
-        ]
+        start_ms = sec_to_millis(vad_span.start)
+        end_ms = sec_to_millis(vad_span.end)
+        vad_segment = original_audio_segment[start_ms:end_ms]
         vad_span_length = vad_span.end - vad_span.start
         if lower_limit <= vad_span_length <= upper_limit:
-            segment_key = f"{full_audio_id}_{counter:04}"  # noqa: E231
-            split_audio[segment_key] = vad_segment.raw_data
-            save_segment(
-                segment=vad_segment,
-                folder=output_folder,
-                prefix=full_audio_id,
-                id=counter,
-                start_ms=sec_to_millis(vad_span.start),
-                end_ms=sec_to_millis(vad_span.end),
-            )
+            segment_key = f"{full_audio_id}_{counter:04}_{int(start_ms)}_to_{int(end_ms)}"  # noqa: E231
+            split_audio[segment_key] = vad_segment
             counter += 1
         elif vad_span_length > upper_limit:
             non_mute_segment_splits = librosa.effects.split(
@@ -370,7 +309,6 @@ def get_split_audio(
                 lower_limit,
                 upper_limit,
                 full_audio_id,
-                output_folder,
                 counter,
                 split_audio,
             )
diff --git a/src/stt_data_with_llm/catalog_parser.py b/src/stt_data_with_llm/catalog_parser.py
index 4067b22..9f44e88 100644
--- a/src/stt_data_with_llm/catalog_parser.py
+++ b/src/stt_data_with_llm/catalog_parser.py
@@ -2,10 +2,7 @@
 
 import pandas as pd
 
-from stt_data_with_llm.util import setup_logging
-
 # Call the setup_logging function at the beginning of your script
-setup_logging("catalog_parse.log")
 
 
 def read_spreadsheet(sheet_id):
@@ -34,22 +31,33 @@ def read_spreadsheet(sheet_id):
         return pd.DataFrame()
 
 
-def parse_catalog(google_sheet_id):
+def parse_catalog(google_sheet_id, start_sr_no=None, end_sr_no=None):
     """
-    Parses an audio transcription catalog from a Google Spreadsheet.
+    Parses an audio transcription catalog from a Google Spreadsheet within a specified range of Sr.no values.
 
     Args:
-        audio_url (str): The URL of the Google Spreadsheet containing the audio transcription catalog.
+        google_sheet_id (str): The ID of the Google Spreadsheet containing the audio transcription catalog.
+        start_sr_no (int, optional): The first Sr.no to process (inclusive). If None, starts from the beginning.
+        end_sr_no (int, optional): The last Sr.no to process (inclusive). If None, processes until the end.
 
     Returns:
         dict: A dictionary where keys are unique IDs (e.g., "full_audio_id") and values are dictionaries of audio data.
     """
+
     catalog_df = read_spreadsheet(google_sheet_id)
 
     # Check if the catalog DataFrame is empty
     if catalog_df.empty:
         logging.warning("Catalog DataFrame is empty.")
         return {}
+    # Coerce Sr.no to numeric; non-numeric cells become NaN and fail the range comparisons below, so they are dropped whenever a bound is given
+    catalog_df["Sr.no"] = pd.to_numeric(catalog_df["Sr.no"], errors="coerce")
+
+    # Filter the DataFrame based on the specified range
+    if start_sr_no is not None:
+        catalog_df = catalog_df[catalog_df["Sr.no"] >= start_sr_no]
+    if end_sr_no is not None:
+        catalog_df = catalog_df[catalog_df["Sr.no"] <= end_sr_no]
 
     audio_transcription_catalog = {}
 
diff --git a/src/stt_data_with_llm/config.py b/src/stt_data_with_llm/config.py
index 9d22282..4eaddec 100644
--- a/src/stt_data_with_llm/config.py
+++ b/src/stt_data_with_llm/config.py
@@ -45,4 +45,4 @@
 API_URL = "https://wpgzw4at8o6876h0.us-east-1.aws.endpoints.huggingface.cloud"
 
 # Validation
-CER_THRESHOLD = 0.4
+CER_THRESHOLD = 0.8
diff --git a/src/stt_data_with_llm/inference_transcript.py b/src/stt_data_with_llm/inference_transcript.py
index 193d210..10e1d69 100644
--- a/src/stt_data_with_llm/inference_transcript.py
+++ b/src/stt_data_with_llm/inference_transcript.py
@@ -1,18 +1,15 @@
+import io
 import logging
 import os
-import wave
-from io import BytesIO
 
 import requests
 from dotenv import load_dotenv
 
-from stt_data_with_llm.config import API_URL, CHANNELS, SAMPLE_RATE, SAMPLE_WIDTH
-from stt_data_with_llm.util import setup_logging
+from stt_data_with_llm.config import API_URL
 
 load_dotenv()
 TOKEN_ID = os.getenv("token_id")
 # Call the setup_logging function at the beginning of your script
-setup_logging("inference_log.log")
 
 INFERENCE_HEADERS = {
     "Accept": "application/json",
@@ -21,34 +18,6 @@
 }
 
 
-def convert_raw_to_wav_in_memory(raw_audio, sample_rate, channels, sample_width):
-    """
-    Converts raw audio data to a valid WAV format in memory.
-
-    Args:
-        raw_audio (bytes): Raw audio data.
-        sample_rate (int): Audio sample rate.
-        channels (int): Number of audio channels.
-        sample_width (int): Number of bytes per sample.
-
-    Returns:
-        BytesIO: In-memory WAV file if conversion is successful, None otherwise.
-    """
-    try:
-        wav_buffer = BytesIO()
-        with wave.open(wav_buffer, "wb") as wav_file:
-            wav_file.setnchannels(channels)
-            wav_file.setsampwidth(sample_width)
-            wav_file.setframerate(sample_rate)
-            wav_file.writeframes(raw_audio)
-        wav_buffer.seek(0)  # Reset buffer to the beginning
-        logging.info("Raw audio successfully converted to WAV format in memory.")
-        return wav_buffer
-    except Exception as e:
-        logging.error(f"Error converting raw audio to WAV in memory: {e}")
-        return None
-
-
 def query_audio_api(wav_buffer):
     """
     Sends the WAV audio data to the Hugging Face API for inference.
@@ -70,7 +39,7 @@ def query_audio_api(wav_buffer):
         return None
 
 
-def get_audio_inference_text(raw_audio):
+def get_audio_inference_text(audio_segment):
     """
     Generates the inference transcript for raw audio data.
 
@@ -82,14 +51,19 @@ def get_audio_inference_text(raw_audio):
     """
     try:
         # Convert raw audio to WAV format in memory
-        wav_buffer = convert_raw_to_wav_in_memory(
-            raw_audio, SAMPLE_RATE, CHANNELS, SAMPLE_WIDTH
-        )
-        if not wav_buffer:
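+        # The caller now passes an AudioSegment instead of raw PCM bytes, so the
+        # WAV container is built with audio_segment.export() further down rather
+        # than with the removed convert_raw_to_wav_in_memory() helper.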
+        if not audio_segment:
             return ""
         logging.info("Running inference on audio segment")
+        # Convert AudioSegment to WAV format in memory
+        buffer = io.BytesIO()
+        audio_segment.export(buffer, format="wav")
+        buffer.seek(0)  # Reset buffer to the beginning
+
         # Send the WAV data to the API for transcription
-        response = query_audio_api(wav_buffer)
+        response = query_audio_api(buffer)
         if not response or "text" not in response:
             return ""
         transcript = response["text"]
diff --git a/src/stt_data_with_llm/main.py b/src/stt_data_with_llm/main.py
index 1736d33..c750530 100644
--- a/src/stt_data_with_llm/main.py
+++ b/src/stt_data_with_llm/main.py
@@ -1,5 +1,10 @@
+import csv
+import io
+import json
 import logging
+import os
 
+import boto3
 from fast_antx.core import transfer
 
 from stt_data_with_llm.audio_parser import get_audio, get_split_audio
@@ -15,9 +20,10 @@
     calculate_cer,
     get_inference_transcript,
     get_original_text,
+    setup_logging,
 )
 
-logging.basicConfig(filename="./pipeline.log", level=logging.INFO)
+setup_logging("stt_llm_corrector.log")
 
 
 def transfer_segmentation(inference_transcript, reference_transcript):
@@ -38,6 +44,13 @@ def transfer_segmentation(inference_transcript, reference_transcript):
     return reference_transcript_with_inference_segmentation
 
 
+def audio_segment_to_bytes(audio_segment):
+    buffer = io.BytesIO()
+    audio_segment.export(buffer, format="wav")
+    audio_data = buffer.getvalue()
+    return audio_data
+
+
 def is_valid_transcript(inference_transcript, reference_transcript):
     """Validates the reference transcript by comparing it to the inference transcript
     using the Character Error Rate (CER) metric.
@@ -78,6 +91,8 @@ def post_process_audio_transcript_pairs(audio_data_info):
     if not is_valid_transcript(
         validation_inference_transcript, validation_original_text
     ):
+        logging.info("Validation_original_transcript: %s", validation_original_text)
+        logging.info("Validation_inference_text: %s", validation_inference_transcript)
         return None, full_audio_id
     reference_transcript_with_inference_segmentation = transfer_segmentation(
         inference_transcript, reference_transcript
@@ -96,7 +111,7 @@ def post_process_audio_transcript_pairs(audio_data_info):
             seg_LLM_corrected_text = get_LLM_corrected_text(
                 seg_inference_text, True, seg_reference_text
             )
-        post_process_audio_transcript_pairs[audio_seg_id] = {
+        post_processed_audio_transcript_pairs[audio_seg_id] = {
             "audio_seg_data": audio_seg_data,
             "inference_transcript": seg_inference_text,
             "reference_transcript": seg_reference_text,
@@ -105,23 +120,183 @@ def post_process_audio_transcript_pairs(audio_data_info):
     return post_processed_audio_transcript_pairs, full_audio_id
 
 
+def extract_duration_from_filename(file_name):
+    """
+    Extracts start_ms and end_ms from the file_name and calculates the duration in seconds.
+
+    Args:
+        file_name (str): The file name in the format "full_audio_id_counter_start_ms_to_end_ms".
+
+    Returns:
+        float: Duration in seconds rounded to 2 decimals, or 0.0 if the name cannot be parsed.
+    """
+    try:
+        # Extract start_ms and end_ms from the file_name
+        parts = file_name.split("_")
+        start_ms = int(parts[-3])
+        end_ms = int(parts[-1])  # Last part is end_ms
+
+        # Calculate duration
+        duration_ms = end_ms - start_ms
+        return round(duration_ms / 1000, 2)
+    except Exception as e:
+        logging.error(f"Error extracting duration from file name {file_name}: {e}")
+        return 0.0  # Default to 0 if there's an error
+
+
+def upload_to_s3(bucket_name, file_name, audio_segment):
+    session = boto3.Session()
+    s3 = session.client("s3")
+    try:
+        # Convert AudioSegment to bytes
+        audio_data = audio_segment_to_bytes(audio_segment)
+
+        # Upload the file to S3
+        s3.put_object(
+            Bucket=bucket_name,
+            Key=file_name,
+            Body=audio_data,
+            ContentType="audio/wav",
+            ContentDisposition="inline",
+        )
+
+        # Generate the CloudFront URL
+        cleaned_file_name = (
+            file_name.split("/", 1)[1] if "/" in file_name else file_name
+        )
+        cloudfront_url = f"https://d38pmlk0v88drf.cloudfront.net/stt_news_auto_data/{cleaned_file_name}"  # noqa
+        logging.info(f"File uploaded to S3 and accessible at: {cloudfront_url}")
+        return cloudfront_url
+    except Exception as e:
+        logging.error(f"Error uploading file to S3: {e}")
+        return None
+
+
 def save_post_processed_audio_transcript_pairs(
     post_processed_audio_transcript_pairs, audio_data_info
 ):
-    # Save post processed audio transcript pairs in csv
-    pass
+    os.makedirs("data/corrected_inference", exist_ok=True)
+    output_file = "data/corrected_inference/processed_audio_transcript.csv"
+    # Define CSV column headers
+    headers = [
+        "file_name",
+        "audio_url",
+        "inference_transcript",
+        "reference_transcript",
+        "LLM_corrected_text",
+        "audio_duration",
+        "speaker_name",
+        "speaker_gender",
+        "news_channel",
+        "publishing_year",
+    ]
+    # Extract the metadata from the catalog
+    speaker_name = audio_data_info.get("speaker_name", "")
+    speaker_gender = audio_data_info.get("speaker_gender", "")
+    news_channel = audio_data_info.get("news_channel", "")
+    publishing_year = audio_data_info.get("publishing_year", "")
+    # S3 bucket name
+    s3_bucket_name = "monlam.ai.stt"
+
+    try:
+        with open(output_file, "a", newline="", encoding="utf-8") as file:
+            writer = csv.DictWriter(file, fieldnames=headers)
+            if file.tell() == 0:
+                writer.writeheader()
+            for (
+                audio_seg_id,
+                audio_seg_data,
+            ) in post_processed_audio_transcript_pairs.items():
+
+                # Prepare file name and upload to S3
+                file_name = f"stt_news_auto_data/{audio_seg_id}.wav"
+                audio_url = upload_to_s3(
+                    s3_bucket_name, file_name, audio_seg_data["audio_seg_data"]
+                )
+
+                # Extract duration from the audio_seg_id (file_name)
+                audio_duration = extract_duration_from_filename(audio_seg_id)
+                # Write a row to the CSV file
+                writer.writerow(
+                    {
+                        "file_name": audio_seg_id,
+                        "audio_url": audio_url,
+                        "inference_transcript": audio_seg_data["inference_transcript"],
+                        "reference_transcript": audio_seg_data["reference_transcript"],
+                        "LLM_corrected_text": audio_seg_data["LLM_corrected_text"],
+                        "audio_duration": audio_duration,
+                        "speaker_name": speaker_name,
+                        "speaker_gender": speaker_gender,
+                        "news_channel": news_channel,
+                        "publishing_year": publishing_year,
+                    }
+                )
+        logging.info(f"Processed audio transcript pairs saved to {output_file}")
+    except Exception as e:
+        logging.error(f"Error saving processed audio transcript pairs: {e}")
 
 
-def get_audio_transcript_pairs(audio_transcription_catalog_url):
-    audio_transcription_datas = parse_catalog(audio_transcription_catalog_url)
+def get_audio_transcript_pairs(
+    audio_transcription_catalog_url, start_sr_no=None, end_sr_no=None
+):
+    os.makedirs("data/corrected_audio_transcript_checkpoint", exist_ok=True)
+    os.makedirs("data/invalid_transcript_checkpoint", exist_ok=True)
+    # Load checkpoint if it exists
+    checkpoint_file = (
+        "data/corrected_audio_transcript_checkpoint/processing_checkpoint.json"
+    )
+    invalid_transcript_checkpoint_file = (
+        "data/invalid_transcript_checkpoint/invalid_transcript_checkpoint.json"
+    )
+    if os.path.exists(checkpoint_file):
+        with open(checkpoint_file) as file:
+            processed_ids = set(json.load(file))
+    else:
+        processed_ids = set()
+    if os.path.exists(invalid_transcript_checkpoint_file):
+        with open(invalid_transcript_checkpoint_file) as invalid_file:
+            invalid_transcript_ids = set(json.load(invalid_file))
+    else:
+        invalid_transcript_ids = set()
+
+    audio_transcription_datas = parse_catalog(
+        audio_transcription_catalog_url, start_sr_no, end_sr_no
+    )
     for data_id, audio_data_info in audio_transcription_datas.items():
-        (
-            post_processed_audio_transcript_pairs,
-            full_audio_id,
-        ) = post_process_audio_transcript_pairs(audio_data_info)
-        if post_processed_audio_transcript_pairs:
-            save_post_processed_audio_transcript_pairs(
-                post_processed_audio_transcript_pairs, audio_data_info
+        full_audio_id = audio_data_info.get("full_audio_id", "")
+        if full_audio_id in processed_ids or full_audio_id in invalid_transcript_ids:
+            logging.info(
+                f"Skipping already processed or invalid audio data with ID {full_audio_id}"
             )
-        else:
-            logging.info(f"Audio data with ID {full_audio_id} has invalid transcript")
+            continue
+        try:
+            (
+                post_processed_audio_transcript_pairs,
+                full_audio_id,
+            ) = post_process_audio_transcript_pairs(audio_data_info)
+            if post_processed_audio_transcript_pairs:
+                save_post_processed_audio_transcript_pairs(
+                    post_processed_audio_transcript_pairs, audio_data_info
+                )
+                processed_ids.add(full_audio_id)
+                # save the checkpoint after each successful processing
+                with open(checkpoint_file, "w") as file:
+                    json.dump(list(processed_ids), file)
+            else:
+                logging.info(
+                    f"Audio data with ID {full_audio_id} has invalid transcript"
+                )
+                invalid_transcript_ids.add(full_audio_id)
+                # persist the invalid-transcript checkpoint so this audio is skipped on the next run
+                with open(invalid_transcript_checkpoint_file, "w") as file:
+                    json.dump(list(invalid_transcript_ids), file)
+        except Exception as e:
+            logging.error(f"Error processing audio data with ID {full_audio_id}: {e}")
+
+    logging.info("Processing completed")
+
+
+if __name__ == "__main__":
+    # Replace with your actual spreadsheet ID
+    google_spread_sheet_id = "1Iy01o2hsrhWpbOQzFfC1gOVqw4j1AMp7poEU2eu7WN0"
+    get_audio_transcript_pairs(google_spread_sheet_id)
diff --git a/tests/data/expected_catalog_data.json b/tests/data/expected_catalog_data.json
deleted file mode 100644
index 93f5cfe..0000000
--- a/tests/data/expected_catalog_data.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-    "0":{
-            "full_audio_id": "STT_NW0802",
-            "sr_no": 1,
-            "audio_url": "https://www.rfa.org/tibetan/sargyur/golok-china-religious-restriction-08202024054225.html/@@stream",
-            "reference_transcript": "A",
-            "speaker_name": "བདེ་སྐྱིད་ཀུན་སྒྲོལ།",
-            "speaker_gender": "",
-            "news_channel": "RFA",
-            "publishing_year": "2024.08.20"
-    },
-    "1":{
-        "full_audio_id": "STT_NW0805",
-        "sr_no": 2,
-        "audio_url": "https://www.rfa.org/tibetan/sargyur/vpn-china-restriction-08152024081404.html/@@stream",
-        "reference_transcript": "B",
-        "speaker_name": "བདེ་སྐྱིད་ཀུན་སྒྲོལ།",
-        "speaker_gender": "",
-        "news_channel": "RFA",
-        "publishing_year": "2024.08.15"
-    },
-    "2":{
-    "full_audio_id": "",
-    "sr_no": 3,
-    "audio_url": "",
-    "reference_transcript": "D",
-    "speaker_name": "",
-    "speaker_gender": "",
-    "news_channel": "RFA",
-    "publishing_year": "2021.01.28"
-}
-}
diff --git a/tests/test_audio_parser.py b/tests/test_audio_parser.py
index 06e03ab..5bd5bc1 100644
--- a/tests/test_audio_parser.py
+++ b/tests/test_audio_parser.py
@@ -82,7 +82,7 @@ def mock_initialize_pipeline(seg_id):
             )
             num_split = len(split_audio_data)
             num_of_seg_in_audios[seg_id] = num_split
-        expected_num_of_seg_in_audios = "tests/data/expected_audio_data.json"
+        expected_num_of_seg_in_audios = "tests/test_data/expected_audio_data.json"
         with open(expected_num_of_seg_in_audios, encoding="utf-8") as file:
             expected_num_split = json.load(file)
         assert num_of_seg_in_audios == expected_num_split
diff --git a/tests/test_catalog_parser.py b/tests/test_catalog_parser.py
index cf741b2..080e054 100644
--- a/tests/test_catalog_parser.py
+++ b/tests/test_catalog_parser.py
@@ -10,16 +10,15 @@ def test_catalog_parser():
     Main function to parse the catalog and save the audio transcription data as JSON.
     """
     # Replace with your actual spreadsheet ID
-    google_spread_sheet_id = "14pCi8pxD_Ms3i3RAcBWNrCT9MocnRKD49jTLxDHzDe0"
+    google_spread_sheet_id = "1Iy01o2hsrhWpbOQzFfC1gOVqw4j1AMp7poEU2eu7WN0"
 
     # Parse the catalog
 
-    audio_transcription_catalog = parse_catalog(google_spread_sheet_id)
-    expected_output_json_path = "tests/data/expected_catalog_data.json"
+    audio_transcription_catalog = parse_catalog(google_spread_sheet_id, 1, 20)
+    expected_output_json_path = "tests/test_data/expected_catalog_data.json"
     with open(expected_output_json_path, encoding="utf-8") as file:
-        expected_output_json = json.load(file)
-
-    assert audio_transcription_catalog == expected_output_json
+        expected_output = json.load(file)
+    assert audio_transcription_catalog == expected_output
 
 
 if __name__ == "__main__":
diff --git a/tests/data/expected_audio_data.json b/tests/test_data/expected_audio_data.json
similarity index 100%
rename from tests/data/expected_audio_data.json
rename to tests/test_data/expected_audio_data.json
diff --git a/tests/test_data/expected_catalog_data.json b/tests/test_data/expected_catalog_data.json
new file mode 100644
index 0000000..013df61
--- /dev/null
+++ b/tests/test_data/expected_catalog_data.json
@@ -0,0 +1,32 @@
+{
+    "0": {
+        "full_audio_id": "STT_NW0802",
+        "sr_no": 2,
+        "audio_url": "https://www.rfa.org/tibetan/sargyur/golok-china-religious-restriction-08202024054225.html/@@stream",
+        "reference_transcript": "ཕྱི་ལོ་ ༢༠༢༤ ཟླ་ ༨ ནང་རྒྱ་ནག་གཞུང་གིས་མཚོ་སྔོན་ཞིང་ཆེན་མགོ་ལོག་ཁུལ་བཙུགས་ནས་ལོ་འཁོར་ ༧༠ འཁོར་བའི་མཛད་སྒོ་འཚོགས་ཡོད་པ་བཞིན། ཕྱི་ཟླ་ ༨ པའི་ནང་རྒྱ་ནག་གཞུང་གིས་མགོ་ལོག་ཁུལ་མངའ་ཁུལ་གྱི་ས་གནས་གང་སར་དམ་དྲག་ཤུགས་ཆེར་བྱས་ཡོད་པ་དང་། ལྷག་པར་དུ་དགོན་སྡེ་ཁག་ཏུ་ཆོས་ཕྱོགས་ཀྱི་བྱེད་སྒོ་ལ་དམ་དྲག་དང་བདེ་འཇགས་ཆེད་ཡིན་སྐོར་བརྗོད་ནས། དགོན་པའི་འདུ་ཁང་དང་གྲྭ་ཤག་ཁག་ཏུ་སྲིད་གཞུང་གི་ལས་བྱེད་གང་བྱུང་ཡོང་སྟེ་སྔོགས་བཤེར་དང་། ས་གནས་ཁག་ཏུ་དར་ལྕོགས་འཐེན་པ་དང་ཆོས་ལུགས་ཀྱི་མཚོན་རྟགས་རིགས་གཙང་བཟོ་བྱེད་བཞིན་པ། དགོན་པ་དང་མི་མང་གི་ཆོས་ཕྱོགས་ཀྱི་བྱེད་སྒོར་དོ་དམ་སྟངས་འཛིན་སྔར་ལྷག་བྱེད་བཞིན་ཡོད་པ། བོད་ནང་གི་འབྲེལ་ཡོད་བོད་མི་གཉིས་ནས་འདི་ག་ཨེ་ཤེ་ཡ་རང་དབང་རླུང་འཕྲིན་ཁང་ལ་འགྲེལ་བརྗོད་གནང་བྱུང་།\nགོང་ཞུས་འབྲེལ་ཡོད་བོད་མི་ཞིག་ནས་ཉེ་སྔོན་མགོ་ལོག་རྨ་ཆེན་རྫོང་ཁོངས་སུ་ཡོད་པའི་རཱ་རྒྱ་གངས་ལྗོངས་ཤེས་རིག་ནོར་བུའི་གླིང་སློབ་གྲྭ་གཏོར་བཤིག་དང་འབྲེལ། རྨ་ཆེན་རྫོང་གི་མི་མང་དང་དགོན་སྡེ་ཁག་ཏུ་ཚོགས་འདུ་འདྲ་མིན་ཚོགས་ནས། རྒྱ་ནག་དམར་ཤོག་ཚོགས་པའི་ཆབ་སྲིད་སློབ་གསོ་སྤྲད་པ་དང་། དགོན་པའི་ནང་དུ་ཆབ་སྲིད་ཀྱི་ཉེན་ཁ་ཡོད་པའི་ཅ་དངོས་ཉར་ཚགས་མི་ཆོག་སྐོར་བརྗོད་ནས་འདུ་ཁང་དང་གྲྭ་ཤག་རེ་རེ་བཞིན་སྔོགས་བཤེར་དང་། དེ་བཞིན་མགོ་ལོག་པདྨ་རྫོང་སོགས་ནས་གྲོང་ཚོ་ཁག་ཏུ་ཉུལ་ཞིབ་ཀྱིས། ཡུལ་མི་ཚོའི་ཁྱིམ་ཚང་གི་སྒོ་ཁར་འཐེན་པའི་དར་ལྕོགས་དང་། ཡུལ་དེར་མི་འདས་གྲོངས་ཕྱིན་ཚེ་དར་ལྕོགས་འཐེན་སྲོལ་ཡོད་པ་སོགས་ཀྱི་ཆོས་ལུགས་ཆོ་ག་གཉེར་མི་ཆོག་པའི་དམ་དྲག་ཤུགས་ཆེར་བྱེད་བཞིན་ཡོད་སྐོར་འགྲེལ་བརྗོད་གནང་བྱུང་།\nསྤྱིར་རྒྱ་ནག་གཞུང་གིས་འདི་ལོའི་ཟླ་ ༧ པའི་ནང་རྒྱ་ནག་གཞུང་གི་འབྲེལ་ཡོད་མི་སྣ་ཚོས་བོད་མདོ་སྨད་རྔ་པ་རྫོང་གི་ཡུལ་ཚོ་དང་གྲོང་སྡེ་ཁག་ཏུ་ཉུལ་ཞིབ་ཀྱིས། སྡོད་ཁང་གི་ཕྱི་ཕྱོགས་སུ་ཆོས་ལུགས་ཀྱི་མཚོན་རྟགས་ཡོད་པའི་ཅ་དངོས་རིགས་གང་ཡང་བཞག་མི་ཆོག་པའི་བཙན་ཐབས་སྣ་ཚོགས་བྱས་ཡོད་པ་རེད་ལ། ཡང་འབྲེལ་ཡོད་བོད་མི་གཞན་དེས་མགོ་ལོག་དར་ལོག་རྫོང་ཁོངས་སུ་བོད་མི་ཏང་ཡོན་ལས་བྱེད་ཚོ་ཆོས་ལུགས་བྱེད་སྒོར་ཞུགས་མི་ཆོག་པ་དང་། ལུས་ལ་ཆོས་ལུགས་ཀྱི་མཚོན་རྟགས་དང་དགོན་པ་ཁག་ཏུ་གནས་མཇལ་ལ་རྩ་ནས་འགྲོ་མི་ཆོག་པའི་དམ་དྲག་དང་། ས་གནས་ཁག་ཏུ་ཁྲིམས་ལུགས་སློབ་གསོ་ཞེས་མི་མང་བསྡུ་བསྐོངས་ཀྱིས་རྒྱ་ནག་གཅིག་གྱུར་དང་། 
རྒྱ་ནག་དམར་ཤོག་ཚོགས་པར་དགའ་ཞེན་དགོས་པ་དྲིལ་བསྒྲགས་བྱེད་བཞིན་ཡོད་པ་འགྲེལ་བརྗོད་གནང་གི་འདུག\nརྒྱ་ནག་གཞུང་འབྲེལ་གསར་ལམ་ཁག་ཏུ་མགོ་ལོག་རྨ་ཆེན་རྫོང་དུ། འཐབ་ཕྱོགས་གཅིག་གྱུར་ལས་དོན་དང་འབྲེལ་ཡོད་ལས་བྱེད་དགོན་པ་ཁག་ཏུ་ཉུལ་ཞིབ་ཀྱིས། ཆོས་ཕྱོགས་ལས་དོན་གྱི་བདེ་འཇགས་ཉེན་ཁ་ལ་ཞིབ་ཚགས་ཀྱིས་ལྟ་ཞིབ་བྱས་ཏེ་རྫོང་ཡོངས་སུ་ཆོས་ཕྱོགས་ཀྱི་བདེ་འཇགས་ཉེན་ཁ་མི་ཡོང་བའི་འགན་ལེན་ཆེད་ཞིབ་བཤེར་དང་། མ་འོངས་པར་མུ་མཐུད་ཆོས་ལུགས་བྱེད་སྒོ་དང་དགོན་སྡེའི་དོ་དམ་ཐད་ཤུགས་སྣོན་བྱ་རྒྱུ་ཡིན་སྐོར་བཀོད་འདུག་ལ། དེ་བཞིན་གནས་ཚུལ་གཞན་ཁག་ཏུ་མགོ་ལོག་ཁུལ་འབྲིང་རིམ་མི་དམངས་ཁྲིམས་ཁང་གིས། དབྱར་ཁ་དང་བསྟུན་མི་མང་འདུ་སྡོད་ཁུལ་དུ་སྐོར་སྐྱོད་ཀྱིས་ཁྲིམས་འགལ་གྱི་གྱོད་དོན་སྐོར་གནས་ཚུལ་བསྡུ་ལེན་དང་། དམར་ཤོག་ཚོགས་པའི་དྲིལ་བསྒྲགས་སློབ་གསོ་སྤེལ་བཞིན་པའི་གནས་ཚུལ་སོགས་བཀོད་འདུག\nའདས་པའི་ལོ་ཤས་ནས་བཟུང་རྒྱ་ནག་གཞུང་གིས་བོད་ནང་གི་དགོན་སྡེ་ཁག་ཏུ། ནང་བསྟན་རྒྱ་སྒྱུར་གྱི་ལས་འགུལ་འོག་དམ་དྲག་ཤུགས་ཆེར་བྱེད་བཞིན་པོ་ལྟར། འདི་ལོའི་ཕྱི་ཟླ་ ༥ པའི་ནང་དགོན་པའི་དགེ་འདུན་པར་དོ་དམ་ཤུགས་སྣོན་ཞེས། དགོན་སྡེ་ཁག་ཏུ་དྲ་ལམ་བདེ་འཇགས་ཐད་གཟབ་ནན་བྱས་ཏེ། རྒྱ་ནག་གཞུང་གི་ཆབ་སྲིད་དང་འགལ་བའི་སྐད་ཆ་མི་བཤད་པ་དང་། ཡི་གེ་མི་བསྐུར་བ། མི་རིགས་མཐུན་སྒྲིལ་གྱི་ཕྱོགས་ལ་གནོད་འཚེ་གཏོང་བའི་སྐད་ཆ་དང་ཡི་གེ་སྐུར་མི་ཆོག་པ་སོགས་ཀྱི་དམ་དྲག་དང་། དཔལ་ཡུལ་དར་ཐང་དགོན་ཆེན་གྱི་དགེ་འདུན་པ་རྒྱལ་བ་ལགས་ནས། སྤྱི་ཚོགས་དྲ་ལམ་དུ་མཆན་ཞིག་བྲིས་པ་ཆབ་སྲིད་དང་འགལ་ཡོད་སྐོར་བརྗོད་ནས་འཛིན་བཟུང་གིས་གར་སོང་ཆ་མེད་དུ་གྱུར་བ་སོགས་ཀྱི་གནས་ཚུལ་བྱུང་ཡོད་པ་རེད།\nགསར་འགོད་པ། བདེ་སྐྱིད་ཀུན་སྒྲོལ།\nདྲྭ་ཐོག་སྤེལ་མཁན། བདེ་སྐྱིད་ཀུན་སྒྲོལ།",
+        "speaker_name": "བདེ་སྐྱིད་ཀུན་སྒྲོལ།",
+        "speaker_gender": "",
+        "news_channel": "RFA",
+        "publishing_year": "2024.08.20"
+    },
+    "1": {
+        "full_audio_id": "STT_NW0805",
+        "sr_no": 5,
+        "audio_url": "https://www.rfa.org/tibetan/sargyur/vpn-china-restriction-08152024081404.html/@@stream",
+        "reference_transcript": "ཕྱི་ལོ་ ༢༠༢༤ ཟླ་ ༧ ཟླ་མཇུག་ཙམ་ལ་རྒྱ་ནག་གཞུང་གིས་བསྐྱར་དུ་རྒྱ་ནག་རྒྱལ་ཡོངས་སུ་མི་མང་ཐོག་སྟངས་འཛིན་ཤུགས་ཆེ་རུ་གཏོང་ཐབས་སུ། སྤྱི་ཚོགས་དྲ་ལམ་སྒོ་འབྱེད་ཆེད་མི་སྒེར་གྱི་ཞིབ་ཕྲའི་གནས་ཚུལ་མཁོ་སྤྲོད་བྱེད་དགོས་པའི་སྲིད་བྱུས་ཤིག་གསལ་བསྒྲགས་བྱས་ཡོད་ན་ཡང་། ཕྱི་ཟླ་ ༨ ཟླ་འགོ་ནས་བཟུང་བོད་ཀྱི་རྒྱལ་ས་ལྷ་ས་གཙོས་བོད་ནང་གི་ས་ཁུལ་མང་དག་ཅིག་ཏུ་སྤྱི་ཚོགས་དྲ་ལམ་ཐོག་སྟངས་འཛིན་གྱི་བྱ་ཐབས་ལག་བསྟར་བྱེད་བཞིན་ཡོད་པ་དང་།   བོད་ཀྱི་རྒྱལ་ས་ལྷ་ས་སོགས་སུ་སྤྱི་ཚོགས་དྲ་ལམ་ནང་གཞུང་གི་བཀོད་ཁྱབ་བཞིན་ཐོ་འགོད་མ་བྱས་ཚེ་གཞུང་ངོས་ནས་འབོད་འགུག་གིས་བསྐྱར་དུ་ཐོ་འགོད་དང་འབྲེལ་ཁ་པར་ལ་ཞིབ་བཤེར་བྱེད་ཀྱི་ཡོད་ཅིང་།  དམིགས་བཀར་དེ་སྔ་ (VPN) དྲ་ལམ་བརྒྱུད་གཞུང་ཕྱོགས་ཀྱི་བཀག་འགོག་ལས་གཡོལ་ཏེ་བོད་ནང་ནས་འཛུལ་མི་ཐུབ་པའི་དྲ་ཚིགས་ཁ་ཕྱེ་ཡོད་མེད་དང་།  ཕྱི་ཕྱོགས་སུ་འབྲེལ་བ་བྱས་ཡོད་མེད་སོགས་འདྲི་རྩད་བྱེད་བཞིན་ཡོད་པའི་སྐོར་བོད་ནང་གི་འབྲེལ་ཡོད་བོད་མི་གཉིས་ཀྱིས་འདི་ག་ཨེ་ཤེ་ཡ་རང་དབང་རླུང་འཕྲིན་ཁང་ལ་འགྲེལ་བརྗོད་གནང་བྱུང་།\nརྒྱ་ནག་གཞུང་འབྲེལ་གྱིས་གསལ་བསྒྲགས་བཏང་བའི་ནང་རྒྱ་ནག་གི་སྤྱི་འབངས་སྒེར་གྱི་གནས་ཚུལ་སྲུང་སྐྱོབ་དང་དྲ་རྒྱའི་སྤྱི་ཚོགས་ཀྱི་དོ་དམ་ཚད་ལྡན་ཡོང་ཆེད་རྒྱ་ནག་མི་དམངས་སྤྱི་མཐུན་རྒྱལ་ཁབ་ཀྱི་གློག་འཕྲིན་དྲ་བའི་གཡོ་སྒྱུ་འགོག་ཐབས་ཀྱི་ཁྲིམས་ལུགས་སྒྲིག་གཞི་ཁག་གཞིར་བཟུང་སྟེ་མི་མང་བདེ་འཇགས་ལས་ཁུངས་དང་། རྒྱལ་ཡོངས་དྲ་བའི་བརྡ་འཕྲིན་ལས་ཁུངས་སོགས་ཀྱིས་མཉམ་འབྲེལ་ངང་ལས་གཞི་དེ་སྤེལ་རྒྱུ་ཡིན་པས་མི་མང་ངོས་ནས་དེར་བསམ་ཚུལ་བཤད་དབང་ཡོད་པ་སོགས་བསྒྲགས་ཡོད་ན་ཡང་འབྲེལ་ཡོད་གཞན་དེས་བོད་མི་ཚོར་བསམ་ཚུལ་ཞིག་བཤད་ས་གནའ་ནས་ད་བར་ཡོད་མ་མྱོང་ལ།   ད་རེས་ཡིན་ན་ཡང་བསམ་ཚུལ་བཤད་ཆོག་པ་ཞིག་རྩ་བ་ནས་མ་བྱུང་པར་གཞུང་གིས་ལག་བསྟར་བྱེད་བཞིན་འདུག   དེ་སྔ་ཐོབ་ཐང་ལག་འཁྱེར་མེད་པར་ཁ་པར་ཨང་གྲངས་བརྒྱུད་སྤྱི་ཚོགས་དྲ་ལམ་ཁག་ལ་ཐོ་འགོད་བྱེད་ཐུབ་པ་དང་།  ཐོབ་ཐང་ལག་འཁྱེར་དང་སྦྲེལ་དགོས་པའི་ཆ་རྐྱེན་འདོན་བཞིན་ཡོད་ཀྱང་མ་མཐུད་ན་ཡང་ལས་ཀ་མང་པོ་ཞིག་བྱེད་ཐུབ་ཀྱི་ཡོད་པ་དང་།  ཐེངས་འདིར་དེ་སྔ་དང་མི་འདྲ་བར་གློག་རྡུལ་ལག་འཁྱེར་དང་མཉམ་དུ་སྦྲེལ་ནས་ཐོ་འགོད་བྱས་པ་ཞིག་ཆགས་ཡོད་སྐོར་འགྲེལ་བརྗོད་གནང་བྱུང་།\nགནས་ཚུལ་འགྲེལ་བརྗོད་པ་གཅིག་གིས་གསུངས་དོན། སྤྱི་ཚོགས་དྲ་ལམ་བེད་སྤྱོད་བྱེད་པར་བསྐྱར་དུ་ཐོ་འགོད་ཞིབ་བཤེར་ངེས་པར་དུ་བྱེད་དགོས་འདུག་ལ།  
དེ་ནི་མི་སྒེར་གྱི་ཁ་པར་དང་སྤྱི་འབངས་ཀྱི་ཐོབ་ཐང་ལག་འཁྱེར་སོགས་དང་འབྲེལ་མཐུད་བྱས་ཏེ་གཞུང་གི་སྟངས་འཛིན་འོག་ཡོད་པའི་སྟབས་བདེའི་གསང་བའི་ཨང་གྲངས་ཤིག་མཁོ་སྤྲོད་བྱས་པ་དང་།   དེ་ནས་བསྐྱར་དུ་ཐོ་འགོད་བྱ་དགོས་ཡོད་འདུག་ཅེས་གསུངས་བྱུང་ལ།    གཙོ་བོ་སྤྱི་ཚོགས་དྲ་ལམ་སྐད་འཕྲིན་དང་མགྱོགས་འཕྲིན།  ཁྱུག་དབྱིངས་སོགས་བེད་སྤྱོད་ཆེད་ཡིན་པ་དང་།  སྤྱིར་དེ་སྔ་ནས་སྤྱི་འབངས་ཀྱི་ཐོབ་ཐང་ལག་འཁྱེར་སོགས་མཁོ་སྤྲོད་བྱས་ཡོད་པས་ཁྱད་པར་ག་རེ་ཡོད་མེད་སོགས་ཞིབ་ཕྲ་ཤེས་རྟོགས་ཐུབ་ཀྱི་མེད་སྐོར་བརྗོད་བྱུང་ལ།   བོད་ནང་གི་སྤྱི་ཚོགས་དྲ་ལམ་ཁག་ཏུ་ཐད་གཏོང་བྱ་སྐབས་ངེས་པར་དུ་མི་སྒེར་གྱི་ཞིབ་ཕྲའི་གནས་ཚུལ་མཁོ་སྤྲོད་དགོས་པ་དང་དམ་དྲག་ཤུགས་ཆེ་ཡོད་སྐོར་གྱི་གནས་ཚུལ་གླེང་མཁན་ཡོད་པ་མཐོང་རྒྱུ་འདུག་ཅེས་གསུངས་བྱུང་།\nསྤྱིར་རྒྱ་ནག་གཞུང་གིས་དེང་སྐབས་བོད་ཀྱི་རྒྱལ་ས་ལྷ་ས་ཁུལ་དུ་དངོས་ཤུགས་གཉིས་ནས་དམ་དྲག་ཤུགས་ཆེར་བྱེད་བཞིན་ཡོད་པ་དང་།  ཕྱི་ཟླ་ ༨ ཟླ་འགོར་ལྷ་ས་སྤྱི་བདེ་ཉེན་རྟོག་དྲ་རྒྱའི་སྟེང་གསལ་བསྒྲགས་སྤེལ་ཏེ། ལྷ་སར་དྲ་རྒྱའི་ལྕགས་རི་བརྒལ་མཁན་གསུམ་རྩད་ཆོད་འཛིན་བཟུང་གིས་(སྲིད་འཛིན་)ཆད་པ་བཅད་དེ་གློད་ཡོད་པ་བཀོད་འདུག་ལ། དེ་བཞིན་ཕྱི་ཟླ་༨ ཚེས་༡༣ ཉིན་ལྷ་སའི་ཉེན་རྟོག་ཕྱོགས་ནས་མི་མང་བདེ་འཇགས་དང་བརྟན་ལྷིང་ཆེད། ལྷ་སའི་ཉེན་རྟོག་ཚན་པས་ཉིན་མཚན་གཉིས་ལ་དྲ་ལམ་དང་སྲང་ལམ་ཡོངས་སུ་ཞིབ་བཤེར་བྱེད་བཞིན་པ་བརྗོད་འདུག་ལ།  སྤྱིར་རྒྱ་ནག་གཞུང་གིས་འདས་པའི་ལོ་ཤས་ནས་བཟུང་སྤྱི་ཚོགས་དྲ་ལམ་ཁག་ནང་བོད་མི་ཚོར་དམ་དྲག་དང་།  ཐ་ན་རྒྱ་ནག་གི་སྤྱི་ཚོགས་དྲ་ལམ་མགྱོགས་འཕྲིན་ཟེར་བའི་ནང་བོད་མི་ཚོས་བོད་ཡིག་དང་བོད་སྐད་བེད་སྤྱོད་བྱས་ན་བཀག་བསྡོམས་བྱེད་བཞིན་པ་སོགས་ཀྱི་གནས་ཚུལ་ཐོན་ཡོད།\nགསར་འགོད་པ། བདེ་སྐྱིད་ཀུན་སྒྲོལ།\nརྩོམ་སྒྲིག་པ། བསྟན་འཛིན་པདྨ། བཀྲ་ཤིས་དབང་ཕྱུག\nདྲྭ་ཐོག་སྤེལ་མཁན། བདེ་སྐྱིད་ཀུན་སྒྲོལ།",
+        "speaker_name": "བདེ་སྐྱིད་ཀུན་སྒྲོལ།",
+        "speaker_gender": "",
+        "news_channel": "RFA",
+        "publishing_year": "2024.08.15"
+    },
+    "2": {
+        "full_audio_id": "STT_NW0811",
+        "sr_no": 11,
+        "audio_url": "https://www.rfa.org/tibetan/sargyur/gonpo-kyi-demans-dorjee-tashi-release-04282023020304.html/@@stream",
+        "reference_transcript": "ཉེ་སྔོན་ཕྱི་ཟླ་༤ཚེས་༢༥ཉིན་བོད་ཀྱི་ཚོང་བ་ཆེན་མོ་དང་རྒྱ་ནག་གི་བཙོན་འོག་ཏུ་ཚེ་བཙོན་གྱི་ཁྲིམས་ཆད་འཁུར་བཞིན་པ་རྡོ་རྗེ་བཀྲ་ཤིས་ལགས་ཀྱི་གཅེན་མོ་མགོན་པོ་སྐྱིད་ནས། རྒྱ་ནག་གཞུང་འབྲེལ་བོད་ལྗོངས་མི་མང་ཁྲིམས་ཁང་མདུན་དུ་རྡོ་རྗེ་བཀྲ་ཤིས་ལགས་ཀྱི་ཁྲིམས་ཆད་ལ་བསྐྱར་ཞིབ་དང་དྲང་བདེན་གྱི་ཐག་གཅོད་དགོས་པའི་ངོ་རྒོལ་དང་འབྲེལ། རྒྱ་ནག་གཞུང་འབྲེལ་ལྷ་ས་སྤྱི་བདེ་མི་སྣས་མགོན་པོ་སྐྱིད་དང་ཁོང་གི་བཟའ་ཟླ་ཆོས་སྐྱོང་གཉིས་འཛིན་བཟུང་བཙན་ཁྲིད་ཀྱིས། ལྷ་ས་བཀག་སྐྱིལ་ཁང་དུ་ཉིན་མཚན་གཉིས་རིང་བརྡུང་རྡེག་བྱས་ཡོད་པ། བོད་ཕྱི་ནང་གཉིས་ཀྱི་འབྲེལ་ཡོད་བོད་མི་གཉིས་ནས་འདི་ག་ཨེ་ཤི་ཡ་རང་དབང་རླུང་འཕྲིན་ཁང་ལ་གནས་ཚུལ་མཁོ་སྤྲོད་གནང་བྱུང་།\nགོང་ཞུས་བོད་ནང་གི་གནས་ཚུལ་མཁོ་སྤྲོད་པས་མུ་མཐུད་བརྗོད་དོན་ལ། ཕྱི་ཚེས་༢༥ཉིན་གཅེན་མོ་མགོན་པོ་སྐྱིད་དང་ཁོང་གི་བཟའ་ཟླ་གཉིས་བོད་ལྗོངས་མི་མང་ཁྲིམས་ཁང་མདུན་དུ་ངོ་རྒོལ་དང་འབྲེལ། རྒྱ་ནག་ཉེན་རྟོག་པས་མགོན་པོ་སྐྱིད་ཀྱི་ངོ་རྒོལ་སྐབས་མི་མང་གིས་མི་མཐོང་ཆེད་རས་ཡོལ་ནག་པོས་མཐའ་སྐོར་བ་སོགས་བྱས་ཡོད་པ་དང་། ཕྱི་ཚེས་༢༦ཉིན་གྱི་སྔ་དྲོ་མགོན་པོ་སྐྱིད་དང་ཁོང་གི་བཟའ་ཟླ་གཉིས་བསྐྱར་དུ་འཛིན་བཟུང་གིས་ལྷ་ས་བཀག་སྐྱིལ་ཁང་དུ་ཕྱི་ཚེས་༢༧གྱི་མཚན་དཀྱིལ་བར་རྡུང་རྡེག་འདྲི་རྩད་བྱས་ཏེ་གློད་ཡོད་ཀྱང་། བསྐྱར་དུ་ངོ་རྒོལ་རིགས་བྱས་ཚེ་ཁོང་གཉིས་དངོས་སུ་འཛིན་བཟུང་བྱ་རྒྱུའི་འཇིག་སྐུལ་བྱས་ཡོད་པ་སོགས་འགྲེལ་བརྗོད་གནང་བྱུང་།\nཡང་འབྲེལ་ཡོད་ནས་མགོན་པོ་སྐྱིད་ལགས་ནས་དྲ་ལམ་དུ་ངོ་རྒོལ་གྱི་བརྙན་ཞིག་མཁོ་སྤྲོད་གནང་བའི་ནང་། བརྙན་ནང་དུ་མགོན་པོ་སྐྱིད་ལགས་ནས་རྡོ་རྗེ་བཀྲ་ཤིས་ལགས་ཀྱི་ཚེ་བཙོན་ཁྲིམས་ཆད་ལ་ཁྲིམས་ཞིབ་པས་བསྐྱར་ཞིབ་བྱ་བཅུག་དགོས་པ་དང་། ཁོང་ལ་ཁྲིམས་ཁང་གིས་དྲང་བདེན་མིན་པའི་ཁྲིམས་ཐག་བཅད་པར་བསྐྱར་དུ་ཐག་གཅོད་དགོས་པའི་དོན་ཚན་གཉིས་སྟོན་བཞིན། ཁོང་མོ་དང་བཟའ་ཟླ་གཉིས་འཛིན་བཟུང་དང་བཅར་བརྡུང་བྱས་པའི་སྐོར་གསལ་བཤད་དང་འབྲེལ། ད་ལྟའི་ཆར་ལྷ་ས་སྤྱི་བདེ་ཉེན་རྟོག་པ་ཟེར་བས་ཁོང་གཉིས་དངོས་སུ་འཛིན་བཟུང་བྱ་རྒྱུ་བརྗོད་སོང་ན་ཡང་། ཉེས་པ་གང་ཡང་མེད་པའི་ཁོ་མོའི་སྤུན་མཆེད་རྡོ་རྗེ་བཀྲ་ཤིས་ལགས་ཀློད་གྲོལ་མ་བཏང་བར་བསྐྱར་དུ་འཛིན་བཟུང་བྱ་བཅུག་རྒྱུ་དང་། མ་བསད་བར་དུ་ལས་འགུལ་མཚམས་འཇོག་བྱ་རྒྱུ་མིན་པ་སོགས་བརྗོད་འགྲེལ་བརྗོད་གནང་གི་འདུག\nསྤྱིར་ཉེ་སྔོན་ཕྱི་ཟླ་༢ནང་རྒྱ་ནག་གཞུང་གིས་བོད་ཀྱི་ཚོང་བ་ཆེན་མོ་རྡོ་རྗེ་བཀྲ་ཤིས་ལགས་ཀྱི་ནང་མི་ཚོར། 
ངོ་རྒོལ་ལས་འགུལ་དང་ཞུ་གཏུགས་རིགས་མཚམས་འཇོག་དགོས་པ་དང་། རྡོ་རྗེ་བཀྲ་ཤིས་ལགས་ཀྱི་ཁྲིམས་ཆད་ལ་བསྐྱར་ཞིབ་བྱ་རྒྱུ་ཡིན་པ་ཁས་ལེན་བྱས་ཡོད་ཀྱང་། ཕྱི་ཟླ་༤ཚེས་མགོར་རྒྱ་ནག་གཞུང་འབྲེལ་ལྷ་ས་ཉེན་རྟོག་ཚན་པས་རྡོ་རྗེ་བཀྲ་ཤིས་ལགས་ཀྱི་ཁྲིམས་ཆད་ལ་འགྱུར་བ་གང་ཡང་མེད་པར་བརྗོད་ཡོད་སྟབས། ཁོང་གི་གཅེན་མོ་གཙོས་ནང་མི་ཚོས་ཕྱི་ཟླ་༤ནང་རྡོ་རྗེ་བཀྲ་ཤིས་ལགས་བཙོན་འཇུག་བྱས་ཡོད་པའི་བོད་ལྗོངས་ཁྲིམས་ཁང་མདུན་དུ་ངོ་རྒོལ་རིམས་པར་བྱ་དགོས་བྱུང་ཡོད་པ་རེད།\nརྒྱ་ནག་གཞུང་གིས་རྡོ་རྗེ་བཀྲ་ཤིས་ལགས་སུ་ཕྱི་ལོ་༢༠༠༨ལོར་བོད་ནང་ས་བྱི་ཞི་རྒོལ་ཆེན་མོའི་སྐབས་བོད་མི་ཞི་རྒོལ་བ་ཚོར་རྒྱབ་སྐྱོར་བྱས་ལུགས་དང་བཙན་བྱོལ་བོད་མིའི་སྤྱི་ཚོགས་ཁྲོད་མ་དངུལ་གྱི་ཞལ་འདེབས་སྤྲད་ལུགས་བརྗོད་ནས་ཚེ་བཙོན་གྱི་ཁྲིམས་ཐག་བཅད་དེ་ལོ་ངོ་༡༥འགྲོ་བཞིན་ཡོད་པ་དང་། ཁོང་གི་གཅེན་པོ་རྡོ་རྗེ་ཚེ་བརྟན་ལགས་དང་གཅེན་མོ་མགོན་པོ་སྐྱིད་དང་། དེ་བཞིན་གཅེན་མོའི་བཟའ་ཟླ་ཆོས་སྐྱོང་ལགས་གསུམ་གྱིས་ཐེངས་མང་ལ་ཡོངས་གྲགས་ངང་ཁོང་ཀློད་གྲོལ་དགོས་པའི་ཞུ་གཏུག་ངོ་རྒོལ་ལས་འགུལ་སྤེལ་ཡོད་པ་རེད་ལ། རྒྱ་ནག་གཞུང་གིས་རྡོ་རྗེ་ཚེ་བརྟན་ཡང་འཛིན་བཟུང་གིས་མི་ལོ་༦བཙོན་འཇུག་ཁྲིམས་ཐག་བཅད་མྱོང་བ་དང་། མགོན་པོ་སྐྱིད་ཀྱང་སྔ་རྗེས་སུ་ཐེངས་༥ལ་འཛིན་བཟུང་དང་འདྲི་རྩད་གཅར་བརྡུང་ཐེངས་མང་བྱས་ཡོད་པའི་གནས་ཚུལ་ཐོན་ཡོད་པ་རེད།\nགསར་འགོད་པ། སངས་རྒྱས་དཀོན་མཆོག  ཡེ་ཤེས་ཟླ་བ།དྲྭ་ཐོག་སྤེལ་མཁན། ཡེ་ཤེས་ཟླ་བ།",
+        "speaker_name": "སངས་རྒྱས་དཀོན་མཆོག།",
+        "speaker_gender": "",
+        "news_channel": "RFA",
+        "publishing_year": "2023.04.28"
+    }
+}
diff --git a/tests/test_validation.py b/tests/test_validation.py
index 4214741..352ec8d 100644
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -11,7 +11,7 @@ def test_validation():
         "ཕྱི་ཟླ་བརྒྱད་པའི་ནང་རྒྱ་ནག་གཞུང་གི་མགོ་ལོག་མངའ་ཁུལ་གྱི་ས་གནས་གང་སར་དམ་བསྒྲགས་ཤུགས་ཆེར་ཆེ་ཡོད་པ་དང་།",
         "",
     )
-    assert not is_valid_transcript(
+    assert is_valid_transcript(
         "ལྷག་པར་དགན་སྡེ་ཁག་ཏུ་ཆོས་ཕྱོགས་ཀྱི་བྱེད་སྒོ་ལ་དམ་སྒྲགས་དང་།",
         "ལྷག་པར་དུ་དགོན་སྡེ་ཁག་ཏུ་ཆོས་ཕྱོགས་ཀྱི་བྱེད་སྒོ་ལ་དམ་དྲག་དང་བདེ་འཇགས་ཆེད་ཡིན་སྐོར་བརྗོད་ནས།",
     )