Merge pull request #36 from rmusser01/main

Fixes for transcription pipeline; Transcription display on the audio tab; RAG Libary -> RAG Library; checkbox for VAD
the-crypt-keeper · Oct 14, 2024 · 08cebf5
2 parents 9503e7f + 4e560c6
commit 08cebf5
Show file tree

Hide file tree

Showing 12 changed files with 65 additions and 26 deletions.
diff --git a/App_Function_Libraries/Audio/Audio_Files.py b/App_Function_Libraries/Audio/Audio_Files.py
@@ -268,18 +268,23 @@ def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key
     progress = []
     all_transcriptions = []
     all_summaries = []
-
+    #v2
     def format_transcription_with_timestamps(segments):
         if keep_timestamps:
             formatted_segments = []
             for segment in segments:
                 start = segment.get('Time_Start', 0)
                 end = segment.get('Time_End', 0)
-                text = segment.get('Text', '')
+                text = segment.get('Text', '').strip()  # Ensure text is stripped of leading/trailing spaces
+
+                # Add the formatted timestamp and text to the list, followed by a newline
                 formatted_segments.append(f"[{start:.2f}-{end:.2f}] {text}")
-            return " ".join(formatted_segments)
+
+            # Join the segments with a newline to ensure proper formatting
+            return "\n".join(formatted_segments)
         else:
-            return " ".join([segment.get('Text', '') for segment in segments])
+            # Join the text without timestamps
+            return "\n".join([segment.get('Text', '').strip() for segment in segments])
 
     def update_progress(message):
         progress.append(message)

diff --git a/App_Function_Libraries/Audio/Audio_Transcription_Lib.py b/App_Function_Libraries/Audio/Audio_Transcription_Lib.py
@@ -16,6 +16,7 @@
 import gc
 import json
 import logging
+import multiprocessing
 import os
 import queue
 import sys
@@ -45,7 +46,7 @@
 whisper_model_instance = None
 config = load_comprehensive_config()
 processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
-
+total_thread_count = multiprocessing.cpu_count()
 
 
 class WhisperModel(OriginalWhisperModel):
@@ -55,7 +56,7 @@ class WhisperModel(OriginalWhisperModel):
     valid_model_sizes = [
         "tiny.en", "tiny", "base.en", "base", "small.en", "small", "medium.en", "medium",
         "large-v1", "large-v2", "large-v3", "large", "distil-large-v2", "distil-medium.en",
-        "distil-small.en", "distil-large-v3"
+        "distil-small.en", "distil-large-v3",
     ]
 
     def __init__(
@@ -64,7 +65,7 @@ def __init__(
         device: str = processing_choice,
         device_index: Union[int, List[int]] = 0,
         compute_type: str = "default",
-        cpu_threads: int = 16,
+        cpu_threads: int = 0,#total_thread_count, FIXME - I think this should be 0
         num_workers: int = 1,
         download_root: Optional[str] = None,
         local_files_only: bool = False,
@@ -199,11 +200,13 @@ def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='me
             return segments
 
         logging.info('speech-to-text: Starting transcription...')
-        options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
+        # FIXME - revisit this
+        options = dict(language=selected_source_lang, beam_size=10, best_of=10, vad_filter=vad_filter)
         transcribe_options = dict(task="transcribe", **options)
         # use function and config at top of file
         logging.debug("speech-to-text: Using whisper model: %s", whisper_model)
         whisper_model_instance = get_whisper_model(whisper_model, processing_choice)
+        # faster_whisper transcription right here - FIXME -test batching - ha
         segments_raw, info = whisper_model_instance.transcribe(audio_file_path, **transcribe_options)
 
         segments = []
@@ -216,7 +219,7 @@ def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='me
             logging.debug("Segment: %s", chunk)
             segments.append(chunk)
             # Print to verify its working
-            print(f"{segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")
+            logging.info(f"{segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")
 
             # Log it as well.
             logging.debug(

diff --git a/App_Function_Libraries/Gradio_UI/RAG_Chat_tab.py b/App_Function_Libraries/Gradio_UI/RAG_Chat_tab.py
@@ -9,7 +9,7 @@
 #
 # Local Imports
 
-from App_Function_Libraries.RAG.RAG_Libary_2 import enhanced_rag_pipeline
+from App_Function_Libraries.RAG.RAG_Library_2 import enhanced_rag_pipeline
 #
 ########################################################################################################################
 #

diff --git a/App_Function_Libraries/Gradio_UI/RAG_QA_Chat_tab.py b/App_Function_Libraries/Gradio_UI/RAG_QA_Chat_tab.py
@@ -15,7 +15,7 @@
 from App_Function_Libraries.Books.Book_Ingestion_Lib import read_epub
 from App_Function_Libraries.DB.DB_Manager import DatabaseError, get_paginated_files, add_media_with_keywords
 from App_Function_Libraries.PDF.PDF_Ingestion_Lib import extract_text_and_format_from_pdf
-from App_Function_Libraries.RAG.RAG_Libary_2 import generate_answer, enhanced_rag_pipeline
+from App_Function_Libraries.RAG.RAG_Library_2 import generate_answer, enhanced_rag_pipeline
 from App_Function_Libraries.RAG.RAG_QA_Chat import search_database, rag_qa_chat
 # Eventually... FIXME
 from App_Function_Libraries.RAG.RAG_QA_Chat import load_chat_history, save_chat_history

diff --git a/App_Function_Libraries/Gradio_UI/Video_transcription_tab.py b/App_Function_Libraries/Gradio_UI/Video_transcription_tab.py
@@ -43,6 +43,7 @@ def create_video_transcription_tab():
                                        lines=5)
                 video_file_input = gr.File(label="Upload Video File (Optional)", file_types=["video/*"])
                 diarize_input = gr.Checkbox(label="Enable Speaker Diarization", value=False)
+                vad_checkbox = gr.Checkbox(label="Enable Voice-Audio-Detection(VAD)", value=True)
                 whisper_model_input = gr.Dropdown(choices=whisper_models, value="medium", label="Whisper Model")
 
                 with gr.Row():
@@ -185,7 +186,7 @@ def update_prompts(preset_name):
                 download_summary = gr.File(label="Download All Summaries as Text")
 
             @error_handler
-            def process_videos_with_error_handling(inputs, start_time, end_time, diarize, whisper_model,
+            def process_videos_with_error_handling(inputs, start_time, end_time, diarize, vad_use, whisper_model,
                                                    custom_prompt_checkbox, custom_prompt, chunking_options_checkbox,
                                                    chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
                                                    use_multi_level_chunking, chunk_language, api_name,
@@ -301,7 +302,7 @@ def process_videos_with_error_handling(inputs, start_time, end_time, diarize, wh
                                     input_item, 2, whisper_model,
                                     custom_prompt,
                                     start_seconds, api_name, api_key,
-                                    False, False, False, False, 0.01, None, keywords, None, diarize,
+                                    vad_use, False, False, False, 0.01, None, keywords, None, diarize,
                                     end_time=end_seconds,
                                     include_timestamps=timestamp_option,
                                     metadata=video_metadata,
@@ -425,7 +426,7 @@ def process_videos_with_error_handling(inputs, start_time, end_time, diarize, wh
                         None
                     )
 
-            def process_videos_wrapper(url_input, video_file, start_time, end_time, diarize, whisper_model,
+            def process_videos_wrapper(url_input, video_file, start_time, end_time, diarize, vad_use, whisper_model,
                                        custom_prompt_checkbox, custom_prompt, chunking_options_checkbox,
                                        chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
                                        use_multi_level_chunking, chunk_language, summarize_recursively, api_name,
@@ -460,7 +461,7 @@ def process_videos_wrapper(url_input, video_file, start_time, end_time, diarize,
                         raise ValueError("No input provided. Please enter URLs or upload a video file.")
 
                     result = process_videos_with_error_handling(
-                        inputs, start_time, end_time, diarize, whisper_model,
+                        inputs, start_time, end_time, diarize, vad_use, whisper_model,
                         custom_prompt_checkbox, custom_prompt, chunking_options_checkbox,
                         chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
                         use_multi_level_chunking, chunk_language, api_name,
@@ -507,6 +508,7 @@ def process_url_with_metadata(input_item, num_speakers, whisper_model, custom_pr
                 try:
                     logging.info(f"Starting process_url_metadata for URL: {input_item}")
                     # Create download path
+
                     download_path = create_download_directory("Video_Downloads")
                     logging.info(f"Download path created at: {download_path}")
 
@@ -743,15 +745,37 @@ def toggle_confabulation_output(checkbox_value):
                 inputs=[confab_checkbox],
                 outputs=[confabulation_output]
             )
+
             process_button.click(
                 fn=process_videos_wrapper,
                 inputs=[
-                    url_input, video_file_input, start_time_input, end_time_input, diarize_input, whisper_model_input,
-                    custom_prompt_checkbox, custom_prompt_input, chunking_options_checkbox,
-                    chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
-                    use_multi_level_chunking, chunk_language, summarize_recursively, api_name_input, api_key_input,
-                    keywords_input, use_cookies_input, cookies_input, batch_size_input,
-                    timestamp_option, keep_original_video, confab_checkbox, overwrite_checkbox
+                    url_input,
+                    video_file_input,
+                    start_time_input,
+                    end_time_input,
+                    diarize_input,
+                    vad_checkbox,
+                    whisper_model_input,
+                    custom_prompt_checkbox,
+                    custom_prompt_input,
+                    chunking_options_checkbox,
+                    chunk_method,
+                    max_chunk_size,
+                    chunk_overlap,
+                    use_adaptive_chunking,
+                    use_multi_level_chunking,
+                    chunk_language,
+                    summarize_recursively,
+                    api_name_input,
+                    api_key_input,
+                    keywords_input,
+                    use_cookies_input,
+                    cookies_input,
+                    batch_size_input,
+                    timestamp_option,
+                    keep_original_video,
+                    confab_checkbox,
+                    overwrite_checkbox
                 ],
                 outputs=[progress_output, error_output, results_output, download_transcription, download_summary, confabulation_output]
             )
diff --git a/App_Function_Libraries/RAG/RAG_Libary_2.py b/App_Function_Libraries/RAG/RAG_Library_2.py
diff --git a/App_Function_Libraries/RAG/RAG_QA_Chat.py b/App_Function_Libraries/RAG/RAG_QA_Chat.py
@@ -12,7 +12,7 @@
 #
 # Local Imports
 from App_Function_Libraries.DB.DB_Manager import db, search_db, DatabaseError, get_media_content
-from App_Function_Libraries.RAG.RAG_Libary_2 import generate_answer
+from App_Function_Libraries.RAG.RAG_Library_2 import generate_answer
 #
 ########################################################################################################################
 #

diff --git a/App_Function_Libraries/Summarization/Summarization_General_Lib.py b/App_Function_Libraries/Summarization/Summarization_General_Lib.py
@@ -1151,7 +1151,7 @@ def perform_transcription(video_path, offset, whisper_model, vad_filter, diarize
 
         return audio_file_path, diarized_segments
 
-    # Non-diarized transcription (existing functionality)
+    # Non-diarized transcription
     if os.path.exists(segments_json_path):
         logging.info(f"Segments file already exists: {segments_json_path}")
         try:

diff --git a/Docs/Issues/ISSUES.md b/Docs/Issues/ISSUES.md
@@ -12,3 +12,10 @@ Writing
     https://github.com/EMNLP-2024-CritiCS/Collective-Critics-for-Creative-Story-Generation
 
 
+Update model suggestions for RAG vs Chatting/General use
+    https://huggingface.co/THUDM/glm-4-9b-chat/blob/main/README_en.md
+    https://huggingface.co/byroneverson/glm-4-9b-chat-abliterated-gguf/tree/main
+
+Whisper pipeline
+    https://huggingface.co/spaces/aadnk/faster-whisper-webui
+    https://huggingface.co/spaces/zhang082799/openai-whisper-large-v3-turbo
diff --git a/Tests/RAG/test_RAG_Library_2.py b/Tests/RAG/test_RAG_Library_2.py
@@ -12,7 +12,7 @@
 sys.path.append(parent_dir)
 
 # Import the functions to test
-from App_Function_Libraries.RAG.RAG_Libary_2 import (
+from App_Function_Libraries.RAG.RAG_Library_2 import (
     fetch_relevant_media_ids,
     perform_vector_search,
     perform_full_text_search

diff --git a/Tests/RAG/test_enhanced_rag_pipeline.py b/Tests/RAG/test_enhanced_rag_pipeline.py
@@ -9,7 +9,7 @@
 sys.path.append(parent_dir)
 
 # Now import the necessary modules
-from App_Function_Libraries.RAG.RAG_Libary_2 import enhanced_rag_pipeline
+from App_Function_Libraries.RAG.RAG_Library_2 import enhanced_rag_pipeline
 from App_Function_Libraries.RAG.Embeddings_Create import create_embeddings_batch
 from App_Function_Libraries.RAG.ChromaDB_Library import vector_search
 

diff --git a/summarize.py b/summarize.py
@@ -69,7 +69,7 @@
 #
 # Global variables
 whisper_models = ["small", "medium", "small.en", "medium.en", "medium", "large", "large-v1", "large-v2", "large-v3",
-                  "distil-large-v2", "distil-medium.en", "distil-small.en"]
+                  "distil-large-v2", "distil-medium.en", "distil-small.en", ]
 server_mode = False
 share_public = False