Skip to content

Commit

Permalink
[TASK] Allow precision config int8_float16 for faster-whisper
Browse files Browse the repository at this point in the history
[TASK] Allow setting beam_size, cpu_threads and num_workers
  • Loading branch information
Sharrnah committed Mar 13, 2023
1 parent cf06522 commit d20a5e4
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 24 deletions.
20 changes: 12 additions & 8 deletions Models/STT/faster_whisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,12 +192,12 @@ def download_model(model: str, compute_type: str = "float32"):
model_cache_path = Path(".cache/whisper")
os.makedirs(model_cache_path, exist_ok=True)
model_path = Path(model_cache_path / (model + "-ct2"))
if compute_type == "float16":
if compute_type == "float16" or compute_type == "int8_float16" or compute_type == "int16" or compute_type == "int8":
model_path = Path(model_cache_path / (model + "-ct2-fp16"))

pretrained_lang_model_file = Path(model_path / "model.bin")

if not Path(model_path).exists() or pretrained_lang_model_file.is_file():
if not Path(model_path).exists() or not pretrained_lang_model_file.is_file():
print("downloading faster-whisper...")
if not downloader.download_extract(MODEL_LINKS[model][compute_type]["urls"],
str(model_cache_path.resolve()),
Expand All @@ -208,23 +208,26 @@ def download_model(model: str, compute_type: str = "float32"):
class FasterWhisper:
model = None

def __init__(self, model: str, device: str = "cpu", compute_type: str = "float32"):
def __init__(self, model: str, device: str = "cpu", compute_type: str = "float32", cpu_threads: int = 0,
num_workers: int = 1):
if self.model is None:
self.load_model(model, device, compute_type)
self.load_model(model, device, compute_type, cpu_threads, num_workers)

def load_model(self, model: str, device: str = "cpu", compute_type: str = "float32"):
def load_model(self, model: str, device: str = "cpu", compute_type: str = "float32", cpu_threads: int = 0,
num_workers: int = 1):
model_cache_path = Path(".cache/whisper")
os.makedirs(model_cache_path, exist_ok=True)
model_path = Path(model_cache_path / (model + "-ct2"))
if compute_type == "float16":
if compute_type == "float16" or compute_type == "int8_float16":
model_path = Path(model_cache_path / (model + "-ct2-fp16"))

print("loading faster-whisper...")
self.model = WhisperModel(str(Path(model_path).resolve()), device=device, compute_type=compute_type)
self.model = WhisperModel(str(Path(model_path).resolve()), device=device, compute_type=compute_type,
cpu_threads=cpu_threads, num_workers=num_workers)

def transcribe(self, audio_sample, task, language, condition_on_previous_text,
initial_prompt, logprob_threshold, no_speech_threshold,
temperature) -> dict:
temperature, beam_size) -> dict:

result_segments, audio_info = self.model.transcribe(audio_sample, task=task,
language=language,
Expand All @@ -233,6 +236,7 @@ def transcribe(self, audio_sample, task, language, condition_on_previous_text,
log_prob_threshold=logprob_threshold,
no_speech_threshold=no_speech_threshold,
temperature=temperature,
beam_size=beam_size,
without_timestamps=True
)

Expand Down
9 changes: 4 additions & 5 deletions audioWhisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,12 +275,11 @@ def main(ctx, devices, device_index, sample_rate, dynamic_energy, open_browser,
# Load faster-whisper model
if settings.GetOption("faster_whisper"):
whisper_model = settings.GetOption("model")
if settings.GetOption("fp16"):
compute_dtype = "float16"
else:
compute_dtype = "float32"
whisper_precision = settings.GetOption("whisper_precision")
# download the model here since it's only possible in the main thread
faster_whisper.download_model(whisper_model, compute_dtype)
websocket.set_loading_state("downloading_whisper_model", True)
faster_whisper.download_model(whisper_model, whisper_precision)
websocket.set_loading_state("downloading_whisper_model", False)

# prepare the plugin timer calls
call_plugin_timer()
Expand Down
21 changes: 13 additions & 8 deletions audioprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,15 +187,15 @@ def send_message(predicted_text, result_obj):


def load_whisper(model, ai_device):
cpu_threads = settings.GetOption("whisper_cpu_threads")
num_workers = settings.GetOption("whisper_num_workers")
if not settings.GetOption("faster_whisper"):
return whisper.load_model(model, download_root=".cache/whisper", device=ai_device)
else:
if settings.GetOption("fp16"):
compute_dtype = "float16"
else:
compute_dtype = "float32"
compute_dtype = settings.GetOption("whisper_precision")

return faster_whisper.FasterWhisper(model, device=ai_device, compute_type=compute_dtype)
return faster_whisper.FasterWhisper(model, device=ai_device, compute_type=compute_dtype,
cpu_threads=cpu_threads, num_workers=num_workers)


def convert_audio(audio_bytes: bytes):
Expand Down Expand Up @@ -238,7 +238,7 @@ def whisper_worker():
whisper_condition_on_previous_text = settings.GetOption("condition_on_previous_text")
whisper_logprob_threshold = settings.GetOption("logprob_threshold")
whisper_no_speech_threshold = settings.GetOption("no_speech_threshold")
whisper_fp16 = settings.GetOption("fp16")
whisper_beam_size = settings.GetOption("beam_size")

whisper_temperature_fallback = settings.GetOption("temperature_fallback")
whisper_temperature_fallback_option = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)
Expand Down Expand Up @@ -268,13 +268,17 @@ def whisper_worker():

if not settings.GetOption("faster_whisper"):
# official whisper model
whisper_fp16 = False
if settings.GetOption("whisper_precision") == "float16": # set precision
whisper_fp16 = True
result = audio_model.transcribe(audio_sample, task=whisper_task, language=whisper_language,
condition_on_previous_text=whisper_condition_on_previous_text,
initial_prompt=whisper_initial_prompt,
logprob_threshold=whisper_logprob_threshold,
no_speech_threshold=whisper_no_speech_threshold,
fp16=whisper_fp16,
temperature=whisper_temperature_fallback_option
temperature=whisper_temperature_fallback_option,
beam_size=whisper_beam_size
)
else:
# faster whisper
Expand All @@ -284,7 +288,8 @@ def whisper_worker():
initial_prompt=whisper_initial_prompt,
logprob_threshold=whisper_logprob_threshold,
no_speech_threshold=whisper_no_speech_threshold,
temperature=whisper_temperature_fallback_option)
temperature=whisper_temperature_fallback_option,
beam_size=whisper_beam_size)

whisper_result_handling(result)
except Exception as e:
Expand Down
10 changes: 7 additions & 3 deletions settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,17 @@
"initial_prompt": "", # initial prompt for Whisper. for example "Umm, let me think like, hmm... Okay, here's what I'm, like, thinking." will give more filler words.
"logprob_threshold": "-1.0",
"no_speech_threshold": "0.6",
"whisper_precision": "float32", # for original Whisper can be "float16" or "float32", for faster-whisper "default", "auto", "int8", "int8_float16", "int16", "float16", "float32".
"faster_whisper": False, # Set to True to use faster whisper.
"temperature_fallback": True, # Set to False to disable temperature fallback which is the reason for some slowdowns, but decreases quality.
"beam_size": 5, # Beam size for beam search. (higher = more accurate, but slower)
"whisper_cpu_threads": 0, # Number of threads to use when running on CPU (4 by default)
"whisper_num_workers": 1, # When transcribe() is called from multiple Python threads
"vad_enabled": True, # Enable Voice activity detection (VAD)
"vad_on_full_clip": False, # Make an additional VAD check on the full clip (Not only on each frame).
"vad_confidence_threshold": "0.4", # Voice activity detection (VAD) confidence threshold. Can be 0-1
"vad_num_samples": 3000, # Voice activity detection (VAD) sample size (how many audio samples should be tested).
"vad_thread_num": 1, # number of threads to use for VAD.
"fp16": False, # Set to True to use FP16 instead of FP32.
"faster_whisper": False, # Set to True to use faster whisper.
"temperature_fallback": True, # Set to False to disable temperature fallback which is the reason for some slowdowns, but decreases quality.

# OSC settings
"osc_ip": "127.0.0.1", # OSC IP address. set to "0" to disable.
Expand Down Expand Up @@ -161,6 +164,7 @@ def GetAvailableSettingValues():
"llm_model": ["flan", "bloomz", "gptj", "pygmalion"],
"tts_prosody_rate": ["", "x-slow", "slow", "medium", "fast", "x-fast"],
"tts_prosody_pitch": ["", "x-low", "low", "medium", "high", "x-high"],
"whisper_precision": ["float32", "float16", "int16", "int8_float16", "int8"],
}

return possible_settings

0 comments on commit d20a5e4

Please sign in to comment.