Skip to content

Commit

Permalink
[TASK] Allow precision config int8_float16 for faster-whisper
Browse files Browse the repository at this point in the history
[TASK] Allow setting beam_size, cpu_threads and num_workers
  • Loading branch information
Sharrnah committed Mar 13, 2023
1 parent cf06522 commit d20a5e4
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 24 deletions.
20 changes: 12 additions & 8 deletions Models/STT/faster_whisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,12 +192,12 @@ def download_model(model: str, compute_type: str = "float32"):
model_cache_path = Path(".cache/whisper")
os.makedirs(model_cache_path, exist_ok=True)
model_path = Path(model_cache_path / (model + "-ct2"))
if compute_type == "float16":
if compute_type == "float16" or compute_type == "int8_float16" or compute_type == "int16" or compute_type == "int8":
model_path = Path(model_cache_path / (model + "-ct2-fp16"))

pretrained_lang_model_file = Path(model_path / "model.bin")

if not Path(model_path).exists() or pretrained_lang_model_file.is_file():
if not Path(model_path).exists() or not pretrained_lang_model_file.is_file():
print("downloading faster-whisper...")
if not downloader.download_extract(MODEL_LINKS[model][compute_type]["urls"],
str(model_cache_path.resolve()),
Expand All @@ -208,23 +208,26 @@ def download_model(model: str, compute_type: str = "float32"):
class FasterWhisper:
model = None

def __init__(self, model: str, device: str = "cpu", compute_type: str = "float32"):
def __init__(self, model: str, device: str = "cpu", compute_type: str = "float32", cpu_threads: int = 0,
num_workers: int = 1):
if self.model is None:
self.load_model(model, device, compute_type)
self.load_model(model, device, compute_type, cpu_threads, num_workers)

def load_model(self, model: str, device: str = "cpu", compute_type: str = "float32"):
def load_model(self, model: str, device: str = "cpu", compute_type: str = "float32", cpu_threads: int = 0,
num_workers: int = 1):
model_cache_path = Path(".cache/whisper")
os.makedirs(model_cache_path, exist_ok=True)
model_path = Path(model_cache_path / (model + "-ct2"))
if compute_type == "float16":
if compute_type == "float16" or compute_type == "int8_float16":
model_path = Path(model_cache_path / (model + "-ct2-fp16"))

print("loading faster-whisper...")
self.model = WhisperModel(str(Path(model_path).resolve()), device=device, compute_type=compute_type)
self.model = WhisperModel(str(Path(model_path).resolve()), device=device, compute_type=compute_type,
cpu_threads=cpu_threads, num_workers=num_workers)

def transcribe(self, audio_sample, task, language, condition_on_previous_text,
initial_prompt, logprob_threshold, no_speech_threshold,
temperature) -> dict:
temperature, beam_size) -> dict:

result_segments, audio_info = self.model.transcribe(audio_sample, task=task,
language=language,
Expand All @@ -233,6 +236,7 @@ def transcribe(self, audio_sample, task, language, condition_on_previous_text,
log_prob_threshold=logprob_threshold,
no_speech_threshold=no_speech_threshold,
temperature=temperature,
beam_size=beam_size,
without_timestamps=True
)

Expand Down
9 changes: 4 additions & 5 deletions audioWhisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,12 +275,11 @@ def main(ctx, devices, device_index, sample_rate, dynamic_energy, open_browser,
# Load faster-whisper model
if settings.GetOption("faster_whisper"):
whisper_model = settings.GetOption("model")
if settings.GetOption("fp16"):
compute_dtype = "float16"
else:
compute_dtype = "float32"
whisper_precision = settings.GetOption("whisper_precision")
# download the model here since it's only possible in the main thread
faster_whisper.download_model(whisper_model, compute_dtype)
websocket.set_loading_state("downloading_whisper_model", True)
faster_whisper.download_model(whisper_model, whisper_precision)
websocket.set_loading_state("downloading_whisper_model", False)

# prepare the plugin timer calls
call_plugin_timer()
Expand Down
21 changes: 13 additions & 8 deletions audioprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,15 +187,15 @@ def send_message(predicted_text, result_obj):


def load_whisper(model, ai_device):
cpu_threads = settings.GetOption("whisper_cpu_threads")
num_workers = settings.GetOption("whisper_num_workers")
if not settings.GetOption("faster_whisper"):
return whisper.load_model(model, download_root=".cache/whisper", device=ai_device)
else:
if settings.GetOption("fp16"):
compute_dtype = "float16"
else:
compute_dtype = "float32"
compute_dtype = settings.GetOption("whisper_precision")

return faster_whisper.FasterWhisper(model, device=ai_device, compute_type=compute_dtype)
return faster_whisper.FasterWhisper(model, device=ai_device, compute_type=compute_dtype,
cpu_threads=cpu_threads, num_workers=num_workers)


def convert_audio(audio_bytes: bytes):
Expand Down Expand Up @@ -238,7 +238,7 @@ def whisper_worker():
whisper_condition_on_previous_text = settings.GetOption("condition_on_previous_text")
whisper_logprob_threshold = settings.GetOption("logprob_threshold")
whisper_no_speech_threshold = settings.GetOption("no_speech_threshold")
whisper_fp16 = settings.GetOption("fp16")
whisper_beam_size = settings.GetOption("beam_size")

whisper_temperature_fallback = settings.GetOption("temperature_fallback")
whisper_temperature_fallback_option = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)
Expand Down Expand Up @@ -268,13 +268,17 @@ def whisper_worker():

if not settings.GetOption("faster_whisper"):
# official whisper model
whisper_fp16 = False
if settings.GetOption("whisper_precision") == "float16": # set precision
whisper_fp16 = True
result = audio_model.transcribe(audio_sample, task=whisper_task, language=whisper_language,
condition_on_previous_text=whisper_condition_on_previous_text,
initial_prompt=whisper_initial_prompt,
logprob_threshold=whisper_logprob_threshold,
no_speech_threshold=whisper_no_speech_threshold,
fp16=whisper_fp16,
temperature=whisper_temperature_fallback_option
temperature=whisper_temperature_fallback_option,
beam_size=whisper_beam_size
)
else:
# faster whisper
Expand All @@ -284,7 +288,8 @@ def whisper_worker():
initial_prompt=whisper_initial_prompt,
logprob_threshold=whisper_logprob_threshold,
no_speech_threshold=whisper_no_speech_threshold,
temperature=whisper_temperature_fallback_option)
temperature=whisper_temperature_fallback_option,
beam_size=whisper_beam_size)

whisper_result_handling(result)
except Exception as e:
Expand Down
10 changes: 7 additions & 3 deletions settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,17 @@
"initial_prompt": "", # initial prompt for Whisper. for example "Umm, let me think like, hmm... Okay, here's what I'm, like, thinking." will give more filler words.
"logprob_threshold": "-1.0",
"no_speech_threshold": "0.6",
"whisper_precision": "float32", # for original Whisper can be "float16" or "float32", for faster-whisper "default", "auto", "int8", "int8_float16", "int16", "float16", "float32".
"faster_whisper": False, # Set to True to use faster whisper.
"temperature_fallback": True, # Set to False to disable temperature fallback which is the reason for some slowdowns, but decreases quality.
"beam_size": 5, # Beam size for beam search. (higher = more accurate, but slower)
"whisper_cpu_threads": 0, # Number of threads to use when running on CPU (4 by default)
"whisper_num_workers": 1, # When transcribe() is called from multiple Python threads
"vad_enabled": True, # Enable Voice activity detection (VAD)
"vad_on_full_clip": False, # Make an additional VAD check on the full clip (Not only on each frame).
"vad_confidence_threshold": "0.4", # Voice activity detection (VAD) confidence threshold. Can be 0-1
"vad_num_samples": 3000, # Voice activity detection (VAD) sample size (how many audio samples should be tested).
"vad_thread_num": 1, # number of threads to use for VAD.
"fp16": False, # Set to True to use FP16 instead of FP32.
"faster_whisper": False, # Set to True to use faster whisper.
"temperature_fallback": True, # Set to False to disable temperature fallback which is the reason for some slowdowns, but decreases quality.

# OSC settings
"osc_ip": "127.0.0.1", # OSC IP address. set to "0" to disable.
Expand Down Expand Up @@ -161,6 +164,7 @@ def GetAvailableSettingValues():
"llm_model": ["flan", "bloomz", "gptj", "pygmalion"],
"tts_prosody_rate": ["", "x-slow", "slow", "medium", "fast", "x-fast"],
"tts_prosody_pitch": ["", "x-low", "low", "medium", "high", "x-high"],
"whisper_precision": ["float32", "float16", "int16", "int8_float16", "int8"],
}

return possible_settings

0 comments on commit d20a5e4

Please sign in to comment.