From 260e554c1a574e6b85af6322f20b32ecb1f2a94e Mon Sep 17 00:00:00 2001 From: Purfview <69023953+Purfview@users.noreply.github.com> Date: Tue, 10 Dec 2024 18:35:02 +0000 Subject: [PATCH 1/4] Fixes OOM Errors - too high RAM usage by VAD Reported problems: https://github.com/SYSTRAN/faster-whisper/issues/1193 https://github.com/SYSTRAN/faster-whisper/issues/1169 The VAD implementation consumes a huge amount of memory [the original Silero doesn't have this problem]. This PR should fix the OOM problem. An alternative solution could be removing 'lru_cache'. --- faster_whisper/vad.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/faster_whisper/vad.py b/faster_whisper/vad.py index 9605931c..fa790acf 100644 --- a/faster_whisper/vad.py +++ b/faster_whisper/vad.py @@ -260,8 +260,9 @@ def __init__(self, encoder_path, decoder_path): ) from e opts = onnxruntime.SessionOptions() - opts.inter_op_num_threads = 0 - opts.intra_op_num_threads = 0 + opts.inter_op_num_threads = 1 + opts.intra_op_num_threads = 1 + opts.enable_cpu_mem_arena = False opts.log_severity_level = 4 self.encoder_session = onnxruntime.InferenceSession( From 8068472f5c4c02352002bc562931e6351a619ac4 Mon Sep 17 00:00:00 2001 From: Purfview <69023953+Purfview@users.noreply.github.com> Date: Wed, 11 Dec 2024 13:34:22 +0000 Subject: [PATCH 2/4] Reduce RAM usage further --- faster_whisper/vad.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/faster_whisper/vad.py b/faster_whisper/vad.py index fa790acf..2375c8af 100644 --- a/faster_whisper/vad.py +++ b/faster_whisper/vad.py @@ -302,7 +302,15 @@ def __call__( batched_audio = batched_audio.reshape(-1, num_samples + context_size_samples) - encoder_output = self.encoder_session.run(None, {"input": batched_audio})[0] + batch_process_size = 10000 + num_segments = batched_audio.shape[0] + encoder_outputs = [] + for start in range(0, num_segments, batch_process_size): + end = min(start + batch_process_size, num_segments) + encoder_output =
self.encoder_session.run(None, {"input": batched_audio[start:end]})[0] + encoder_outputs.append(encoder_output) + + encoder_output = np.concatenate(encoder_outputs, axis=0) encoder_output = encoder_output.reshape(batch_size, -1, 128) decoder_outputs = [] From ce6e29808d7cd7feb628eb79d84acbc249bb6680 Mon Sep 17 00:00:00 2001 From: Purfview <69023953+Purfview@users.noreply.github.com> Date: Wed, 11 Dec 2024 13:44:12 +0000 Subject: [PATCH 3/4] style --- faster_whisper/vad.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/faster_whisper/vad.py b/faster_whisper/vad.py index 2375c8af..f91be787 100644 --- a/faster_whisper/vad.py +++ b/faster_whisper/vad.py @@ -307,7 +307,9 @@ def __call__( encoder_outputs = [] for start in range(0, num_segments, batch_process_size): end = min(start + batch_process_size, num_segments) - encoder_output = self.encoder_session.run(None, {"input": batched_audio[start:end]})[0] + encoder_output = self.encoder_session.run( + None, {"input": batched_audio[start:end]} + )[0] encoder_outputs.append(encoder_output) encoder_output = np.concatenate(encoder_outputs, axis=0) From 4863a1f14404b329a92f0456b8857daf276a05c8 Mon Sep 17 00:00:00 2001 From: Mahmoud Ashraf Date: Thu, 12 Dec 2024 15:20:13 +0300 Subject: [PATCH 4/4] .. 
--- faster_whisper/vad.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/faster_whisper/vad.py b/faster_whisper/vad.py index f91be787..1f7d2057 100644 --- a/faster_whisper/vad.py +++ b/faster_whisper/vad.py @@ -302,13 +302,12 @@ def __call__( batched_audio = batched_audio.reshape(-1, num_samples + context_size_samples) - batch_process_size = 10000 + encoder_batch_size = 10000 num_segments = batched_audio.shape[0] encoder_outputs = [] - for start in range(0, num_segments, batch_process_size): - end = min(start + batch_process_size, num_segments) + for i in range(0, num_segments, encoder_batch_size): encoder_output = self.encoder_session.run( - None, {"input": batched_audio[start:end]} + None, {"input": batched_audio[i : i + encoder_batch_size]} )[0] encoder_outputs.append(encoder_output)