From be4aedd8e5d365b0542e132ed8f0f8f5fbe93ddf Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sat, 14 Sep 2024 12:08:56 +0800 Subject: [PATCH] Update Pascal API to support max speech duration in VAD --- .github/workflows/dot-net.yaml | 2 ++ java-api-examples/VadRemoveSilence.java | 1 + lazarus-examples/generate_subtitles/my_init.pas | 3 ++- .../vad-remove-non-speech-segments-from-file.py | 9 +++++++++ sherpa-onnx/pascal-api/sherpa_onnx.pas | 9 +++++++-- 5 files changed, 21 insertions(+), 3 deletions(-) diff --git a/.github/workflows/dot-net.yaml b/.github/workflows/dot-net.yaml index 5299b19ce..36637a9e2 100644 --- a/.github/workflows/dot-net.yaml +++ b/.github/workflows/dot-net.yaml @@ -93,6 +93,8 @@ jobs: git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface cd huggingface + git fetch + git pull mkdir -p windows-for-dotnet cp -v ../sherpa-onnx-*.tar.bz2 ./windows-for-dotnet diff --git a/java-api-examples/VadRemoveSilence.java b/java-api-examples/VadRemoveSilence.java index 3af1caa7f..511a508e4 100644 --- a/java-api-examples/VadRemoveSilence.java +++ b/java-api-examples/VadRemoveSilence.java @@ -19,6 +19,7 @@ public static void main(String[] args) { .setMinSilenceDuration(0.25f) .setMinSpeechDuration(0.5f) .setWindowSize(512) + .setMaxSpeechDuration(5.0f) .build(); VadModelConfig config = diff --git a/lazarus-examples/generate_subtitles/my_init.pas b/lazarus-examples/generate_subtitles/my_init.pas index 55df79f15..d57448b6d 100644 --- a/lazarus-examples/generate_subtitles/my_init.pas +++ b/lazarus-examples/generate_subtitles/my_init.pas @@ -48,8 +48,9 @@ function CreateVad(VadFilename: AnsiString): TSherpaOnnxVoiceActivityDetector; WindowSize := 512; {Please don't change it unless you know the details} Config.SileroVad.Model := VadFilename; - Config.SileroVad.MinSpeechDuration := 0.5; + Config.SileroVad.MinSpeechDuration := 0.25; Config.SileroVad.MinSilenceDuration := 0.5; + Config.SileroVad.MaxSpeechDuration := 5.0; Config.SileroVad.Threshold := 0.5; Config.SileroVad.WindowSize := WindowSize; Config.NumThreads:= 2; diff --git a/python-api-examples/vad-remove-non-speech-segments-from-file.py b/python-api-examples/vad-remove-non-speech-segments-from-file.py index f559e6519..ad4814487 100755 --- a/python-api-examples/vad-remove-non-speech-segments-from-file.py +++ b/python-api-examples/vad-remove-non-speech-segments-from-file.py @@ -90,6 +90,15 @@ def main(): config = sherpa_onnx.VadModelConfig() config.silero_vad.model = args.silero_vad_model + config.silero_vad.threshold = 0.5 + config.silero_vad.min_silence_duration = 0.25 # seconds + config.silero_vad.min_speech_duration = 0.25 # seconds + + # If the current segment is larger than this value, then it increases + # the threshold to 0.9 internally. After detecting this segment, + # it resets the threshold to its original value. + config.silero_vad.max_speech_duration = 5 # seconds + config.sample_rate = sample_rate window_size = config.silero_vad.window_size diff --git a/sherpa-onnx/pascal-api/sherpa_onnx.pas b/sherpa-onnx/pascal-api/sherpa_onnx.pas index 987b31f14..7f05793e1 100644 --- a/sherpa-onnx/pascal-api/sherpa_onnx.pas +++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas @@ -341,6 +341,7 @@ TSherpaOnnxSileroVadModelConfig = record MinSilenceDuration: Single; MinSpeechDuration: Single; WindowSize: Integer; + MaxSpeechDuration: Single; function ToString: AnsiString; class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig); end; @@ -594,6 +595,7 @@ SherpaOnnxSileroVadModelConfig = record MinSilenceDuration: cfloat; MinSpeechDuration: cfloat; WindowSize: cint32; + MaxSpeechDuration: cfloat; end; SherpaOnnxVadModelConfig = record SileroVad: SherpaOnnxSileroVadModelConfig; @@ -1402,10 +1404,11 @@ function TSherpaOnnxSileroVadModelConfig.ToString: AnsiString; 'Threshold := %.2f, ' + 'MinSilenceDuration := %.2f, ' + 'MinSpeechDuration := %.2f, ' + - 'WindowSize := %d' + + 'WindowSize := %d, ' + + 'MaxSpeechDuration := %.2f' + ')', [Self.Model, Self.Threshold, Self.MinSilenceDuration, - Self.MinSpeechDuration, Self.WindowSize + Self.MinSpeechDuration, Self.WindowSize, Self.MaxSpeechDuration ]); end; @@ -1415,6 +1418,7 @@ function TSherpaOnnxSileroVadModelConfig.ToString: AnsiString; Dest.MinSilenceDuration := 0.5; Dest.MinSpeechDuration := 0.25; Dest.WindowSize := 512; + Dest.MaxSpeechDuration := 5.0; end; function TSherpaOnnxVadModelConfig.ToString: AnsiString; @@ -1569,6 +1573,7 @@ constructor TSherpaOnnxVoiceActivityDetector.Create(Config: TSherpaOnnxVadModelC C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration; C.SileroVad.MinSpeechDuration := Config.SileroVad.MinSpeechDuration; C.SileroVad.WindowSize := Config.SileroVad.WindowSize; + C.SileroVad.MaxSpeechDuration := Config.SileroVad.MaxSpeechDuration; C.SampleRate := Config.SampleRate; C.NumThreads := Config.NumThreads;