Skip to content

Commit

Permalink
Add APIs about max speech duration in VAD for various programming lan…
Browse files Browse the repository at this point in the history
…guages (#1349)
  • Loading branch information
csukuangfj authored Sep 14, 2024
1 parent 1423ddb commit e7ffcbd
Show file tree
Hide file tree
Showing 31 changed files with 88 additions and 9 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/dot-net.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ jobs:
git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface
cd huggingface
git fetch
git pull
mkdir -p windows-for-dotnet
cp -v ../sherpa-onnx-*.tar.bz2 ./windows-for-dotnet
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ void main(List<String> arguments) async {
model: sileroVad,
minSilenceDuration: 0.25,
minSpeechDuration: 0.5,
maxSpeechDuration: 5.0,
);

final vadConfig = sherpa_onnx.VadModelConfig(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ void main(List<String> arguments) async {
model: sileroVad,
minSilenceDuration: 0.25,
minSpeechDuration: 0.5,
maxSpeechDuration: 5.0,
);

final vadConfig = sherpa_onnx.VadModelConfig(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ void main(List<String> arguments) async {
model: sileroVad,
minSilenceDuration: 0.25,
minSpeechDuration: 0.5,
maxSpeechDuration: 5.0,
);

final vadConfig = sherpa_onnx.VadModelConfig(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ void main(List<String> arguments) async {
model: sileroVad,
minSilenceDuration: 0.25,
minSpeechDuration: 0.5,
maxSpeechDuration: 5.0,
);

final vadConfig = sherpa_onnx.VadModelConfig(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ void main(List<String> arguments) async {
model: sileroVad,
minSilenceDuration: 0.25,
minSpeechDuration: 0.5,
maxSpeechDuration: 5.0,
);

final vadConfig = sherpa_onnx.VadModelConfig(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ void main(List<String> arguments) async {
model: sileroVad,
minSilenceDuration: 0.25,
minSpeechDuration: 0.5,
maxSpeechDuration: 5.0,
);

final vadConfig = sherpa_onnx.VadModelConfig(
Expand Down
3 changes: 3 additions & 0 deletions flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,9 @@ final class SherpaOnnxSileroVadModelConfig extends Struct {

@Int32()
external int windowSize;

@Float()
external double maxSpeechDuration;
}

final class SherpaOnnxVadModelConfig extends Struct {
Expand Down
7 changes: 5 additions & 2 deletions flutter/sherpa_onnx/lib/src/vad.dart
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,20 @@ class SileroVadModelConfig {
this.threshold = 0.5,
this.minSilenceDuration = 0.5,
this.minSpeechDuration = 0.25,
this.windowSize = 512});
this.windowSize = 512,
this.maxSpeechDuration = 5.0});

@override
String toString() {
return 'SileroVadModelConfig(model: $model, threshold: $threshold, minSilenceDuration: $minSilenceDuration, minSpeechDuration: $minSpeechDuration, windowSize: $windowSize)';
return 'SileroVadModelConfig(model: $model, threshold: $threshold, minSilenceDuration: $minSilenceDuration, minSpeechDuration: $minSpeechDuration, windowSize: $windowSize, maxSpeechDuration: $maxSpeechDuration)';
}

final String model;
final double threshold;
final double minSilenceDuration;
final double minSpeechDuration;
final int windowSize;
final double maxSpeechDuration;
}

class VadModelConfig {
Expand Down Expand Up @@ -127,6 +129,7 @@ class VoiceActivityDetector {
c.ref.sileroVad.minSilenceDuration = config.sileroVad.minSilenceDuration;
c.ref.sileroVad.minSpeechDuration = config.sileroVad.minSpeechDuration;
c.ref.sileroVad.windowSize = config.sileroVad.windowSize;
c.ref.sileroVad.maxSpeechDuration = config.sileroVad.maxSpeechDuration;

c.ref.sampleRate = config.sampleRate;
c.ref.numThreads = config.numThreads;
Expand Down
1 change: 1 addition & 0 deletions go-api-examples/vad-asr-paraformer/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ func main() {
config.SileroVad.MinSilenceDuration = 0.5
config.SileroVad.MinSpeechDuration = 0.25
config.SileroVad.WindowSize = 512
config.SileroVad.MaxSpeechDuration = 5.0
config.SampleRate = 16000
config.NumThreads = 1
config.Provider = "cpu"
Expand Down
1 change: 1 addition & 0 deletions go-api-examples/vad-asr-whisper/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ func main() {
config.SileroVad.MinSilenceDuration = 0.5
config.SileroVad.MinSpeechDuration = 0.25
config.SileroVad.WindowSize = 512
config.SileroVad.MaxSpeechDuration = 5.0
config.SampleRate = 16000
config.NumThreads = 1
config.Provider = "cpu"
Expand Down
1 change: 1 addition & 0 deletions java-api-examples/VadNonStreamingParaformer.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ public static Vad createVad() {
.setMinSilenceDuration(0.25f)
.setMinSpeechDuration(0.5f)
.setWindowSize(512)
.setMaxSpeechDuration(5.0f)
.build();

VadModelConfig config =
Expand Down
1 change: 1 addition & 0 deletions java-api-examples/VadNonStreamingSenseVoice.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ public static Vad createVad() {
.setMinSilenceDuration(0.25f)
.setMinSpeechDuration(0.5f)
.setWindowSize(512)
.setMaxSpeechDuration(5.0f)
.build();

VadModelConfig config =
Expand Down
1 change: 1 addition & 0 deletions java-api-examples/VadRemoveSilence.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ public static void main(String[] args) {
.setMinSilenceDuration(0.25f)
.setMinSpeechDuration(0.5f)
.setWindowSize(512)
.setMaxSpeechDuration(5.0f)
.build();

VadModelConfig config =
Expand Down
3 changes: 2 additions & 1 deletion lazarus-examples/generate_subtitles/my_init.pas
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ function CreateVad(VadFilename: AnsiString): TSherpaOnnxVoiceActivityDetector;
WindowSize := 512; {Please don't change it unless you know the details}

Config.SileroVad.Model := VadFilename;
Config.SileroVad.MinSpeechDuration := 0.5;
Config.SileroVad.MinSpeechDuration := 0.25;
Config.SileroVad.MinSilenceDuration := 0.5;
Config.SileroVad.MaxSpeechDuration := 5.0;
Config.SileroVad.Threshold := 0.5;
Config.SileroVad.WindowSize := WindowSize;
Config.NumThreads:= 2;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ function createVad() {
threshold: 0.5,
minSpeechDuration: 0.25,
minSilenceDuration: 0.5,
maxSpeechDuration: 5,
windowSize: 512,
},
sampleRate: 16000,
Expand Down
1 change: 1 addition & 0 deletions nodejs-examples/test-vad-with-non-streaming-asr-whisper.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ function createVad() {
threshold: 0.5,
minSpeechDuration: 0.25,
minSilenceDuration: 0.5,
maxSpeechDuration: 5,
windowSize: 512,
},
sampleRate: 16000,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,15 @@ def main():

config = sherpa_onnx.VadModelConfig()
config.silero_vad.model = args.silero_vad_model
config.silero_vad.threshold = 0.5
config.silero_vad.min_silence_duration = 0.25 # seconds
config.silero_vad.min_speech_duration = 0.25 # seconds

# If the current speech segment is longer than this value, the VAD
# raises its threshold to 0.9 internally. After that segment has been
# detected, the threshold is reset to its original value.
config.silero_vad.max_speech_duration = 5 # seconds

config.sample_rate = sample_rate

window_size = config.silero_vad.window_size
Expand Down
3 changes: 3 additions & 0 deletions scripts/dotnet/SileroVadModelConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ public SileroVadModelConfig()
MinSilenceDuration = 0.5F;
MinSpeechDuration = 0.25F;
WindowSize = 512;
MaxSpeechDuration = 5.0F;
}

[MarshalAs(UnmanagedType.LPStr)]
Expand All @@ -26,5 +27,7 @@ public SileroVadModelConfig()
public float MinSpeechDuration;

public int WindowSize;

public float MaxSpeechDuration;
}
}
2 changes: 2 additions & 0 deletions scripts/go/sherpa_onnx.go
Original file line number Diff line number Diff line change
Expand Up @@ -771,6 +771,7 @@ type SileroVadModelConfig struct {
MinSilenceDuration float32
MinSpeechDuration float32
WindowSize int
MaxSpeechDuration float32
}

type VadModelConfig struct {
Expand Down Expand Up @@ -849,6 +850,7 @@ func NewVoiceActivityDetector(config *VadModelConfig, bufferSizeInSeconds float3
c.silero_vad.min_silence_duration = C.float(config.SileroVad.MinSilenceDuration)
c.silero_vad.min_speech_duration = C.float(config.SileroVad.MinSpeechDuration)
c.silero_vad.window_size = C.int(config.SileroVad.WindowSize)
c.silero_vad.max_speech_duration = C.float(config.SileroVad.MaxSpeechDuration)

c.sample_rate = C.int(config.SampleRate)
c.num_threads = C.int(config.NumThreads)
Expand Down
3 changes: 3 additions & 0 deletions scripts/node-addon-api/lib/vad.js
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ config = {
sileroVad: {
model: "./silero_vad.onnx",
threshold: 0.5,
minSilenceDuration: 0.5,
minSpeechDuration: 0.25,
maxSpeechDuration: 5,
}
}
*/
Expand Down
1 change: 1 addition & 0 deletions scripts/node-addon-api/src/vad.cc
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,7 @@ static SherpaOnnxSileroVadModelConfig GetSileroVadConfig(
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_silence_duration, minSilenceDuration);
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_speech_duration, minSpeechDuration);
SHERPA_ONNX_ASSIGN_ATTR_INT32(window_size, windowSize);
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(max_speech_duration, maxSpeechDuration);

return c;
}
Expand Down
3 changes: 3 additions & 0 deletions sherpa-onnx/c-api/c-api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -907,6 +907,9 @@ SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector(
vad_config.silero_vad.window_size =
SHERPA_ONNX_OR(config->silero_vad.window_size, 512);

vad_config.silero_vad.max_speech_duration =
SHERPA_ONNX_OR(config->silero_vad.max_speech_duration, 20);

vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000);
vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1);
vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu");
Expand Down
5 changes: 5 additions & 0 deletions sherpa-onnx/c-api/c-api.h
Original file line number Diff line number Diff line change
Expand Up @@ -746,6 +746,11 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig {
float min_speech_duration;

int window_size;

// If a speech segment is longer than this value, then we increase
// the threshold to 0.9. After finishing detecting the segment,
// the threshold value is reset to its original value.
float max_speech_duration;
} SherpaOnnxSileroVadModelConfig;

SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@ public class SileroVadModelConfig {
private final float minSilenceDuration;
private final float minSpeechDuration;
private final int windowSize;
private final float maxSpeechDuration;

/**
 * Creates a configuration by copying every setting held by the given builder.
 *
 * <p>The constructor is private; instances are obtained through
 * {@link Builder#build()}.
 *
 * @param builder the builder whose current values are copied into this config
 */
private SileroVadModelConfig(Builder builder) {
this.model = builder.model;
this.threshold = builder.threshold;
this.minSilenceDuration = builder.minSilenceDuration;
this.minSpeechDuration = builder.minSpeechDuration;
this.windowSize = builder.windowSize;
this.maxSpeechDuration = builder.maxSpeechDuration;
}

public static Builder builder() {
Expand All @@ -41,12 +43,17 @@ public int getWindowSize() {
return windowSize;
}

/**
 * Returns the maximum speech duration stored in this configuration.
 *
 * <p>NOTE(review): the unit is presumably seconds, as for the other
 * duration settings -- confirm against the native VAD implementation.
 *
 * @return the configured maximum speech duration
 */
public float getMaxSpeechDuration() {
return maxSpeechDuration;
}

public static class Builder {
private String model = "";
private float threshold = 0.5f;
private float minSilenceDuration = 0.25f;
private float minSpeechDuration = 0.5f;
private int windowSize = 512;
private float maxSpeechDuration = 5.0f;

public SileroVadModelConfig build() {
return new SileroVadModelConfig(this);
Expand Down Expand Up @@ -77,5 +84,10 @@ public Builder setWindowSize(int windowSize) {
this.windowSize = windowSize;
return this;
}

/**
 * Sets the maximum speech duration to use when the config is built.
 *
 * <p>If this setter is never called, the builder keeps its initial value
 * of {@code 5.0f}.
 *
 * @param maxSpeechDuration the maximum speech duration (presumably in
 *     seconds -- TODO confirm)
 * @return this builder, so that calls can be chained
 */
public Builder setMaxSpeechDuration(float maxSpeechDuration) {
this.maxSpeechDuration = maxSpeechDuration;
return this;
}
}
}
4 changes: 4 additions & 0 deletions sherpa-onnx/jni/voice-activity-detector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ static VadModelConfig GetVadModelConfig(JNIEnv *env, jobject config) {
fid = env->GetFieldID(silero_vad_config_cls, "windowSize", "I");
ans.silero_vad.window_size = env->GetIntField(silero_vad_config, fid);

fid = env->GetFieldID(silero_vad_config_cls, "maxSpeechDuration", "F");
ans.silero_vad.max_speech_duration =
env->GetFloatField(silero_vad_config, fid);

fid = env->GetFieldID(cls, "sampleRate", "I");
ans.sample_rate = env->GetIntField(config, fid);

Expand Down
1 change: 1 addition & 0 deletions sherpa-onnx/kotlin-api/Vad.kt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ data class SileroVadModelConfig(
var minSilenceDuration: Float = 0.25F,
var minSpeechDuration: Float = 0.25F,
var windowSize: Int = 512,
var maxSpeechDuration: Float = 5.0F,
)

data class VadModelConfig(
Expand Down
9 changes: 7 additions & 2 deletions sherpa-onnx/pascal-api/sherpa_onnx.pas
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,7 @@ TSherpaOnnxSileroVadModelConfig = record
MinSilenceDuration: Single;
MinSpeechDuration: Single;
WindowSize: Integer;
MaxSpeechDuration: Single;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
end;
Expand Down Expand Up @@ -594,6 +595,7 @@ SherpaOnnxSileroVadModelConfig = record
MinSilenceDuration: cfloat;
MinSpeechDuration: cfloat;
WindowSize: cint32;
MaxSpeechDuration: cfloat;
end;
SherpaOnnxVadModelConfig = record
SileroVad: SherpaOnnxSileroVadModelConfig;
Expand Down Expand Up @@ -1402,10 +1404,11 @@ function TSherpaOnnxSileroVadModelConfig.ToString: AnsiString;
'Threshold := %.2f, ' +
'MinSilenceDuration := %.2f, ' +
'MinSpeechDuration := %.2f, ' +
'WindowSize := %d' +
'WindowSize := %d, ' +
'MaxSpeechDuration := %.2f' +
')',
[Self.Model, Self.Threshold, Self.MinSilenceDuration,
Self.MinSpeechDuration, Self.WindowSize
Self.MinSpeechDuration, Self.WindowSize, Self.MaxSpeechDuration
]);
end;

Expand All @@ -1415,6 +1418,7 @@ function TSherpaOnnxSileroVadModelConfig.ToString: AnsiString;
Dest.MinSilenceDuration := 0.5;
Dest.MinSpeechDuration := 0.25;
Dest.WindowSize := 512;
Dest.MaxSpeechDuration := 5.0;
end;

function TSherpaOnnxVadModelConfig.ToString: AnsiString;
Expand Down Expand Up @@ -1569,6 +1573,7 @@ constructor TSherpaOnnxVoiceActivityDetector.Create(Config: TSherpaOnnxVadModelC
C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration;
C.SileroVad.MinSpeechDuration := Config.SileroVad.MinSpeechDuration;
C.SileroVad.WindowSize := Config.SileroVad.WindowSize;
C.SileroVad.MaxSpeechDuration := Config.SileroVad.MaxSpeechDuration;

C.SampleRate := Config.SampleRate;
C.NumThreads := Config.NumThreads;
Expand Down
6 changes: 4 additions & 2 deletions swift-api-examples/SherpaOnnx.swift
Original file line number Diff line number Diff line change
Expand Up @@ -550,14 +550,16 @@ func sherpaOnnxSileroVadModelConfig(
threshold: Float = 0.5,
minSilenceDuration: Float = 0.25,
minSpeechDuration: Float = 0.5,
windowSize: Int = 512
windowSize: Int = 512,
maxSpeechDuration: Float = 5.0
) -> SherpaOnnxSileroVadModelConfig {
return SherpaOnnxSileroVadModelConfig(
model: toCPointer(model),
threshold: threshold,
min_silence_duration: minSilenceDuration,
min_speech_duration: minSpeechDuration,
window_size: Int32(windowSize)
window_size: Int32(windowSize),
max_speech_duration: maxSpeechDuration
)
}

Expand Down
Loading

0 comments on commit e7ffcbd

Please sign in to comment.