-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathrecord.rs
203 lines (174 loc) · 6.37 KB
/
record.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
/*
wget https://github.com/thewh1teagle/vad-rs/releases/download/v0.1.0/silero_vad.onnx
cargo run --example record silero_vad.onnx
*/
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
use cpal::Sample;
use eyre::{bail, Result};
use once_cell::sync::Lazy;
use ringbuffer::{AllocRingBuffer, RingBuffer};
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{Arc, Mutex};
use std::time::Instant;
use vad_rs::{Normalizer, Vad, VadStatus};
// Options
static MIN_SPEECH_DUR: Lazy<usize> = Lazy::new(|| 700); // 0.6s
static MIN_SILENCE_DUR: Lazy<usize> = Lazy::new(|| 700);
// Vad
static VAD_BUF: Lazy<Mutex<AllocRingBuffer<f32>>> =
Lazy::new(|| Mutex::new(AllocRingBuffer::new(16000)));
// State
static IS_SPEECH: Lazy<Arc<AtomicBool>> = Lazy::new(|| Arc::new(AtomicBool::new(false)));
static SPEECH_DUR: Lazy<Arc<AtomicUsize>> = Lazy::new(|| Arc::new(AtomicUsize::new(0)));
static SILENCE_DUR: Lazy<Arc<AtomicUsize>> = Lazy::new(|| Arc::new(AtomicUsize::new(0)));
static NORMALIZER: Lazy<Arc<Mutex<Option<Normalizer>>>> = Lazy::new(|| Arc::new(None.into()));
fn build_stream<T>(
device: &cpal::Device,
config: &cpal::StreamConfig,
sample_rate: u32,
channels: u16,
vad_handle: Arc<Mutex<Vad>>,
) -> Result<cpal::Stream>
where
T: Sample + cpal::SizedSample,
{
let err_fn = move |err| {
eprintln!("an error occurred on stream: {}", err);
};
Ok(device.build_input_stream(
config,
move |data: &[T], _: &_| {
on_stream_data::<T>(data, sample_rate, channels, vad_handle.clone());
},
err_fn,
None,
)?)
}
fn main() -> Result<()> {
let vad_model_path = std::env::args()
.nth(1)
.expect("Please specify vad model filename");
let vad = Vad::new(vad_model_path, 16000).unwrap();
let vad_handle = Arc::new(Mutex::new(vad));
let host = cpal::default_host();
// Set up the input device and stream with the default input config.
let device = host
.default_input_device()
.expect("failed to find input device");
println!("Input device: {}", device.name()?);
let config = device
.default_input_config()
.expect("Failed to get default input config");
println!("Default input config: {:?}", config);
// A flag to indicate that recording is in progress.
println!("Begin recording...");
let sample_rate = config.sample_rate().0;
let channels = config.channels();
// Loudness normalization
let normalizer = Normalizer::new(config.channels().into(), config.sample_rate().0);
*NORMALIZER.lock().unwrap() = Some(normalizer);
let stream = match config.sample_format() {
cpal::SampleFormat::I8 => build_stream::<i8>(
&device,
&config.into(),
sample_rate,
channels,
vad_handle.clone(),
)?,
cpal::SampleFormat::I16 => build_stream::<i16>(
&device,
&config.into(),
sample_rate,
channels,
vad_handle.clone(),
)?,
cpal::SampleFormat::I32 => build_stream::<i32>(
&device,
&config.into(),
sample_rate,
channels,
vad_handle.clone(),
)?,
cpal::SampleFormat::F32 => build_stream::<f32>(
&device,
&config.into(),
sample_rate,
channels,
vad_handle.clone(),
)?,
sample_format => {
bail!("Unsupported sample format '{sample_format}'")
}
};
stream.play()?;
// Keep main thread alive
loop {
std::thread::sleep(std::time::Duration::from_secs(1));
}
}
fn on_stream_data<T>(input: &[T], sample_rate: u32, channels: u16, vad_handle: Arc<Mutex<Vad>>)
where
T: Sample,
{
// Convert the input samples to f32
let samples: Vec<f32> = input
.iter()
.map(|s| s.to_float_sample().to_sample())
.collect();
// Resample the stereo audio to the desired sample rate
let mut resampled: Vec<f32> = vad_rs::audio_resample(&samples, sample_rate, 16000, channels);
#[allow(unused)]
if channels > 1 {
resampled = vad_rs::stereo_to_mono(&resampled).unwrap();
}
let mut normalizer = NORMALIZER.lock().unwrap();
let normalizer = normalizer.as_mut().unwrap();
resampled = normalizer.normalize_loudness(&samples);
let chunk_size = (30 * sample_rate / 1000) as usize;
let mut vad = vad_handle.lock().unwrap();
let mut vad_buf = VAD_BUF.lock().unwrap();
vad_buf.extend(resampled.clone());
if vad_buf.len() as f32 > sample_rate as f32 * 0.1 {
// 0.1s
// Start timing
let start_time = Instant::now();
// println!("compute {:?}", vad_buf.len());
if let Ok(mut result) = vad.compute(&vad_buf.to_vec()) {
// Calculate the elapsed time
let elapsed_time = start_time.elapsed();
let elapsed_ms = elapsed_time.as_secs_f64() * 1000.0;
// Log or handle the situation if computation time exceeds a threshold
if elapsed_ms > 100.0 {
eprintln!(
"Warning: VAD computation took too long: {} ms (expected < 30 ms)",
elapsed_ms
);
}
match result.status() {
VadStatus::Speech => {
SPEECH_DUR.fetch_add(chunk_size, Ordering::Relaxed);
if SPEECH_DUR.load(Ordering::Relaxed) >= *MIN_SPEECH_DUR
&& !IS_SPEECH.load(Ordering::Relaxed)
{
println!("Speech Start");
SILENCE_DUR.store(0, Ordering::Relaxed);
IS_SPEECH.store(true, Ordering::Relaxed);
vad_buf.extend(resampled.clone());
}
}
VadStatus::Silence => {
SILENCE_DUR.fetch_add(chunk_size, Ordering::Relaxed);
if SILENCE_DUR.load(Ordering::Relaxed) >= *MIN_SILENCE_DUR
&& IS_SPEECH.load(Ordering::Relaxed)
{
println!("Speech End");
SPEECH_DUR.store(0, Ordering::Relaxed);
IS_SPEECH.store(false, Ordering::Relaxed);
}
}
_ => {}
}
}
vad_buf.clear();
}
}