From a95b44db4cad7eafc2b9b836e5de303b079018b7 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sat, 26 Oct 2024 22:46:31 +0800 Subject: [PATCH 1/4] Add C API for Moonshine models --- .github/workflows/c-api.yaml | 33 +++++++++++++ c-api-examples/CMakeLists.txt | 3 ++ c-api-examples/moonshine-c-api.c | 83 ++++++++++++++++++++++++++++++++ sherpa-onnx/c-api/c-api.cc | 12 +++++ sherpa-onnx/c-api/c-api.h | 8 +++ sherpa-onnx/c-api/cxx-api.cc | 9 ++++ sherpa-onnx/c-api/cxx-api.h | 8 +++ 7 files changed, 156 insertions(+) create mode 100644 c-api-examples/moonshine-c-api.c diff --git a/.github/workflows/c-api.yaml b/.github/workflows/c-api.yaml index 94a52a4bc..2e34977f4 100644 --- a/.github/workflows/c-api.yaml +++ b/.github/workflows/c-api.yaml @@ -81,6 +81,39 @@ jobs: otool -L ./install/lib/libsherpa-onnx-c-api.dylib fi + - name: Test Moonshine + shell: bash + run: | + gcc -o moonshine-c-api ./c-api-examples/moonshine-c-api.c \ + -I ./build/install/include \ + -L ./build/install/lib/ \ + -l sherpa-onnx-c-api \ + -l onnxruntime + + ls -lh moonshine-c-api + + if [[ ${{ matrix.os }} == ubuntu-latest ]]; then + ldd ./moonshine-c-api + echo "----" + readelf -d ./moonshine-c-api + fi + + # Now download models + # + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + ls -lh sherpa-onnx-moonshine-tiny-en-int8 + echo "---" + ls -lh sherpa-onnx-moonshine-tiny-en-int8/test_wavs + + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH + + ./moonshine-c-api + + rm -rf sherpa-onnx-* + - name: Test ffmpeg if: matrix.os == 'macos-latest' shell: bash diff --git a/c-api-examples/CMakeLists.txt b/c-api-examples/CMakeLists.txt index 45ca9a156..58a867726 100644 --- a/c-api-examples/CMakeLists.txt +++ b/c-api-examples/CMakeLists.txt @@ -35,6 +35,9 @@ target_link_libraries(whisper-c-api sherpa-onnx-c-api) add_executable(sense-voice-c-api sense-voice-c-api.c) target_link_libraries(sense-voice-c-api sherpa-onnx-c-api) +add_executable(moonshine-c-api moonshine-c-api.c) +target_link_libraries(moonshine-c-api sherpa-onnx-c-api) + add_executable(zipformer-c-api zipformer-c-api.c) target_link_libraries(zipformer-c-api sherpa-onnx-c-api) diff --git a/c-api-examples/moonshine-c-api.c b/c-api-examples/moonshine-c-api.c new file mode 100644 index 000000000..775dd24c9 --- /dev/null +++ b/c-api-examples/moonshine-c-api.c @@ -0,0 +1,83 @@ +// c-api-examples/moonshine-c-api.c +// +// Copyright (c) 2024 Xiaomi Corporation + +// +// This file demonstrates how to use Moonshine tiny with sherpa-onnx's C API. 
+// clang-format off
+//
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+// tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+// rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+//
+// clang-format on
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "sherpa-onnx/c-api/c-api.h"
+
+int32_t main() {
+  const char *wav_filename =
+      "./sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav";
+  const char *preprocessor =
+      "./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx";
+  const char *encoder = "./sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx";
+  const char *uncached_decoder =
+      "./sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx";
+  const char *cached_decoder =
+      "./sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx";
+  const char *tokens = "./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt";
+
+  const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
+  if (wave == NULL) {
+    fprintf(stderr, "Failed to read %s\n", wav_filename);
+    return -1;
+  }
+
+  // Offline model config
+  SherpaOnnxOfflineModelConfig offline_model_config;
+  memset(&offline_model_config, 0, sizeof(offline_model_config));
+  offline_model_config.debug = 1;
+  offline_model_config.num_threads = 1;
+  offline_model_config.provider = "cpu";
+  offline_model_config.tokens = tokens;
+  offline_model_config.moonshine.preprocessor = preprocessor;
+  offline_model_config.moonshine.encoder = encoder;
+  offline_model_config.moonshine.uncached_decoder = uncached_decoder;
+  offline_model_config.moonshine.cached_decoder = cached_decoder;
+
+  // Recognizer config
+  SherpaOnnxOfflineRecognizerConfig recognizer_config;
+  memset(&recognizer_config, 0, sizeof(recognizer_config));
+  recognizer_config.decoding_method = "greedy_search";
+  recognizer_config.model_config = offline_model_config;
+
+  const SherpaOnnxOfflineRecognizer *recognizer =
+      SherpaOnnxCreateOfflineRecognizer(&recognizer_config);
+
+  if (recognizer == NULL) {
+    fprintf(stderr, "Please check your config!\n");
+    SherpaOnnxFreeWave(wave);
+    return -1;
+  }
+
+  const SherpaOnnxOfflineStream *stream =
+      SherpaOnnxCreateOfflineStream(recognizer);
+
+  SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate, wave->samples,
+                                  wave->num_samples);
+  SherpaOnnxDecodeOfflineStream(recognizer, stream);
+  const SherpaOnnxOfflineRecognizerResult *result =
+      SherpaOnnxGetOfflineStreamResult(stream);
+
+  fprintf(stderr, "Decoded text: %s\n", result->text);
+
+  SherpaOnnxDestroyOfflineRecognizerResult(result);
+  SherpaOnnxDestroyOfflineStream(stream);
+  SherpaOnnxDestroyOfflineRecognizer(recognizer);
+  SherpaOnnxFreeWave(wave);
+
+  return 0;
+}
diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc
index d7fa383be..f01b4917f 100644
--- a/sherpa-onnx/c-api/c-api.cc
+++ b/sherpa-onnx/c-api/c-api.cc
@@ -450,6 +450,18 @@ sherpa_onnx::OfflineRecognizerConfig convertConfig(
   recognizer_config.model_config.sense_voice.use_itn =
       config->model_config.sense_voice.use_itn;
 
+  recognizer_config.model_config.moonshine.preprocessor =
+      SHERPA_ONNX_OR(config->model_config.moonshine.preprocessor, "");
+
+  recognizer_config.model_config.moonshine.encoder =
+      SHERPA_ONNX_OR(config->model_config.moonshine.encoder, "");
+
+  recognizer_config.model_config.moonshine.uncached_decoder =
+      SHERPA_ONNX_OR(config->model_config.moonshine.uncached_decoder, "");
+
+  recognizer_config.model_config.moonshine.cached_decoder =
+      SHERPA_ONNX_OR(config->model_config.moonshine.cached_decoder, "");
+
recognizer_config.lm_config.model = SHERPA_ONNX_OR(config->lm_config.model, ""); recognizer_config.lm_config.scale = diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index e5fc92eb1..8b4b65786 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -389,6 +389,13 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineWhisperModelConfig { int32_t tail_paddings; } SherpaOnnxOfflineWhisperModelConfig; +SHERPA_ONNX_API typedef struct SherpaOnnxOfflineMoonshineModelConfig { + const char *preprocessor; + const char *encoder; + const char *uncached_decoder; + const char *cached_decoder; +} SherpaOnnxOfflineMoonshineModelConfig; + SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTdnnModelConfig { const char *model; } SherpaOnnxOfflineTdnnModelConfig; @@ -424,6 +431,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig { const char *bpe_vocab; const char *telespeech_ctc; SherpaOnnxOfflineSenseVoiceModelConfig sense_voice; + SherpaOnnxOfflineMoonshineModelConfig moonshine; } SherpaOnnxOfflineModelConfig; SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig { diff --git a/sherpa-onnx/c-api/cxx-api.cc b/sherpa-onnx/c-api/cxx-api.cc index 262ad3cc9..c66221f0e 100644 --- a/sherpa-onnx/c-api/cxx-api.cc +++ b/sherpa-onnx/c-api/cxx-api.cc @@ -227,6 +227,15 @@ OfflineRecognizer OfflineRecognizer::Create( config.model_config.sense_voice.language.c_str(); c.model_config.sense_voice.use_itn = config.model_config.sense_voice.use_itn; + c.model_config.moonshine.preprocessor = + config.model_config.moonshine.preprocessor.c_str(); + c.model_config.moonshine.encoder = + config.model_config.moonshine.encoder.c_str(); + c.model_config.moonshine.uncached_decoder = + config.model_config.moonshine.uncached_decoder.c_str(); + c.model_config.moonshine.cached_decoder = + config.model_config.moonshine.cached_decoder.c_str(); + c.lm_config.model = config.lm_config.model.c_str(); c.lm_config.scale = config.lm_config.scale; diff --git a/sherpa-onnx/c-api/cxx-api.h b/sherpa-onnx/c-api/cxx-api.h index 17727f817..b8a46e113 100644 --- a/sherpa-onnx/c-api/cxx-api.h +++ b/sherpa-onnx/c-api/cxx-api.h @@ -225,6 +225,13 @@ struct SHERPA_ONNX_API OfflineSenseVoiceModelConfig { bool use_itn = false; }; +struct SHERPA_ONNX_API OfflineMoonshineModelConfig { + std::string preprocessor; + std::string encoder; + std::string uncached_decoder; + std::string cached_decoder; +}; + struct SHERPA_ONNX_API OfflineModelConfig { OfflineTransducerModelConfig transducer; OfflineParaformerModelConfig paraformer; @@ -241,6 +248,7 @@ struct SHERPA_ONNX_API OfflineModelConfig { std::string bpe_vocab; std::string telespeech_ctc; OfflineSenseVoiceModelConfig sense_voice; + OfflineMoonshineModelConfig moonshine; }; struct SHERPA_ONNX_API OfflineLMConfig { From 0a58c39e72158f295f6f237e5d268cf272442ae3 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sat, 26 Oct 2024 23:00:10 +0800 Subject: [PATCH 2/4] Add more C API examples for Moonshine --- .github/workflows/c-api.yaml | 63 +++++++-- c-api-examples/CMakeLists.txt | 6 + c-api-examples/vad-moonshine-c-api.c | 171 +++++++++++++++++++++++++ c-api-examples/vad-sense-voice-c-api.c | 1 + c-api-examples/vad-whisper-c-api.c | 169 ++++++++++++++++++++++++ 5 files changed, 399 insertions(+), 11 deletions(-) create mode 100644 c-api-examples/vad-moonshine-c-api.c create mode 100644 c-api-examples/vad-whisper-c-api.c diff --git a/.github/workflows/c-api.yaml b/.github/workflows/c-api.yaml index 2e34977f4..049240d77 100644 --- a/.github/workflows/c-api.yaml +++ 
b/.github/workflows/c-api.yaml @@ -81,31 +81,72 @@ jobs: otool -L ./install/lib/libsherpa-onnx-c-api.dylib fi - - name: Test Moonshine + - name: Test vad + Whisper tiny.en shell: bash run: | - gcc -o moonshine-c-api ./c-api-examples/moonshine-c-api.c \ + gcc -o vad-whisper-c-api ./c-api-examples/vad-whisper-c-api.c \ -I ./build/install/include \ -L ./build/install/lib/ \ -l sherpa-onnx-c-api \ -l onnxruntime - ls -lh moonshine-c-api + # Now download models + # + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav - if [[ ${{ matrix.os }} == ubuntu-latest ]]; then - ldd ./moonshine-c-api - echo "----" - readelf -d ./moonshine-c-api - fi + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 + tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2 + rm sherpa-onnx-whisper-tiny.en.tar.bz2 + + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH + + ./vad-whisper-c-api + + rm -rf sherpa-onnx-* + rm -rf *.onnx + rm *.wav + + - name: Test vad + Moonshine + shell: bash + run: | + gcc -o vad-moonshine-c-api ./c-api-examples/vad-moonshine-c-api.c \ + -I ./build/install/include \ + -L ./build/install/lib/ \ + -l sherpa-onnx-c-api \ + -l onnxruntime # Now download models # + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH + + ./vad-moonshine-c-api + + rm -rf sherpa-onnx-* + rm -rf *.onnx + rm *.wav + + - name: Test Moonshine + shell: bash + run: | + gcc -o moonshine-c-api ./c-api-examples/moonshine-c-api.c \ + -I ./build/install/include \ + -L ./build/install/lib/ \ + -l sherpa-onnx-c-api \ + -l onnxruntime + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 - ls -lh sherpa-onnx-moonshine-tiny-en-int8 - echo "---" - ls -lh sherpa-onnx-moonshine-tiny-en-int8/test_wavs export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH diff --git a/c-api-examples/CMakeLists.txt b/c-api-examples/CMakeLists.txt index 58a867726..c7db2bc27 100644 --- a/c-api-examples/CMakeLists.txt +++ b/c-api-examples/CMakeLists.txt @@ -56,6 +56,12 @@ target_link_libraries(telespeech-c-api sherpa-onnx-c-api) add_executable(vad-sense-voice-c-api vad-sense-voice-c-api.c) target_link_libraries(vad-sense-voice-c-api sherpa-onnx-c-api) +add_executable(vad-whisper-c-api vad-whisper-c-api.c) +target_link_libraries(vad-whisper-c-api sherpa-onnx-c-api) + +add_executable(vad-moonshine-c-api vad-moonshine-c-api.c) +target_link_libraries(vad-moonshine-c-api sherpa-onnx-c-api) + add_executable(streaming-zipformer-buffered-tokens-hotwords-c-api streaming-zipformer-buffered-tokens-hotwords-c-api.c) 
 target_link_libraries(streaming-zipformer-buffered-tokens-hotwords-c-api sherpa-onnx-c-api)
diff --git a/c-api-examples/vad-moonshine-c-api.c b/c-api-examples/vad-moonshine-c-api.c
new file mode 100644
index 000000000..e4a4a3e34
--- /dev/null
+++ b/c-api-examples/vad-moonshine-c-api.c
@@ -0,0 +1,171 @@
+// c-api-examples/vad-sense-voice-c-api.c
+//
+// Copyright (c) 2024 Xiaomi Corporation

+//
+// This file demonstrates how to use VAD + Moonshine with sherpa-onnx's C API.
+// clang-format off
+//
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
+//
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+// tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+// rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+//
+// clang-format on
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "sherpa-onnx/c-api/c-api.h"
+
+int32_t main() {
+  const char *wav_filename = "./Obama.wav";
+  const char *vad_filename = "./silero_vad.onnx";
+
+  const char *preprocessor =
+      "./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx";
+  const char *encoder = "./sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx";
+  const char *uncached_decoder =
+      "./sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx";
+  const char *cached_decoder =
+      "./sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx";
+  const char *tokens = "./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt";
+
+  const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
+  if (wave == NULL) {
+    fprintf(stderr, "Failed to read %s\n", wav_filename);
+    return -1;
+  }
+
+  if (wave->sample_rate != 16000) {
+    fprintf(stderr, "Expect the sample rate to be 16000. 
Given: %d\n", + wave->sample_rate); + SherpaOnnxFreeWave(wave); + return -1; + } + + // Offline model config + SherpaOnnxOfflineModelConfig offline_model_config; + memset(&offline_model_config, 0, sizeof(offline_model_config)); + offline_model_config.debug = 0; + offline_model_config.num_threads = 1; + offline_model_config.provider = "cpu"; + offline_model_config.tokens = tokens; + offline_model_config.moonshine.preprocessor = preprocessor; + offline_model_config.moonshine.encoder = encoder; + offline_model_config.moonshine.uncached_decoder = uncached_decoder; + offline_model_config.moonshine.cached_decoder = cached_decoder; + + // Recognizer config + SherpaOnnxOfflineRecognizerConfig recognizer_config; + memset(&recognizer_config, 0, sizeof(recognizer_config)); + recognizer_config.decoding_method = "greedy_search"; + recognizer_config.model_config = offline_model_config; + + const SherpaOnnxOfflineRecognizer *recognizer = + SherpaOnnxCreateOfflineRecognizer(&recognizer_config); + + if (recognizer == NULL) { + fprintf(stderr, "Please check your recognizer config!\n"); + SherpaOnnxFreeWave(wave); + return -1; + } + + SherpaOnnxVadModelConfig vadConfig; + memset(&vadConfig, 0, sizeof(vadConfig)); + vadConfig.silero_vad.model = vad_filename; + vadConfig.silero_vad.threshold = 0.5; + vadConfig.silero_vad.min_silence_duration = 0.5; + vadConfig.silero_vad.min_speech_duration = 0.5; + vadConfig.silero_vad.max_speech_duration = 10; + vadConfig.silero_vad.window_size = 512; + vadConfig.sample_rate = 16000; + vadConfig.num_threads = 1; + vadConfig.debug = 1; + + SherpaOnnxVoiceActivityDetector *vad = + SherpaOnnxCreateVoiceActivityDetector(&vadConfig, 30); + + if (vad == NULL) { + fprintf(stderr, "Please check your recognizer config!\n"); + SherpaOnnxFreeWave(wave); + SherpaOnnxDestroyOfflineRecognizer(recognizer); + return -1; + } + + int32_t window_size = vadConfig.silero_vad.window_size; + int32_t i = 0; + + while (i + window_size < wave->num_samples) { + SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, wave->samples + i, + window_size); + i += window_size; + + while (!SherpaOnnxVoiceActivityDetectorEmpty(vad)) { + const SherpaOnnxSpeechSegment *segment = + SherpaOnnxVoiceActivityDetectorFront(vad); + + const SherpaOnnxOfflineStream *stream = + SherpaOnnxCreateOfflineStream(recognizer); + + SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate, + segment->samples, segment->n); + + SherpaOnnxDecodeOfflineStream(recognizer, stream); + + const SherpaOnnxOfflineRecognizerResult *result = + SherpaOnnxGetOfflineStreamResult(stream); + + float start = segment->start / 16000.0f; + float duration = segment->n / 16000.0f; + float stop = start + duration; + + fprintf(stderr, "%.3f -- %.3f: %s\n", start, stop, result->text); + + SherpaOnnxDestroyOfflineRecognizerResult(result); + SherpaOnnxDestroyOfflineStream(stream); + + SherpaOnnxDestroySpeechSegment(segment); + SherpaOnnxVoiceActivityDetectorPop(vad); + } + } + + SherpaOnnxVoiceActivityDetectorFlush(vad); + + while (!SherpaOnnxVoiceActivityDetectorEmpty(vad)) { + const SherpaOnnxSpeechSegment *segment = + SherpaOnnxVoiceActivityDetectorFront(vad); + + const SherpaOnnxOfflineStream *stream = + SherpaOnnxCreateOfflineStream(recognizer); + + SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate, segment->samples, + segment->n); + + SherpaOnnxDecodeOfflineStream(recognizer, stream); + + const SherpaOnnxOfflineRecognizerResult *result = + SherpaOnnxGetOfflineStreamResult(stream); + + float start = segment->start / 16000.0f; + float duration = 
segment->n / 16000.0f;
+    float stop = start + duration;
+
+    fprintf(stderr, "%.3f -- %.3f: %s\n", start, stop, result->text);
+
+    SherpaOnnxDestroyOfflineRecognizerResult(result);
+    SherpaOnnxDestroyOfflineStream(stream);
+
+    SherpaOnnxDestroySpeechSegment(segment);
+    SherpaOnnxVoiceActivityDetectorPop(vad);
+  }
+
+  SherpaOnnxDestroyOfflineRecognizer(recognizer);
+  SherpaOnnxDestroyVoiceActivityDetector(vad);
+  SherpaOnnxFreeWave(wave);
+
+  return 0;
+}
diff --git a/c-api-examples/vad-sense-voice-c-api.c b/c-api-examples/vad-sense-voice-c-api.c
index 3049c9572..ee9504d1a 100644
--- a/c-api-examples/vad-sense-voice-c-api.c
+++ b/c-api-examples/vad-sense-voice-c-api.c
@@ -81,6 +81,7 @@ int32_t main() {
   vadConfig.silero_vad.threshold = 0.5;
   vadConfig.silero_vad.min_silence_duration = 0.5;
   vadConfig.silero_vad.min_speech_duration = 0.5;
+  vadConfig.silero_vad.max_speech_duration = 5;
   vadConfig.silero_vad.window_size = 512;
   vadConfig.sample_rate = 16000;
   vadConfig.num_threads = 1;
diff --git a/c-api-examples/vad-whisper-c-api.c b/c-api-examples/vad-whisper-c-api.c
new file mode 100644
index 000000000..83cf9b258
--- /dev/null
+++ b/c-api-examples/vad-whisper-c-api.c
@@ -0,0 +1,169 @@
+// c-api-examples/vad-sense-voice-c-api.c
+//
+// Copyright (c) 2024 Xiaomi Corporation

+//
+// This file demonstrates how to use VAD + Whisper tiny.en with
+// sherpa-onnx's C API.
+//
+// clang-format off
+//
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
+//
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
+// tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
+// rm sherpa-onnx-whisper-tiny.en.tar.bz2
+//
+// clang-format on
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "sherpa-onnx/c-api/c-api.h"
+
+int32_t main() {
+  const char *wav_filename = "./Obama.wav";
+  const char *vad_filename = "./silero_vad.onnx";
+
+  const char *encoder = "sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx";
+  const char *decoder = "sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx";
+  const char *tokens = "sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt";
+
+  const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
+  if (wave == NULL) {
+    fprintf(stderr, "Failed to read %s\n", wav_filename);
+    return -1;
+  }
+
+  if (wave->sample_rate != 16000) {
+    fprintf(stderr, "Expect the sample rate to be 16000. 
Given: %d\n", + wave->sample_rate); + SherpaOnnxFreeWave(wave); + return -1; + } + + // Offline model config + SherpaOnnxOfflineModelConfig offline_model_config; + memset(&offline_model_config, 0, sizeof(offline_model_config)); + offline_model_config.debug = 0; + offline_model_config.num_threads = 1; + offline_model_config.provider = "cpu"; + offline_model_config.tokens = tokens; + offline_model_config.whisper.encoder = encoder; + offline_model_config.whisper.decoder = decoder; + offline_model_config.whisper.language = "en"; + offline_model_config.whisper.tail_paddings = 0; + offline_model_config.whisper.task = "transcribe"; + + // Recognizer config + SherpaOnnxOfflineRecognizerConfig recognizer_config; + memset(&recognizer_config, 0, sizeof(recognizer_config)); + recognizer_config.decoding_method = "greedy_search"; + recognizer_config.model_config = offline_model_config; + + const SherpaOnnxOfflineRecognizer *recognizer = + SherpaOnnxCreateOfflineRecognizer(&recognizer_config); + + if (recognizer == NULL) { + fprintf(stderr, "Please check your recognizer config!\n"); + SherpaOnnxFreeWave(wave); + return -1; + } + + SherpaOnnxVadModelConfig vadConfig; + memset(&vadConfig, 0, sizeof(vadConfig)); + vadConfig.silero_vad.model = vad_filename; + vadConfig.silero_vad.threshold = 0.5; + vadConfig.silero_vad.min_silence_duration = 0.5; + vadConfig.silero_vad.min_speech_duration = 0.5; + vadConfig.silero_vad.max_speech_duration = 10; + vadConfig.silero_vad.window_size = 512; + vadConfig.sample_rate = 16000; + vadConfig.num_threads = 1; + vadConfig.debug = 1; + + SherpaOnnxVoiceActivityDetector *vad = + SherpaOnnxCreateVoiceActivityDetector(&vadConfig, 30); + + if (vad == NULL) { + fprintf(stderr, "Please check your recognizer config!\n"); + SherpaOnnxFreeWave(wave); + SherpaOnnxDestroyOfflineRecognizer(recognizer); + return -1; + } + + int32_t window_size = vadConfig.silero_vad.window_size; + int32_t i = 0; + + while (i + window_size < wave->num_samples) { + SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, wave->samples + i, + window_size); + i += window_size; + + while (!SherpaOnnxVoiceActivityDetectorEmpty(vad)) { + const SherpaOnnxSpeechSegment *segment = + SherpaOnnxVoiceActivityDetectorFront(vad); + + const SherpaOnnxOfflineStream *stream = + SherpaOnnxCreateOfflineStream(recognizer); + + SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate, + segment->samples, segment->n); + + SherpaOnnxDecodeOfflineStream(recognizer, stream); + + const SherpaOnnxOfflineRecognizerResult *result = + SherpaOnnxGetOfflineStreamResult(stream); + + float start = segment->start / 16000.0f; + float duration = segment->n / 16000.0f; + float stop = start + duration; + + fprintf(stderr, "%.3f -- %.3f: %s\n", start, stop, result->text); + + SherpaOnnxDestroyOfflineRecognizerResult(result); + SherpaOnnxDestroyOfflineStream(stream); + + SherpaOnnxDestroySpeechSegment(segment); + SherpaOnnxVoiceActivityDetectorPop(vad); + } + } + + SherpaOnnxVoiceActivityDetectorFlush(vad); + + while (!SherpaOnnxVoiceActivityDetectorEmpty(vad)) { + const SherpaOnnxSpeechSegment *segment = + SherpaOnnxVoiceActivityDetectorFront(vad); + + const SherpaOnnxOfflineStream *stream = + SherpaOnnxCreateOfflineStream(recognizer); + + SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate, segment->samples, + segment->n); + + SherpaOnnxDecodeOfflineStream(recognizer, stream); + + const SherpaOnnxOfflineRecognizerResult *result = + SherpaOnnxGetOfflineStreamResult(stream); + + float start = segment->start / 16000.0f; + float duration = 
segment->n / 16000.0f; + float stop = start + duration; + + fprintf(stderr, "%.3f -- %.3f: %s\n", start, stop, result->text); + + SherpaOnnxDestroyOfflineRecognizerResult(result); + SherpaOnnxDestroyOfflineStream(stream); + + SherpaOnnxDestroySpeechSegment(segment); + SherpaOnnxVoiceActivityDetectorPop(vad); + } + + SherpaOnnxDestroyOfflineRecognizer(recognizer); + SherpaOnnxDestroyVoiceActivityDetector(vad); + SherpaOnnxFreeWave(wave); + + return 0; +} From 15d54fdebf509506057156d3871f661931901ab0 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sat, 26 Oct 2024 23:07:14 +0800 Subject: [PATCH 3/4] Add C++ API example for Moonshine models --- .github/workflows/cxx-api.yaml | 22 ++++++++ c-api-examples/vad-moonshine-c-api.c | 2 +- c-api-examples/vad-whisper-c-api.c | 2 +- cxx-api-examples/CMakeLists.txt | 3 + cxx-api-examples/moonshine-cxx-api.cc | 81 +++++++++++++++++++++++++++ 5 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 cxx-api-examples/moonshine-cxx-api.cc diff --git a/.github/workflows/cxx-api.yaml b/.github/workflows/cxx-api.yaml index 357aaa227..8779011a9 100644 --- a/.github/workflows/cxx-api.yaml +++ b/.github/workflows/cxx-api.yaml @@ -83,6 +83,28 @@ jobs: otool -L ./install/lib/libsherpa-onnx-cxx-api.dylib fi + - name: Test Moonshine tiny + shell: bash + run: | + g++ -std=c++17 -o moonshine-cxx-api ./cxx-api-examples/moonshine-cxx-api.cc \ + -I ./build/install/include \ + -L ./build/install/lib/ \ + -l sherpa-onnx-cxx-api \ + -l sherpa-onnx-c-api \ + -l onnxruntime + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH + + ./moonshine-cxx-api + + rm -rf sherpa-onnx-* + rm ./moonshine-cxx-api + - name: Test whisper shell: bash run: | diff --git a/c-api-examples/vad-moonshine-c-api.c b/c-api-examples/vad-moonshine-c-api.c index e4a4a3e34..1b0d03624 100644 --- a/c-api-examples/vad-moonshine-c-api.c +++ b/c-api-examples/vad-moonshine-c-api.c @@ -1,4 +1,4 @@ -// c-api-examples/vad-sense-voice-c-api.c +// c-api-examples/vad-moonshine-c-api.c // // Copyright (c) 2024 Xiaomi Corporation diff --git a/c-api-examples/vad-whisper-c-api.c b/c-api-examples/vad-whisper-c-api.c index 83cf9b258..169b4ef12 100644 --- a/c-api-examples/vad-whisper-c-api.c +++ b/c-api-examples/vad-whisper-c-api.c @@ -1,4 +1,4 @@ -// c-api-examples/vad-sense-voice-c-api.c +// c-api-examples/vad-whisper-c-api.c // // Copyright (c) 2024 Xiaomi Corporation diff --git a/cxx-api-examples/CMakeLists.txt b/cxx-api-examples/CMakeLists.txt index 7c9853080..e7e722660 100644 --- a/cxx-api-examples/CMakeLists.txt +++ b/cxx-api-examples/CMakeLists.txt @@ -6,5 +6,8 @@ target_link_libraries(streaming-zipformer-cxx-api sherpa-onnx-cxx-api) add_executable(whisper-cxx-api ./whisper-cxx-api.cc) target_link_libraries(whisper-cxx-api sherpa-onnx-cxx-api) +add_executable(moonshine-cxx-api ./moonshine-cxx-api.cc) +target_link_libraries(moonshine-cxx-api sherpa-onnx-cxx-api) + add_executable(sense-voice-cxx-api ./sense-voice-cxx-api.cc) target_link_libraries(sense-voice-cxx-api sherpa-onnx-cxx-api) diff --git a/cxx-api-examples/moonshine-cxx-api.cc b/cxx-api-examples/moonshine-cxx-api.cc new file mode 100644 index 000000000..c3b439a3d --- /dev/null +++ b/cxx-api-examples/moonshine-cxx-api.cc @@ -0,0 +1,81 @@ +// 
cxx-api-examples/whisper-cxx-api.cc
+// Copyright (c) 2024 Xiaomi Corporation

+//
+// This file demonstrates how to use Moonshine with sherpa-onnx's C++ API.
+//
+// clang-format off
+//
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+// tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+// rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+//
+// clang-format on
+
+#include <chrono>  // NOLINT
+#include <iostream>
+#include <string>
+
+#include "sherpa-onnx/c-api/cxx-api.h"
+
+int32_t main() {
+  using namespace sherpa_onnx::cxx;  // NOLINT
+  OfflineRecognizerConfig config;
+
+  config.model_config.moonshine.preprocessor =
+      "./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx";
+  config.model_config.moonshine.encoder =
+      "./sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx";
+  config.model_config.moonshine.uncached_decoder =
+      "./sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx";
+  config.model_config.moonshine.cached_decoder =
+      "./sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx";
+  config.model_config.tokens =
+      "./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt";
+
+  config.model_config.num_threads = 1;
+
+  std::cout << "Loading model\n";
+  OfflineRecognizer recongizer = OfflineRecognizer::Create(config);
+  if (!recongizer.Get()) {
+    std::cerr << "Please check your config\n";
+    return -1;
+  }
+  std::cout << "Loading model done\n";
+
+  std::string wave_filename =
+      "./sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav";
+  Wave wave = ReadWave(wave_filename);
+  if (wave.samples.empty()) {
+    std::cerr << "Failed to read: '" << wave_filename << "'\n";
+    return -1;
+  }
+
+  std::cout << "Start recognition\n";
+  const auto begin = std::chrono::steady_clock::now();
+
+  OfflineStream stream = recongizer.CreateStream();
+  stream.AcceptWaveform(wave.sample_rate, wave.samples.data(),
+                        wave.samples.size());
+
+  recongizer.Decode(&stream);
+
+  OfflineRecognizerResult result = recongizer.GetResult(&stream);
+
+  const auto end = std::chrono::steady_clock::now();
+  const float elapsed_seconds =
+      std::chrono::duration_cast<std::chrono::milliseconds>(end - begin)
+          .count() /
+      1000.;
+  float duration = wave.samples.size() / static_cast<float>(wave.sample_rate);
+  float rtf = elapsed_seconds / duration;
+
+  std::cout << "text: " << result.text << "\n";
+  printf("Number of threads: %d\n", config.model_config.num_threads);
+  printf("Duration: %.3fs\n", duration);
+  printf("Elapsed seconds: %.3fs\n", elapsed_seconds);
+  printf("(Real time factor) RTF = %.3f / %.3f = %.3f\n", elapsed_seconds,
+         duration, rtf);
+
+  return 0;
+}

From e0d8ae714963a219713f7937a2353a24e6aa8831 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Sat, 26 Oct 2024 23:23:26 +0800
Subject: [PATCH 4/4] Fix typos

---
 cxx-api-examples/moonshine-cxx-api.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cxx-api-examples/moonshine-cxx-api.cc b/cxx-api-examples/moonshine-cxx-api.cc
index c3b439a3d..c2ce565c3 100644
--- a/cxx-api-examples/moonshine-cxx-api.cc
+++ b/cxx-api-examples/moonshine-cxx-api.cc
@@ -1,4 +1,4 @@
-// cxx-api-examples/whisper-cxx-api.cc
+// cxx-api-examples/moonshine-cxx-api.cc
 // Copyright (c) 2024 Xiaomi Corporation
 
 //