Add C++ runtime for SenseVoice models (#1148)
csukuangfj authored Jul 18, 2024
1 parent 3bae5c3 commit 25f0a10
Showing 34 changed files with 1,160 additions and 39 deletions.
25 changes: 24 additions & 1 deletion .github/scripts/test-offline-ctc.sh
@@ -15,7 +15,30 @@ echo "PATH: $PATH"

which $EXE

if false; then
log "------------------------------------------------------------"
log "Run SenseVoice models"
log "------------------------------------------------------------"
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
repo=sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17

for m in model.onnx model.int8.onnx; do
  for w in zh en yue ja ko; do
    for use_itn in 0 1; do
      echo "$m $w $use_itn"
      time $EXE \
        --tokens=$repo/tokens.txt \
        --sense-voice-model=$repo/$m \
        --sense-voice-use-itn=$use_itn \
        $repo/test_wavs/$w.wav
    done
  done
done

rm -rf $repo

if true; then
# It has problems with onnxruntime 1.18
log "------------------------------------------------------------"
log "Run Wenet models"
12 changes: 12 additions & 0 deletions .github/scripts/test-python.sh
@@ -10,6 +10,18 @@ log() {

export GIT_CLONE_PROTECTION_ACTIVE=false

log "test offline SenseVoice CTC"
url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
name=$(basename $url)
repo=$(basename -s .tar.bz2 $name)

curl -SL -O $url
tar xvf $name
rm $name
ls -lh $repo
python3 ./python-api-examples/offline-sense-voice-ctc-decode-files.py
rm -rf $repo

log "test offline TeleSpeech CTC"
url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2
name=$(basename $url)
2 changes: 1 addition & 1 deletion .github/workflows/export-sense-voice-to-onnx.yaml
@@ -73,7 +73,7 @@ jobs:
echo "pwd: $PWD"
ls -lh ../scripts/sense-voice
rm -rf ./
rm -rf ./*
cp -v ../scripts/sense-voice/*.onnx .
cp -v ../scripts/sense-voice/tokens.txt .
1 change: 1 addition & 0 deletions .gitignore
@@ -111,3 +111,4 @@ sherpa-onnx-telespeech-ctc-*
*.fst
.ccache
lib*.a
sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,7 @@
## 1.10.17

* Support SenseVoice CTC models.

## 1.10.16

* Support zh-en TTS model from MeloTTS.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -11,7 +11,7 @@ project(sherpa-onnx)
# ./nodejs-addon-examples
# ./dart-api-examples/
# ./CHANGELOG.md
set(SHERPA_ONNX_VERSION "1.10.16")
set(SHERPA_ONNX_VERSION "1.10.17")

# Disable warning about
#
67 changes: 67 additions & 0 deletions python-api-examples/offline-sense-voice-ctc-decode-files.py
@@ -0,0 +1,67 @@
#!/usr/bin/env python3

"""
This file shows how to use a non-streaming SenseVoice CTC model from
https://github.com/FunAudioLLM/SenseVoice
to decode files.
Please download model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
For instance,
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
"""

from pathlib import Path

import sherpa_onnx
import soundfile as sf


def create_recognizer():
    model = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx"
    tokens = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt"
    test_wav = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav"
    # test_wav = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/en.wav"
    # test_wav = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/ja.wav"
    # test_wav = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/ko.wav"
    # test_wav = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/yue.wav"

    if not Path(model).is_file() or not Path(test_wav).is_file():
        raise ValueError(
            """Please download model files from
            https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
            """
        )
    return (
        sherpa_onnx.OfflineRecognizer.from_sense_voice(
            model=model,
            tokens=tokens,
            use_itn=True,
            debug=True,
        ),
        test_wav,
    )


def main():
    recognizer, wave_filename = create_recognizer()

    audio, sample_rate = sf.read(wave_filename, dtype="float32", always_2d=True)
    audio = audio[:, 0]  # only use the first channel

    # audio is a 1-D float32 numpy array normalized to the range [-1, 1]
    # sample_rate does not need to be 16000 Hz

    stream = recognizer.create_stream()
    stream.accept_waveform(sample_rate, audio)
    recognizer.decode_stream(stream)
    print(wave_filename)
    print(stream.result)


if __name__ == "__main__":
    main()
9 changes: 7 additions & 2 deletions scripts/sense-voice/export-onnx.py
@@ -162,7 +162,9 @@ def main():
"neg_mean": neg_mean,
"inv_stddev": inv_stddev,
"model_type": "sense_voice_ctc",
"version": "1",
# version 1: Use QInt8
# version 2: Use QUInt8
"version": "2",
"model_author": "iic",
"maintainer": "k2-fsa",
"vocab_size": vocab_size,
@@ -185,7 +187,10 @@ def main():
model_input=filename,
model_output=filename_int8,
op_types_to_quantize=["MatMul"],
weight_type=QuantType.QInt8,
# Note that we have to use QUInt8 here.
#
# When QInt8 is used, C++ onnxruntime produces incorrect results
weight_type=QuantType.QUInt8,
)


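The "version" entry written above is what lets the C++ runtime tell the two quantization schemes apart (version 1 means QInt8 weights, version 2 means QUInt8). Below is a minimal sketch, not part of this commit, of reading that metadata back with onnxruntime's C++ API; the key names follow the export script, while the exact lookup helper used inside sherpa-onnx may differ.

// Sketch: read back the custom metadata written by export-onnx.py.
#include <iostream>

#include "onnxruntime_cxx_api.h"  // NOLINT

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "read-meta");
  Ort::SessionOptions opts;
  Ort::Session session(env, "model.int8.onnx", opts);

  Ort::ModelMetadata meta = session.GetModelMetadata();
  Ort::AllocatorWithDefaultOptions allocator;

  auto model_type = meta.LookupCustomMetadataMapAllocated("model_type", allocator);
  auto version = meta.LookupCustomMetadataMapAllocated("version", allocator);

  if (model_type) {
    std::cout << "model_type: " << model_type.get() << "\n";  // "sense_voice_ctc"
  }
  if (version) {
    std::cout << "version: " << version.get() << "\n";  // "2" => QUInt8 weights
  }

  return 0;
}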
10 changes: 10 additions & 0 deletions sherpa-onnx/c-api/c-api.cc
@@ -310,6 +310,7 @@ struct SherpaOnnxOfflineStream {

static sherpa_onnx::OfflineRecognizerConfig convertConfig(
const SherpaOnnxOfflineRecognizerConfig *config);

SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer(
const SherpaOnnxOfflineRecognizerConfig *config) {
sherpa_onnx::OfflineRecognizerConfig recognizer_config =
@@ -391,6 +392,15 @@ sherpa_onnx::OfflineRecognizerConfig convertConfig(
recognizer_config.model_config.telespeech_ctc =
SHERPA_ONNX_OR(config->model_config.telespeech_ctc, "");

recognizer_config.model_config.sense_voice.model =
SHERPA_ONNX_OR(config->model_config.sense_voice.model, "");

recognizer_config.model_config.sense_voice.language =
SHERPA_ONNX_OR(config->model_config.sense_voice.language, "");

recognizer_config.model_config.sense_voice.use_itn =
config->model_config.sense_voice.use_itn;

recognizer_config.lm_config.model =
SHERPA_ONNX_OR(config->lm_config.model, "");
recognizer_config.lm_config.scale =
7 changes: 7 additions & 0 deletions sherpa-onnx/c-api/c-api.h
@@ -379,6 +379,12 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineLMConfig {
  float scale;
} SherpaOnnxOfflineLMConfig;

SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSenseVoiceModelConfig {
  const char *model;
  const char *language;
  int32_t use_itn;
} SherpaOnnxOfflineSenseVoiceModelConfig;

SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig {
  SherpaOnnxOfflineTransducerModelConfig transducer;
  SherpaOnnxOfflineParaformerModelConfig paraformer;
@@ -398,6 +404,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig {
  const char *modeling_unit;
  const char *bpe_vocab;
  const char *telespeech_ctc;
  SherpaOnnxOfflineSenseVoiceModelConfig sense_voice;
} SherpaOnnxOfflineModelConfig;

SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig {
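A minimal sketch, not part of this commit, of filling in the new sense_voice field through the C API. CreateOfflineRecognizer is shown in the c-api.cc diff above; the remaining field and helper names are assumed to match the existing c-api.h and should be checked there.

// Sketch: decode with a SenseVoice model via the C API.
#include <cstring>

#include "sherpa-onnx/c-api/c-api.h"

int main() {
  SherpaOnnxOfflineRecognizerConfig config;
  std::memset(&config, 0, sizeof(config));

  config.model_config.sense_voice.model =
      "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx";
  config.model_config.sense_voice.language = "auto";  // or "zh", "en", "ja", "ko", "yue"
  config.model_config.sense_voice.use_itn = 1;
  config.model_config.tokens =
      "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt";
  config.model_config.num_threads = 1;
  config.decoding_method = "greedy_search";

  SherpaOnnxOfflineRecognizer *recognizer = CreateOfflineRecognizer(&config);

  // Create a stream, feed samples, and call DecodeOfflineStream() exactly as
  // with the other offline models, then release everything with the
  // corresponding Destroy* helpers.
  DestroyOfflineRecognizer(recognizer);
  return 0;
}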
2 changes: 2 additions & 0 deletions sherpa-onnx/csrc/CMakeLists.txt
@@ -36,6 +36,8 @@ set(sources
offline-recognizer-impl.cc
offline-recognizer.cc
offline-rnn-lm.cc
offline-sense-voice-model-config.cc
offline-sense-voice-model.cc
offline-stream.cc
offline-tdnn-ctc-model.cc
offline-tdnn-model-config.cc
2 changes: 1 addition & 1 deletion sherpa-onnx/csrc/offline-ct-transformer-model-meta-data.h
@@ -1,4 +1,4 @@
// sherpa-onnx/csrc/offline-ct-transformer-model-meta_data.h
// sherpa-onnx/csrc/offline-ct-transformer-model-meta-data.h
//
// Copyright (c) 2024 Xiaomi Corporation
#ifndef SHERPA_ONNX_CSRC_OFFLINE_CT_TRANSFORMER_MODEL_META_DATA_H_
2 changes: 2 additions & 0 deletions sherpa-onnx/csrc/offline-ctc-model.cc
@@ -93,6 +93,7 @@ static ModelType GetModelType(char *model_data, size_t model_data_length,

std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
const OfflineModelConfig &config) {
// TODO(fangjun): Refactor it. We don't need to use model_type here
ModelType model_type = ModelType::kUnknown;

std::string filename;
@@ -148,6 +149,7 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(

std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
AAssetManager *mgr, const OfflineModelConfig &config) {
// TODO(fangjun): Refactor it. We don't need to use model_type here
ModelType model_type = ModelType::kUnknown;

std::string filename;
14 changes: 11 additions & 3 deletions sherpa-onnx/csrc/offline-model-config.cc
@@ -18,6 +18,7 @@ void OfflineModelConfig::Register(ParseOptions *po) {
tdnn.Register(po);
zipformer_ctc.Register(po);
wenet_ctc.Register(po);
sense_voice.Register(po);

po->Register("telespeech-ctc", &telespeech_ctc,
"Path to model.onnx for telespeech ctc");
@@ -94,15 +95,21 @@ bool OfflineModelConfig::Validate() const {
return wenet_ctc.Validate();
}

if (!sense_voice.model.empty()) {
return sense_voice.Validate();
}

if (!telespeech_ctc.empty() && !FileExists(telespeech_ctc)) {
SHERPA_ONNX_LOGE("telespeech_ctc: '%s' does not exist",
telespeech_ctc.c_str());
return false;
} else {
return true;
}

return transducer.Validate();
if (!transducer.encoder_filename.empty()) {
return transducer.Validate();
}

return true;
}

std::string OfflineModelConfig::ToString() const {
@@ -116,6 +123,7 @@ std::string OfflineModelConfig::ToString() const {
os << "tdnn=" << tdnn.ToString() << ", ";
os << "zipformer_ctc=" << zipformer_ctc.ToString() << ", ";
os << "wenet_ctc=" << wenet_ctc.ToString() << ", ";
os << "sense_voice=" << sense_voice.ToString() << ", ";
os << "telespeech_ctc=\"" << telespeech_ctc << "\", ";
os << "tokens=\"" << tokens << "\", ";
os << "num_threads=" << num_threads << ", ";
4 changes: 4 additions & 0 deletions sherpa-onnx/csrc/offline-model-config.h
@@ -8,6 +8,7 @@

#include "sherpa-onnx/csrc/offline-nemo-enc-dec-ctc-model-config.h"
#include "sherpa-onnx/csrc/offline-paraformer-model-config.h"
#include "sherpa-onnx/csrc/offline-sense-voice-model-config.h"
#include "sherpa-onnx/csrc/offline-tdnn-model-config.h"
#include "sherpa-onnx/csrc/offline-transducer-model-config.h"
#include "sherpa-onnx/csrc/offline-wenet-ctc-model-config.h"
@@ -24,6 +25,7 @@ struct OfflineModelConfig {
OfflineTdnnModelConfig tdnn;
OfflineZipformerCtcModelConfig zipformer_ctc;
OfflineWenetCtcModelConfig wenet_ctc;
OfflineSenseVoiceModelConfig sense_voice;
std::string telespeech_ctc;

std::string tokens;
@@ -53,6 +55,7 @@ struct OfflineModelConfig {
const OfflineTdnnModelConfig &tdnn,
const OfflineZipformerCtcModelConfig &zipformer_ctc,
const OfflineWenetCtcModelConfig &wenet_ctc,
const OfflineSenseVoiceModelConfig &sense_voice,
const std::string &telespeech_ctc,
const std::string &tokens, int32_t num_threads, bool debug,
const std::string &provider, const std::string &model_type,
@@ -65,6 +68,7 @@ struct OfflineModelConfig {
tdnn(tdnn),
zipformer_ctc(zipformer_ctc),
wenet_ctc(wenet_ctc),
sense_voice(sense_voice),
telespeech_ctc(telespeech_ctc),
tokens(tokens),
num_threads(num_threads),
5 changes: 1 addition & 4 deletions sherpa-onnx/csrc/offline-recognizer-ctc-impl.h
@@ -212,10 +212,7 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl {
}
}

OfflineRecognizerConfig GetConfig() const override {
return config_;
}

OfflineRecognizerConfig GetConfig() const override { return config_; }

private:
// Decode a single stream.
45 changes: 45 additions & 0 deletions sherpa-onnx/csrc/offline-recognizer-impl.cc
@@ -21,6 +21,7 @@
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/offline-recognizer-ctc-impl.h"
#include "sherpa-onnx/csrc/offline-recognizer-paraformer-impl.h"
#include "sherpa-onnx/csrc/offline-recognizer-sense-voice-impl.h"
#include "sherpa-onnx/csrc/offline-recognizer-transducer-impl.h"
#include "sherpa-onnx/csrc/offline-recognizer-transducer-nemo-impl.h"
#include "sherpa-onnx/csrc/offline-recognizer-whisper-impl.h"
@@ -31,6 +32,28 @@ namespace sherpa_onnx {

std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
    const OfflineRecognizerConfig &config) {
  if (!config.model_config.sense_voice.model.empty()) {
    return std::make_unique<OfflineRecognizerSenseVoiceImpl>(config);
  }

  if (!config.model_config.paraformer.model.empty()) {
    return std::make_unique<OfflineRecognizerParaformerImpl>(config);
  }

  if (!config.model_config.nemo_ctc.model.empty() ||
      !config.model_config.zipformer_ctc.model.empty() ||
      !config.model_config.tdnn.model.empty() ||
      !config.model_config.wenet_ctc.model.empty()) {
    return std::make_unique<OfflineRecognizerCtcImpl>(config);
  }

  if (!config.model_config.whisper.encoder.empty()) {
    return std::make_unique<OfflineRecognizerWhisperImpl>(config);
  }

  // TODO(fangjun): Refactor it. We only need to use model type for the
  // following models:
  // 1. transducer and nemo_transducer
  if (!config.model_config.model_type.empty()) {
    const auto &model_type = config.model_config.model_type;
    if (model_type == "transducer") {
@@ -180,6 +203,28 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
#if __ANDROID_API__ >= 9
std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
    AAssetManager *mgr, const OfflineRecognizerConfig &config) {
  if (!config.model_config.sense_voice.model.empty()) {
    return std::make_unique<OfflineRecognizerSenseVoiceImpl>(mgr, config);
  }

  if (!config.model_config.paraformer.model.empty()) {
    return std::make_unique<OfflineRecognizerParaformerImpl>(mgr, config);
  }

  if (!config.model_config.nemo_ctc.model.empty() ||
      !config.model_config.zipformer_ctc.model.empty() ||
      !config.model_config.tdnn.model.empty() ||
      !config.model_config.wenet_ctc.model.empty()) {
    return std::make_unique<OfflineRecognizerCtcImpl>(mgr, config);
  }

  if (!config.model_config.whisper.encoder.empty()) {
    return std::make_unique<OfflineRecognizerWhisperImpl>(mgr, config);
  }

  // TODO(fangjun): Refactor it. We only need to use model type for the
  // following models:
  // 1. transducer and nemo_transducer
  if (!config.model_config.model_type.empty()) {
    const auto &model_type = config.model_config.model_type;
    if (model_type == "transducer") {
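To make the new dispatch concrete, here is a minimal sketch, not part of this commit, that drives it from the C++ API: because model_config.sense_voice.model is non-empty, Create() picks OfflineRecognizerSenseVoiceImpl and no --model-type hint is needed. Class and member names are assumed to follow the existing csrc headers.

// Sketch: decode one second of audio with the new SenseVoice C++ runtime.
#include <iostream>
#include <vector>

#include "sherpa-onnx/csrc/offline-recognizer.h"

int main() {
  sherpa_onnx::OfflineRecognizerConfig config;
  config.model_config.sense_voice.model =
      "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx";
  config.model_config.sense_voice.use_itn = true;
  config.model_config.tokens =
      "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt";

  sherpa_onnx::OfflineRecognizer recognizer(config);

  auto stream = recognizer.CreateStream();
  std::vector<float> samples(16000, 0.0f);  // placeholder: 1 s of silence at 16 kHz
  stream->AcceptWaveform(16000, samples.data(),
                         static_cast<int32_t>(samples.size()));
  recognizer.DecodeStream(stream.get());

  auto result = stream->GetResult();
  std::cout << result.text << "\n";
  return 0;
}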