From ffdb23a8ec72ebf06e461837b56af2f53f090ec7 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sun, 21 Jul 2024 21:48:12 +0800 Subject: [PATCH] Add dart API for SenseVoice (#1159) --- .github/scripts/test-dart.sh | 4 ++ dart-api-examples/non-streaming-asr/README.md | 1 + .../non-streaming-asr/bin/sense-voice.dart | 61 +++++++++++++++++++ .../non-streaming-asr/pubspec.yaml | 2 +- .../non-streaming-asr/run-sense-voice.sh | 18 ++++++ dart-api-examples/streaming-asr/pubspec.yaml | 2 +- dart-api-examples/tts/pubspec.yaml | 2 +- dart-api-examples/vad/pubspec.yaml | 2 +- flutter-examples/streaming_asr/pubspec.yaml | 4 +- flutter-examples/tts/pubspec.yaml | 4 +- .../lib/src/offline_recognizer.dart | 31 +++++++++- .../lib/src/sherpa_onnx_bindings.dart | 10 +++ flutter/sherpa_onnx/pubspec.yaml | 12 ++-- .../ios/sherpa_onnx_ios.podspec | 2 +- .../macos/sherpa_onnx_macos.podspec | 2 +- scripts/dart/sherpa-onnx-pubspec.yaml | 2 +- sherpa-onnx/csrc/provider-config.cc | 59 +++++++++--------- 17 files changed, 169 insertions(+), 49 deletions(-) create mode 100644 dart-api-examples/non-streaming-asr/bin/sense-voice.dart create mode 100755 dart-api-examples/non-streaming-asr/run-sense-voice.sh diff --git a/.github/scripts/test-dart.sh b/.github/scripts/test-dart.sh index 711aa631f..05b8fce08 100755 --- a/.github/scripts/test-dart.sh +++ b/.github/scripts/test-dart.sh @@ -6,6 +6,10 @@ cd dart-api-examples pushd non-streaming-asr +echo '----------SenseVoice----------' +./run-sense-voice.sh +rm -rf sherpa-onnx-* + echo '----------NeMo transducer----------' ./run-nemo-transducer.sh rm -rf sherpa-onnx-* diff --git a/dart-api-examples/non-streaming-asr/README.md b/dart-api-examples/non-streaming-asr/README.md index bfa21e4cd..e897d3e97 100644 --- a/dart-api-examples/non-streaming-asr/README.md +++ b/dart-api-examples/non-streaming-asr/README.md @@ -11,4 +11,5 @@ This folder contains examples for non-streaming ASR with Dart API. |[./bin/whisper.dart](./bin/whisper.dart)| Use whisper for speech recognition. See [./run-whisper.sh](./run-whisper.sh)| |[./bin/zipformer-transducer.dart](./bin/zipformer-transducer.dart)| Use a zipformer transducer for speech recognition. See [./run-zipformer-transducer.sh](./run-zipformer-transducer.sh)| |[./bin/vad-with-paraformer.dart](./bin/vad-with-paraformer.dart)| Use a [silero-vad](https://github.com/snakers4/silero-vad) with paraformer for speech recognition. See [./run-vad-with-paraformer.sh](./run-vad-with-paraformer.sh)| +|[./bin/sense-voice.dart](./bin/sense-voice.dart)| Use a SenseVoice CTC model for speech recognition. See [./run-sense-voice.sh](./run-sense-voice.sh)| diff --git a/dart-api-examples/non-streaming-asr/bin/sense-voice.dart b/dart-api-examples/non-streaming-asr/bin/sense-voice.dart new file mode 100644 index 000000000..055cc9a6c --- /dev/null +++ b/dart-api-examples/non-streaming-asr/bin/sense-voice.dart @@ -0,0 +1,61 @@ +// Copyright (c) 2024 Xiaomi Corporation +import 'dart:io'; +import 'dart:typed_data'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('model', help: 'Path to the paraformer model') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption('language', + help: 'auto, zh, en, ja, ko, yue, or leave it empty to use auto', + defaultsTo: '') + ..addOption('use-itn', + help: 'true to use inverse text normalization', defaultsTo: 'false') + ..addOption('input-wav', help: 'Path to input.wav to transcribe'); + + final res = parser.parse(arguments); + if (res['model'] == null || + res['tokens'] == null || + res['input-wav'] == null) { + print(parser.usage); + exit(1); + } + + final model = res['model'] as String; + final tokens = res['tokens'] as String; + final inputWav = res['input-wav'] as String; + final language = res['language'] as String; + final useItn = (res['use-itn'] as String).toLowerCase() == 'true'; + + final senseVoice = sherpa_onnx.OfflineSenseVoiceModelConfig( + model: model, language: language, useInverseTextNormalization: useItn); + + final modelConfig = sherpa_onnx.OfflineModelConfig( + senseVoice: senseVoice, + tokens: tokens, + debug: true, + numThreads: 1, + ); + final config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig); + final recognizer = sherpa_onnx.OfflineRecognizer(config); + + final waveData = sherpa_onnx.readWave(inputWav); + final stream = recognizer.createStream(); + + stream.acceptWaveform( + samples: waveData.samples, sampleRate: waveData.sampleRate); + recognizer.decode(stream); + + final result = recognizer.getResult(stream); + print(result.text); + + stream.free(); + recognizer.free(); +} diff --git a/dart-api-examples/non-streaming-asr/pubspec.yaml b/dart-api-examples/non-streaming-asr/pubspec.yaml index b916bee4c..5821f09ff 100644 --- a/dart-api-examples/non-streaming-asr/pubspec.yaml +++ b/dart-api-examples/non-streaming-asr/pubspec.yaml @@ -10,7 +10,7 @@ environment: # Add regular dependencies here. dependencies: - sherpa_onnx: ^1.10.16 + sherpa_onnx: ^1.10.17 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/non-streaming-asr/run-sense-voice.sh b/dart-api-examples/non-streaming-asr/run-sense-voice.sh new file mode 100755 index 000000000..4298f93cf --- /dev/null +++ b/dart-api-examples/non-streaming-asr/run-sense-voice.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ ! -f ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +fi + +dart run \ + ./bin/sense-voice.dart \ + --model ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx \ + --tokens ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt \ + --use-itn true \ + --input-wav ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav diff --git a/dart-api-examples/streaming-asr/pubspec.yaml b/dart-api-examples/streaming-asr/pubspec.yaml index 6722c1804..edf64e3bf 100644 --- a/dart-api-examples/streaming-asr/pubspec.yaml +++ b/dart-api-examples/streaming-asr/pubspec.yaml @@ -11,7 +11,7 @@ environment: # Add regular dependencies here. dependencies: - sherpa_onnx: ^1.10.16 + sherpa_onnx: ^1.10.17 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/tts/pubspec.yaml b/dart-api-examples/tts/pubspec.yaml index 3383c983a..68e4bae47 100644 --- a/dart-api-examples/tts/pubspec.yaml +++ b/dart-api-examples/tts/pubspec.yaml @@ -8,7 +8,7 @@ environment: # Add regular dependencies here. dependencies: - sherpa_onnx: ^1.10.16 + sherpa_onnx: ^1.10.17 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/vad/pubspec.yaml b/dart-api-examples/vad/pubspec.yaml index 81c2a8588..91f4c27dc 100644 --- a/dart-api-examples/vad/pubspec.yaml +++ b/dart-api-examples/vad/pubspec.yaml @@ -9,7 +9,7 @@ environment: sdk: ^3.4.0 dependencies: - sherpa_onnx: ^1.10.16 + sherpa_onnx: ^1.10.17 path: ^1.9.0 args: ^2.5.0 diff --git a/flutter-examples/streaming_asr/pubspec.yaml b/flutter-examples/streaming_asr/pubspec.yaml index 20188a053..df6463b06 100644 --- a/flutter-examples/streaming_asr/pubspec.yaml +++ b/flutter-examples/streaming_asr/pubspec.yaml @@ -5,7 +5,7 @@ description: > publish_to: 'none' -version: 1.10.16 +version: 1.10.17 topics: - speech-recognition @@ -30,7 +30,7 @@ dependencies: record: ^5.1.0 url_launcher: ^6.2.6 - sherpa_onnx: ^1.10.16 + sherpa_onnx: ^1.10.17 # sherpa_onnx: # path: ../../flutter/sherpa_onnx diff --git a/flutter-examples/tts/pubspec.yaml b/flutter-examples/tts/pubspec.yaml index 776c96063..72469714d 100644 --- a/flutter-examples/tts/pubspec.yaml +++ b/flutter-examples/tts/pubspec.yaml @@ -5,7 +5,7 @@ description: > publish_to: 'none' # Remove this line if you wish to publish to pub.dev -version: 1.10.16 +version: 1.10.17 environment: sdk: '>=3.4.0 <4.0.0' @@ -17,7 +17,7 @@ dependencies: cupertino_icons: ^1.0.6 path_provider: ^2.1.3 path: ^1.9.0 - sherpa_onnx: ^1.10.16 + sherpa_onnx: ^1.10.17 url_launcher: ^6.2.6 audioplayers: ^5.0.0 diff --git a/flutter/sherpa_onnx/lib/src/offline_recognizer.dart b/flutter/sherpa_onnx/lib/src/offline_recognizer.dart index ca2a4dbf0..a6ecb0dcc 100644 --- a/flutter/sherpa_onnx/lib/src/offline_recognizer.dart +++ b/flutter/sherpa_onnx/lib/src/offline_recognizer.dart @@ -79,6 +79,23 @@ class OfflineTdnnModelConfig { final String model; } +class OfflineSenseVoiceModelConfig { + const OfflineSenseVoiceModelConfig({ + this.model = '', + this.language = '', + this.useInverseTextNormalization = false, + }); + + @override + String toString() { + return 'OfflineSenseVoiceModelConfig(model: $model, language: $language, useInverseTextNormalization: $useInverseTextNormalization)'; + } + + final String model; + final String language; + final bool useInverseTextNormalization; +} + class OfflineLMConfig { const OfflineLMConfig({this.model = '', this.scale = 1.0}); @@ -98,6 +115,7 @@ class OfflineModelConfig { this.nemoCtc = const OfflineNemoEncDecCtcModelConfig(), this.whisper = const OfflineWhisperModelConfig(), this.tdnn = const OfflineTdnnModelConfig(), + this.senseVoice = const OfflineSenseVoiceModelConfig(), required this.tokens, this.numThreads = 1, this.debug = true, @@ -110,7 +128,7 @@ class OfflineModelConfig { @override String toString() { - return 'OfflineModelConfig(transducer: $transducer, paraformer: $paraformer, nemoCtc: $nemoCtc, whisper: $whisper, tdnn: $tdnn, tokens: $tokens, numThreads: $numThreads, debug: $debug, provider: $provider, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab, telespeechCtc: $telespeechCtc)'; + return 'OfflineModelConfig(transducer: $transducer, paraformer: $paraformer, nemoCtc: $nemoCtc, whisper: $whisper, tdnn: $tdnn, senseVoice: $senseVoice, tokens: $tokens, numThreads: $numThreads, debug: $debug, provider: $provider, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab, telespeechCtc: $telespeechCtc)'; } final OfflineTransducerModelConfig transducer; @@ -118,6 +136,7 @@ class OfflineModelConfig { final OfflineNemoEncDecCtcModelConfig nemoCtc; final OfflineWhisperModelConfig whisper; final OfflineTdnnModelConfig tdnn; + final OfflineSenseVoiceModelConfig senseVoice; final String tokens; final int numThreads; @@ -219,6 +238,14 @@ class OfflineRecognizer { c.ref.model.tdnn.model = config.model.tdnn.model.toNativeUtf8(); + c.ref.model.senseVoice.model = config.model.senseVoice.model.toNativeUtf8(); + + c.ref.model.senseVoice.language = + config.model.senseVoice.language.toNativeUtf8(); + + c.ref.model.senseVoice.useInverseTextNormalization = + config.model.senseVoice.useInverseTextNormalization ? 1 : 0; + c.ref.model.tokens = config.model.tokens.toNativeUtf8(); c.ref.model.numThreads = config.model.numThreads; @@ -254,6 +281,8 @@ class OfflineRecognizer { calloc.free(c.ref.model.modelType); calloc.free(c.ref.model.provider); calloc.free(c.ref.model.tokens); + calloc.free(c.ref.model.senseVoice.language); + calloc.free(c.ref.model.senseVoice.model); calloc.free(c.ref.model.tdnn.model); calloc.free(c.ref.model.whisper.task); calloc.free(c.ref.model.whisper.language); diff --git a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart index c00e337ac..72dfc96b1 100644 --- a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart +++ b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart @@ -87,6 +87,14 @@ final class SherpaOnnxOfflineTdnnModelConfig extends Struct { external Pointer model; } +final class SherpaOnnxOfflineSenseVoiceModelConfig extends Struct { + external Pointer model; + external Pointer language; + + @Int32() + external int useInverseTextNormalization; +} + final class SherpaOnnxOfflineLMConfig extends Struct { external Pointer model; @@ -115,6 +123,8 @@ final class SherpaOnnxOfflineModelConfig extends Struct { external Pointer modelingUnit; external Pointer bpeVocab; external Pointer telespeechCtc; + + external SherpaOnnxOfflineSenseVoiceModelConfig senseVoice; } final class SherpaOnnxOfflineRecognizerConfig extends Struct { diff --git a/flutter/sherpa_onnx/pubspec.yaml b/flutter/sherpa_onnx/pubspec.yaml index d4833805e..0064476de 100644 --- a/flutter/sherpa_onnx/pubspec.yaml +++ b/flutter/sherpa_onnx/pubspec.yaml @@ -17,7 +17,7 @@ topics: - voice-activity-detection # remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec -version: 1.10.16 +version: 1.10.17 homepage: https://github.com/k2-fsa/sherpa-onnx @@ -30,19 +30,19 @@ dependencies: flutter: sdk: flutter - sherpa_onnx_android: ^1.10.16 + sherpa_onnx_android: ^1.10.17 # path: ../sherpa_onnx_android - sherpa_onnx_macos: ^1.10.16 + sherpa_onnx_macos: ^1.10.17 # path: ../sherpa_onnx_macos - sherpa_onnx_linux: ^1.10.16 + sherpa_onnx_linux: ^1.10.17 # path: ../sherpa_onnx_linux # - sherpa_onnx_windows: ^1.10.16 + sherpa_onnx_windows: ^1.10.17 # path: ../sherpa_onnx_windows - sherpa_onnx_ios: ^1.10.16 + sherpa_onnx_ios: ^1.10.17 # sherpa_onnx_ios: # path: ../sherpa_onnx_ios diff --git a/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec b/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec index df087dcec..54bf687ba 100644 --- a/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec +++ b/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec @@ -7,7 +7,7 @@ # https://groups.google.com/g/dart-ffi/c/nUATMBy7r0c Pod::Spec.new do |s| s.name = 'sherpa_onnx_ios' - s.version = '1.10.16' + s.version = '1.10.17' s.summary = 'A new Flutter FFI plugin project.' s.description = <<-DESC A new Flutter FFI plugin project. diff --git a/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec b/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec index 0b7e60c3a..2e645caa1 100644 --- a/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec +++ b/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec @@ -4,7 +4,7 @@ # Pod::Spec.new do |s| s.name = 'sherpa_onnx_macos' - s.version = '1.10.16' + s.version = '1.10.17' s.summary = 'sherpa-onnx Flutter FFI plugin project.' s.description = <<-DESC sherpa-onnx Flutter FFI plugin project. diff --git a/scripts/dart/sherpa-onnx-pubspec.yaml b/scripts/dart/sherpa-onnx-pubspec.yaml index a0aeb0e5c..f4563eada 100644 --- a/scripts/dart/sherpa-onnx-pubspec.yaml +++ b/scripts/dart/sherpa-onnx-pubspec.yaml @@ -17,7 +17,7 @@ topics: - voice-activity-detection # remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx.podspec -version: 1.10.16 +version: 1.10.17 homepage: https://github.com/k2-fsa/sherpa-onnx diff --git a/sherpa-onnx/csrc/provider-config.cc b/sherpa-onnx/csrc/provider-config.cc index 3c8f0ee47..1db62aa6b 100644 --- a/sherpa-onnx/csrc/provider-config.cc +++ b/sherpa-onnx/csrc/provider-config.cc @@ -13,14 +13,15 @@ namespace sherpa_onnx { void CudaConfig::Register(ParseOptions *po) { po->Register("cuda-cudnn-conv-algo-search", &cudnn_conv_algo_search, - "CuDNN convolution algrorithm search"); + "CuDNN convolution algrorithm search"); } bool CudaConfig::Validate() const { if (cudnn_conv_algo_search < 1 || cudnn_conv_algo_search > 3) { - SHERPA_ONNX_LOGE("cudnn_conv_algo_search: '%d' is not a valid option." - "Options : [1,3]. Check OnnxRT docs", - cudnn_conv_algo_search); + SHERPA_ONNX_LOGE( + "cudnn_conv_algo_search: '%d' is not a valid option." + "Options : [1,3]. Check OnnxRT docs", + cudnn_conv_algo_search); return false; } return true; @@ -37,41 +38,41 @@ std::string CudaConfig::ToString() const { void TensorrtConfig::Register(ParseOptions *po) { po->Register("trt-max-workspace-size", &trt_max_workspace_size, - "Set TensorRT EP GPU memory usage limit."); + "Set TensorRT EP GPU memory usage limit."); po->Register("trt-max-partition-iterations", &trt_max_partition_iterations, - "Limit partitioning iterations for model conversion."); + "Limit partitioning iterations for model conversion."); po->Register("trt-min-subgraph-size", &trt_min_subgraph_size, - "Set minimum size for subgraphs in partitioning."); + "Set minimum size for subgraphs in partitioning."); po->Register("trt-fp16-enable", &trt_fp16_enable, - "Enable FP16 precision for faster performance."); + "Enable FP16 precision for faster performance."); po->Register("trt-detailed-build-log", &trt_detailed_build_log, - "Enable detailed logging of build steps."); + "Enable detailed logging of build steps."); po->Register("trt-engine-cache-enable", &trt_engine_cache_enable, - "Enable caching of TensorRT engines."); + "Enable caching of TensorRT engines."); po->Register("trt-timing-cache-enable", &trt_timing_cache_enable, - "Enable use of timing cache to speed up builds."); + "Enable use of timing cache to speed up builds."); po->Register("trt-engine-cache-path", &trt_engine_cache_path, - "Set path to store cached TensorRT engines."); + "Set path to store cached TensorRT engines."); po->Register("trt-timing-cache-path", &trt_timing_cache_path, - "Set path for storing timing cache."); + "Set path for storing timing cache."); po->Register("trt-dump-subgraphs", &trt_dump_subgraphs, - "Dump optimized subgraphs for debugging."); + "Dump optimized subgraphs for debugging."); } bool TensorrtConfig::Validate() const { if (trt_max_workspace_size < 0) { - SHERPA_ONNX_LOGE("trt_max_workspace_size: %lld is not valid.", - trt_max_workspace_size); + SHERPA_ONNX_LOGE("trt_max_workspace_size: %ld is not valid.", + trt_max_workspace_size); return false; } if (trt_max_partition_iterations < 0) { SHERPA_ONNX_LOGE("trt_max_partition_iterations: %d is not valid.", - trt_max_partition_iterations); + trt_max_partition_iterations); return false; } if (trt_min_subgraph_size < 0) { SHERPA_ONNX_LOGE("trt_min_subgraph_size: %d is not valid.", - trt_min_subgraph_size); + trt_min_subgraph_size); return false; } @@ -83,23 +84,19 @@ std::string TensorrtConfig::ToString() const { os << "TensorrtConfig("; os << "trt_max_workspace_size=" << trt_max_workspace_size << ", "; - os << "trt_max_partition_iterations=" - << trt_max_partition_iterations << ", "; + os << "trt_max_partition_iterations=" << trt_max_partition_iterations << ", "; os << "trt_min_subgraph_size=" << trt_min_subgraph_size << ", "; - os << "trt_fp16_enable=\"" - << (trt_fp16_enable? "True" : "False") << "\", "; + os << "trt_fp16_enable=\"" << (trt_fp16_enable ? "True" : "False") << "\", "; os << "trt_detailed_build_log=\"" - << (trt_detailed_build_log? "True" : "False") << "\", "; + << (trt_detailed_build_log ? "True" : "False") << "\", "; os << "trt_engine_cache_enable=\"" - << (trt_engine_cache_enable? "True" : "False") << "\", "; - os << "trt_engine_cache_path=\"" - << trt_engine_cache_path.c_str() << "\", "; + << (trt_engine_cache_enable ? "True" : "False") << "\", "; + os << "trt_engine_cache_path=\"" << trt_engine_cache_path.c_str() << "\", "; os << "trt_timing_cache_enable=\"" - << (trt_timing_cache_enable? "True" : "False") << "\", "; - os << "trt_timing_cache_path=\"" - << trt_timing_cache_path.c_str() << "\","; - os << "trt_dump_subgraphs=\"" - << (trt_dump_subgraphs? "True" : "False") << "\" )"; + << (trt_timing_cache_enable ? "True" : "False") << "\", "; + os << "trt_timing_cache_path=\"" << trt_timing_cache_path.c_str() << "\","; + os << "trt_dump_subgraphs=\"" << (trt_dump_subgraphs ? "True" : "False") + << "\" )"; return os.str(); }