From 5ed8e31868cd763a0f65f13e7c5878549a1d2c11 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sat, 24 Aug 2024 23:05:54 +0800 Subject: [PATCH] Add VAD and keyword spotting for the Node package with WebAssembly (#1286) --- .github/scripts/test-nodejs-npm.sh | 22 ++++ .github/workflows/npm.yaml | 31 ++++- .github/workflows/test-nodejs.yaml | 4 + CHANGELOG.md | 11 ++ CMakeLists.txt | 3 +- .../add-punctuations/pubspec.yaml | 2 +- dart-api-examples/audio-tagging/pubspec.yaml | 2 +- .../keyword-spotter/pubspec.yaml | 2 +- .../non-streaming-asr/pubspec.yaml | 2 +- .../speaker-identification/pubspec.yaml | 2 +- dart-api-examples/streaming-asr/pubspec.yaml | 2 +- dart-api-examples/tts/pubspec.yaml | 2 +- .../vad-with-non-streaming-asr/pubspec.yaml | 2 +- dart-api-examples/vad/pubspec.yaml | 2 +- flutter-examples/streaming_asr/pubspec.yaml | 4 +- flutter-examples/tts/pubspec.yaml | 4 +- flutter/sherpa_onnx/pubspec.yaml | 12 +- .../ios/sherpa_onnx_ios.podspec | 2 +- .../macos/sherpa_onnx_macos.podspec | 2 +- new-release.sh | 7 + nodejs-addon-examples/package.json | 2 +- .../test_keyword_spotter_transducer.js | 2 +- ...test_vad_with_non_streaming_asr_whisper.js | 4 +- .../test-keyword-spotter-transducer.js | 49 +++++++ nodejs-examples/test-offline-nemo-ctc.js | 91 ++----------- .../test-offline-paraformer-itn.js | 76 +---------- nodejs-examples/test-offline-paraformer.js | 80 +---------- nodejs-examples/test-offline-sense-voice.js | 79 +---------- nodejs-examples/test-offline-transducer.js | 81 +----------- nodejs-examples/test-offline-whisper.js | 81 +----------- .../test-online-paraformer-microphone.js | 12 -- nodejs-examples/test-online-paraformer.js | 16 --- nodejs-examples/test-online-transducer.js | 16 --- ...test-vad-with-non-streaming-asr-whisper.js | 124 ++++++++++++++++++ scripts/nodejs/.gitignore | 2 + scripts/nodejs/index.js | 28 ++++ wasm/asr/sherpa-onnx-asr.js | 2 +- wasm/kws/sherpa-onnx-kws.js | 30 +++-- wasm/nodejs/CMakeLists.txt | 28 ++++ wasm/nodejs/sherpa-onnx-wave.js | 57 ++++++++ 40 files changed, 456 insertions(+), 524 deletions(-) create mode 100755 new-release.sh create mode 100644 nodejs-examples/test-keyword-spotter-transducer.js create mode 100644 nodejs-examples/test-vad-with-non-streaming-asr-whisper.js create mode 100644 wasm/nodejs/sherpa-onnx-wave.js diff --git a/.github/scripts/test-nodejs-npm.sh b/.github/scripts/test-nodejs-npm.sh index 006be8b0d..c41a0de65 100755 --- a/.github/scripts/test-nodejs-npm.sh +++ b/.github/scripts/test-nodejs-npm.sh @@ -9,6 +9,28 @@ git status ls -lh ls -lh node_modules +echo '-----vad+whisper----------' + +curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 +tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2 +rm sherpa-onnx-whisper-tiny.en.tar.bz2 + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +node ./test-vad-with-non-streaming-asr-whisper.js +rm Obama.wav +rm silero_vad.onnx +rm -rf sherpa-onnx-whisper-tiny.en + +echo "----------keyword spotting----------" + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/kws-models/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2 +tar xvf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2 +rm sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2 + +node ./test-keyword-spotter-transducer.js +rm -rf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01 + 
# offline asr # curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 diff --git a/.github/workflows/npm.yaml b/.github/workflows/npm.yaml index c608a307f..9fd9f17a3 100644 --- a/.github/workflows/npm.yaml +++ b/.github/workflows/npm.yaml @@ -1,6 +1,9 @@ name: npm on: + push: + branches: + - npm workflow_dispatch: concurrency: @@ -27,6 +30,9 @@ jobs: - name: Install emsdk uses: mymindstorm/setup-emsdk@v14 + with: + version: 3.1.51 + actions-cache-folder: 'emsdk-cache' - name: View emsdk version shell: bash @@ -51,8 +57,6 @@ jobs: - name: Build nodejs package shell: bash - env: - NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} run: | ./build-wasm-simd-nodejs.sh cp -v build-wasm-simd-nodejs/install/bin/wasm/nodejs/*.js ./scripts/nodejs/ @@ -71,6 +75,29 @@ jobs: rm package.json.bak + - name: Collect files + shell: bash + run: | + dst=sherpa-onnx-wasm-nodejs + mkdir $dst + cp -v scripts/nodejs/* $dst + tar cvjf $dst.tar.bz2 $dst + + echo "---" + ls -h $dst + + - uses: actions/upload-artifact@v4 + with: + name: sherpa-onnx-wasm-nodejs + path: ./*.tar.bz2 + + - name: Build nodejs package + shell: bash + env: + NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} + run: | + cd scripts/nodejs + git diff npm install diff --git a/.github/workflows/test-nodejs.yaml b/.github/workflows/test-nodejs.yaml index 21a0d0a5c..25f3c38fd 100644 --- a/.github/workflows/test-nodejs.yaml +++ b/.github/workflows/test-nodejs.yaml @@ -55,6 +55,9 @@ jobs: - name: Install emsdk uses: mymindstorm/setup-emsdk@v14 + with: + version: 3.1.51 + actions-cache-folder: 'emsdk-cache' - name: View emsdk version shell: bash @@ -109,6 +112,7 @@ jobs: node --version npm --version export d=scripts/nodejs + cat $d/index.js pushd $d npm install diff --git a/CHANGELOG.md b/CHANGELOG.md index 8977bae89..f78268086 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,14 @@ +## 1.10.23 + +* flutter: add lang, emotion, event to OfflineRecognizerResult (#1268) +* Use a separate thread to initialize models for lazarus examples. (#1270) +* Object pascal examples for recording and playing audio with portaudio. (#1271) +* Text to speech API for Object Pascal. (#1273) +* update kotlin api for better release native object and add user-friendly apis. 
(#1275) +* Update wave-reader.cc to support 8/16/32-bit waves (#1278) +* Add WebAssembly for VAD (#1281) +* WebAssembly example for VAD + Non-streaming ASR (#1284) + ## 1.10.22 * Add Pascal API for reading wave files (#1243) diff --git a/CMakeLists.txt b/CMakeLists.txt index b71bb133d..4b94e2351 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,7 @@ project(sherpa-onnx) # ./nodejs-addon-examples # ./dart-api-examples/ # ./CHANGELOG.md -set(SHERPA_ONNX_VERSION "1.10.22") +set(SHERPA_ONNX_VERSION "1.10.23") # Disable warning about # @@ -206,6 +206,7 @@ if(SHERPA_ONNX_ENABLE_WASM_NODEJS) if(NOT SHERPA_ONNX_ENABLE_WASM) message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_WASM to ON if you enable WASM for NodeJS") endif() + add_definitions(-DSHERPA_ONNX_ENABLE_WASM_KWS=1) endif() if(SHERPA_ONNX_ENABLE_WASM) diff --git a/dart-api-examples/add-punctuations/pubspec.yaml b/dart-api-examples/add-punctuations/pubspec.yaml index 5efb28b42..91fde0353 100644 --- a/dart-api-examples/add-punctuations/pubspec.yaml +++ b/dart-api-examples/add-punctuations/pubspec.yaml @@ -9,7 +9,7 @@ environment: sdk: ^3.4.0 dependencies: - sherpa_onnx: ^1.10.22 + sherpa_onnx: ^1.10.23 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/audio-tagging/pubspec.yaml b/dart-api-examples/audio-tagging/pubspec.yaml index de9c515c1..69235c159 100644 --- a/dart-api-examples/audio-tagging/pubspec.yaml +++ b/dart-api-examples/audio-tagging/pubspec.yaml @@ -9,7 +9,7 @@ environment: sdk: ^3.4.0 dependencies: - sherpa_onnx: ^1.10.22 + sherpa_onnx: ^1.10.23 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/keyword-spotter/pubspec.yaml b/dart-api-examples/keyword-spotter/pubspec.yaml index 93cd09173..7b78341c7 100644 --- a/dart-api-examples/keyword-spotter/pubspec.yaml +++ b/dart-api-examples/keyword-spotter/pubspec.yaml @@ -9,7 +9,7 @@ environment: sdk: ^3.4.0 dependencies: - sherpa_onnx: ^1.10.22 + sherpa_onnx: ^1.10.23 # sherpa_onnx: # path: ../../flutter/sherpa_onnx path: ^1.9.0 diff --git a/dart-api-examples/non-streaming-asr/pubspec.yaml b/dart-api-examples/non-streaming-asr/pubspec.yaml index 4ecf29778..82c359d24 100644 --- a/dart-api-examples/non-streaming-asr/pubspec.yaml +++ b/dart-api-examples/non-streaming-asr/pubspec.yaml @@ -10,7 +10,7 @@ environment: # Add regular dependencies here. dependencies: - sherpa_onnx: ^1.10.22 + sherpa_onnx: ^1.10.23 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/speaker-identification/pubspec.yaml b/dart-api-examples/speaker-identification/pubspec.yaml index 2e3c4b7ef..bd3e3e5be 100644 --- a/dart-api-examples/speaker-identification/pubspec.yaml +++ b/dart-api-examples/speaker-identification/pubspec.yaml @@ -9,7 +9,7 @@ environment: sdk: ^3.4.0 dependencies: - sherpa_onnx: ^1.10.22 + sherpa_onnx: ^1.10.23 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/streaming-asr/pubspec.yaml b/dart-api-examples/streaming-asr/pubspec.yaml index 7e0856290..b5c289eb5 100644 --- a/dart-api-examples/streaming-asr/pubspec.yaml +++ b/dart-api-examples/streaming-asr/pubspec.yaml @@ -11,7 +11,7 @@ environment: # Add regular dependencies here. dependencies: - sherpa_onnx: ^1.10.22 + sherpa_onnx: ^1.10.23 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/tts/pubspec.yaml b/dart-api-examples/tts/pubspec.yaml index da6cebafe..89c648cfd 100644 --- a/dart-api-examples/tts/pubspec.yaml +++ b/dart-api-examples/tts/pubspec.yaml @@ -8,7 +8,7 @@ environment: # Add regular dependencies here. 
dependencies: - sherpa_onnx: ^1.10.22 + sherpa_onnx: ^1.10.23 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/vad-with-non-streaming-asr/pubspec.yaml b/dart-api-examples/vad-with-non-streaming-asr/pubspec.yaml index 4f3e37143..a08f811e0 100644 --- a/dart-api-examples/vad-with-non-streaming-asr/pubspec.yaml +++ b/dart-api-examples/vad-with-non-streaming-asr/pubspec.yaml @@ -10,7 +10,7 @@ environment: sdk: ^3.4.0 dependencies: - sherpa_onnx: ^1.10.22 + sherpa_onnx: ^1.10.23 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/vad/pubspec.yaml b/dart-api-examples/vad/pubspec.yaml index 9cc6186e2..c670af7da 100644 --- a/dart-api-examples/vad/pubspec.yaml +++ b/dart-api-examples/vad/pubspec.yaml @@ -9,7 +9,7 @@ environment: sdk: ^3.4.0 dependencies: - sherpa_onnx: ^1.10.22 + sherpa_onnx: ^1.10.23 path: ^1.9.0 args: ^2.5.0 diff --git a/flutter-examples/streaming_asr/pubspec.yaml b/flutter-examples/streaming_asr/pubspec.yaml index f3d67be19..bafe4068f 100644 --- a/flutter-examples/streaming_asr/pubspec.yaml +++ b/flutter-examples/streaming_asr/pubspec.yaml @@ -5,7 +5,7 @@ description: > publish_to: 'none' -version: 1.10.22 +version: 1.10.23 topics: - speech-recognition @@ -30,7 +30,7 @@ dependencies: record: ^5.1.0 url_launcher: ^6.2.6 - sherpa_onnx: ^1.10.22 + sherpa_onnx: ^1.10.23 # sherpa_onnx: # path: ../../flutter/sherpa_onnx diff --git a/flutter-examples/tts/pubspec.yaml b/flutter-examples/tts/pubspec.yaml index 745f8faf4..f9eb64da6 100644 --- a/flutter-examples/tts/pubspec.yaml +++ b/flutter-examples/tts/pubspec.yaml @@ -5,7 +5,7 @@ description: > publish_to: 'none' # Remove this line if you wish to publish to pub.dev -version: 1.10.22 +version: 1.10.23 environment: sdk: '>=3.4.0 <4.0.0' @@ -17,7 +17,7 @@ dependencies: cupertino_icons: ^1.0.6 path_provider: ^2.1.3 path: ^1.9.0 - sherpa_onnx: ^1.10.22 + sherpa_onnx: ^1.10.23 url_launcher: ^6.2.6 audioplayers: ^5.0.0 diff --git a/flutter/sherpa_onnx/pubspec.yaml b/flutter/sherpa_onnx/pubspec.yaml index 22765831d..106607999 100644 --- a/flutter/sherpa_onnx/pubspec.yaml +++ b/flutter/sherpa_onnx/pubspec.yaml @@ -17,7 +17,7 @@ topics: - voice-activity-detection # remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec -version: 1.10.22 +version: 1.10.23 homepage: https://github.com/k2-fsa/sherpa-onnx @@ -30,23 +30,23 @@ dependencies: flutter: sdk: flutter - sherpa_onnx_android: ^1.10.22 + sherpa_onnx_android: ^1.10.23 # sherpa_onnx_android: # path: ../sherpa_onnx_android - sherpa_onnx_macos: ^1.10.22 + sherpa_onnx_macos: ^1.10.23 # sherpa_onnx_macos: # path: ../sherpa_onnx_macos - sherpa_onnx_linux: ^1.10.22 + sherpa_onnx_linux: ^1.10.23 # sherpa_onnx_linux: # path: ../sherpa_onnx_linux # - sherpa_onnx_windows: ^1.10.22 + sherpa_onnx_windows: ^1.10.23 # sherpa_onnx_windows: # path: ../sherpa_onnx_windows - sherpa_onnx_ios: ^1.10.22 + sherpa_onnx_ios: ^1.10.23 # sherpa_onnx_ios: # path: ../sherpa_onnx_ios diff --git a/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec b/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec index c3e261387..bc91bc803 100644 --- a/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec +++ b/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec @@ -7,7 +7,7 @@ # https://groups.google.com/g/dart-ffi/c/nUATMBy7r0c Pod::Spec.new do |s| s.name = 'sherpa_onnx_ios' - s.version = '1.10.22' + s.version = '1.10.23' s.summary = 'A new Flutter FFI plugin project.' s.description = <<-DESC A new Flutter FFI plugin project. 
diff --git a/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec b/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
index 956b5c91b..cfbabb144 100644
--- a/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
+++ b/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
@@ -4,7 +4,7 @@
 #
 Pod::Spec.new do |s|
   s.name = 'sherpa_onnx_macos'
-  s.version = '1.10.22'
+  s.version = '1.10.23'
   s.summary = 'sherpa-onnx Flutter FFI plugin project.'
   s.description = <<-DESC
 sherpa-onnx Flutter FFI plugin project.
diff --git a/new-release.sh b/new-release.sh
new file mode 100755
index 000000000..056107fd6
--- /dev/null
+++ b/new-release.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+find flutter -name '*.yaml' -type f -exec sed -i.bak 's/1\.10\.22/1\.10\.23/g' {} \;
+find dart-api-examples -name '*.yaml' -type f -exec sed -i.bak 's/1\.10\.22/1\.10\.23/g' {} \;
+find flutter-examples -name '*.yaml' -type f -exec sed -i.bak 's/1\.10\.22/1\.10\.23/g' {} \;
+find flutter -name '*.podspec' -type f -exec sed -i.bak 's/1\.10\.22/1\.10\.23/g' {} \;
+find nodejs-addon-examples -name package.json -type f -exec sed -i.bak 's/1\.10\.22/1\.10\.23/g' {} \;
diff --git a/nodejs-addon-examples/package.json b/nodejs-addon-examples/package.json
index 744d08ea0..d80d25108 100644
--- a/nodejs-addon-examples/package.json
+++ b/nodejs-addon-examples/package.json
@@ -1,5 +1,5 @@
 {
   "dependencies": {
-    "sherpa-onnx-node": "^1.10.22"
+    "sherpa-onnx-node": "^1.10.23"
   }
 }
diff --git a/nodejs-addon-examples/test_keyword_spotter_transducer.js b/nodejs-addon-examples/test_keyword_spotter_transducer.js
index 31fa53046..9e05a9116 100644
--- a/nodejs-addon-examples/test_keyword_spotter_transducer.js
+++ b/nodejs-addon-examples/test_keyword_spotter_transducer.js
@@ -42,11 +42,11 @@ stream.acceptWaveform({samples: tailPadding, sampleRate: wave.sampleRate});
 
 const detectedKeywords = [];
 while (kws.isReady(stream)) {
+  kws.decode(stream);
   const keyword = kws.getResult(stream).keyword;
   if (keyword != '') {
     detectedKeywords.push(keyword);
   }
-  kws.decode(stream);
 }
 
 let stop = Date.now();
diff --git a/nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js b/nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js
index 20e17db78..6f3783e7c 100644
--- a/nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js
+++ b/nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js
@@ -120,8 +120,8 @@ console.log('Done')
 const elapsed_seconds = (stop - start) / 1000;
 const duration = wave.samples.length / wave.sampleRate;
 const real_time_factor = elapsed_seconds / duration;
-console.log('Wave duration', duration.toFixed(3), 'secodns')
-console.log('Elapsed', elapsed_seconds.toFixed(3), 'secodns')
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
 console.log(
     `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
     real_time_factor.toFixed(3))
diff --git a/nodejs-examples/test-keyword-spotter-transducer.js b/nodejs-examples/test-keyword-spotter-transducer.js
new file mode 100644
index 000000000..9ead2b191
--- /dev/null
+++ b/nodejs-examples/test-keyword-spotter-transducer.js
@@ -0,0 +1,49 @@
+// Copyright (c) 2024 Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx');
+
+function createKeywordSpotter() {
+  // Please download test files from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/kws-models
+  const config = {
+    'modelConfig': {
+      'transducer': {
+        'encoder':
'./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/encoder-epoch-12-avg-2-chunk-16-left-64.onnx', + 'decoder': + './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/decoder-epoch-12-avg-2-chunk-16-left-64.onnx', + 'joiner': + './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/joiner-epoch-12-avg-2-chunk-16-left-64.onnx', + }, + 'tokens': + './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt', + }, + keywords: 'w én s ēn t è k ǎ s uǒ @文森特卡索\n' + + 'f ǎ g uó @法国' + }; + + return sherpa_onnx.createKws(config); +} + +const kws = createKeywordSpotter(); +const stream = kws.createStream(); +const waveFilename = + './sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/3.wav'; + +const wave = sherpa_onnx.readWave(waveFilename); +stream.acceptWaveform(wave.sampleRate, wave.samples); + +const tailPadding = new Float32Array(wave.sampleRate * 0.4); +stream.acceptWaveform(kws.config.featConfig.sampleRate, tailPadding); + +const detectedKeywords = []; +while (kws.isReady(stream)) { + kws.decode(stream); + const keyword = kws.getResult(stream).keyword; + if (keyword != '') { + detectedKeywords.push(keyword); + } +} +console.log(detectedKeywords); + +stream.free(); +kws.free(); diff --git a/nodejs-examples/test-offline-nemo-ctc.js b/nodejs-examples/test-offline-nemo-ctc.js index fc18d4194..bc0c31b73 100644 --- a/nodejs-examples/test-offline-nemo-ctc.js +++ b/nodejs-examples/test-offline-nemo-ctc.js @@ -7,27 +7,13 @@ const wav = require('wav'); const sherpa_onnx = require('sherpa-onnx'); function createOfflineRecognizer() { - let featConfig = { - sampleRate: 16000, - featureDim: 80, - }; - - let modelConfig = { - nemoCtc: { - model: './sherpa-onnx-nemo-ctc-en-conformer-small/model.int8.onnx', - }, - tokens: './sherpa-onnx-nemo-ctc-en-conformer-small/tokens.txt', - numThreads: 1, - debug: 0, - provider: 'cpu', - modelType: 'nemo_ctc', - }; - let config = { - featConfig: featConfig, - modelConfig: modelConfig, - decodingMethod: 'greedy_search', - maxActivePaths: 4, + modelConfig: { + nemoCtc: { + model: './sherpa-onnx-nemo-ctc-en-conformer-small/model.int8.onnx', + }, + tokens: './sherpa-onnx-nemo-ctc-en-conformer-small/tokens.txt', + } }; return sherpa_onnx.createOfflineRecognizer(config); @@ -38,63 +24,12 @@ const stream = recognizer.createStream(); const waveFilename = './sherpa-onnx-nemo-ctc-en-conformer-small/test_wavs/0.wav'; +const wave = sherpa_onnx.readWave(waveFilename); +stream.acceptWaveform(wave.sampleRate, wave.samples); -const reader = new wav.Reader(); -const readable = new Readable().wrap(reader); -const buf = []; - -reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => { - if (sampleRate != recognizer.config.featConfig.sampleRate) { - throw new Error(`Only support sampleRate ${ - recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`); - } - - if (audioFormat != 1) { - throw new Error(`Only support PCM format. Given ${audioFormat}`); - } - - if (channels != 1) { - throw new Error(`Only a single channel. Given ${channel}`); - } - - if (bitDepth != 16) { - throw new Error(`Only support 16-bit samples. 
Given ${bitDepth}`); - } -}); - -fs.createReadStream(waveFilename, {highWaterMark: 4096}) - .pipe(reader) - .on('finish', function(err) { - // tail padding - const floatSamples = - new Float32Array(recognizer.config.featConfig.sampleRate * 0.5); - - buf.push(floatSamples); - const flattened = - Float32Array.from(buf.reduce((a, b) => [...a, ...b], [])); - - stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened); - recognizer.decode(stream); - const text = recognizer.getResult(stream).text; - console.log(text); - - stream.free(); - recognizer.free(); - }); - -readable.on('readable', function() { - let chunk; - while ((chunk = readable.read()) != null) { - const int16Samples = new Int16Array( - chunk.buffer, chunk.byteOffset, - chunk.length / Int16Array.BYTES_PER_ELEMENT); - - const floatSamples = new Float32Array(int16Samples.length); - - for (let i = 0; i < floatSamples.length; i++) { - floatSamples[i] = int16Samples[i] / 32768.0; - } +recognizer.decode(stream); +const text = recognizer.getResult(stream).text; +console.log(text); - buf.push(floatSamples); - } -}); +stream.free(); +recognizer.free(); diff --git a/nodejs-examples/test-offline-paraformer-itn.js b/nodejs-examples/test-offline-paraformer-itn.js index 58856cc82..100f0cab9 100644 --- a/nodejs-examples/test-offline-paraformer-itn.js +++ b/nodejs-examples/test-offline-paraformer-itn.js @@ -7,27 +7,15 @@ const wav = require('wav'); const sherpa_onnx = require('sherpa-onnx'); function createOfflineRecognizer() { - let featConfig = { - sampleRate: 16000, - featureDim: 80, - }; - let modelConfig = { paraformer: { model: './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx', }, tokens: './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt', - numThreads: 1, - debug: 0, - provider: 'cpu', - modelType: 'paraformer', }; - let config = { - featConfig: featConfig, modelConfig: modelConfig, - decodingMethod: 'greedy_search', // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst ruleFsts: './itn_zh_number.fst', }; @@ -41,62 +29,12 @@ const stream = recognizer.createStream(); // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav const waveFilename = './itn-zh-number.wav'; +const wave = sherpa_onnx.readWave(waveFilename); +stream.acceptWaveform(wave.sampleRate, wave.samples); -const reader = new wav.Reader(); -const readable = new Readable().wrap(reader); -const buf = []; - -reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => { - if (sampleRate != recognizer.config.featConfig.sampleRate) { - throw new Error(`Only support sampleRate ${ - recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`); - } - - if (audioFormat != 1) { - throw new Error(`Only support PCM format. Given ${audioFormat}`); - } - - if (channels != 1) { - throw new Error(`Only a single channel. Given ${channel}`); - } - - if (bitDepth != 16) { - throw new Error(`Only support 16-bit samples. 
Given ${bitDepth}`); - } -}); - -fs.createReadStream(waveFilename, {'highWaterMark': 4096}) - .pipe(reader) - .on('finish', function(err) { - // tail padding - const floatSamples = - new Float32Array(recognizer.config.featConfig.sampleRate * 0.5); - - buf.push(floatSamples); - const flattened = - Float32Array.from(buf.reduce((a, b) => [...a, ...b], [])); - - stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened); - recognizer.decode(stream); - const text = recognizer.getResult(stream).text; - console.log(text); - - stream.free(); - recognizer.free(); - }); - -readable.on('readable', function() { - let chunk; - while ((chunk = readable.read()) != null) { - const int16Samples = new Int16Array( - chunk.buffer, chunk.byteOffset, - chunk.length / Int16Array.BYTES_PER_ELEMENT); - - const floatSamples = new Float32Array(int16Samples.length); - for (let i = 0; i < floatSamples.length; i++) { - floatSamples[i] = int16Samples[i] / 32768.0; - } +recognizer.decode(stream); +const text = recognizer.getResult(stream).text; +console.log(text); - buf.push(floatSamples); - } -}); +stream.free(); +recognizer.free(); diff --git a/nodejs-examples/test-offline-paraformer.js b/nodejs-examples/test-offline-paraformer.js index d9286aaab..9e7b1946a 100644 --- a/nodejs-examples/test-offline-paraformer.js +++ b/nodejs-examples/test-offline-paraformer.js @@ -1,98 +1,32 @@ // Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) -const fs = require('fs'); -const {Readable} = require('stream'); -const wav = require('wav'); - const sherpa_onnx = require('sherpa-onnx'); function createOfflineRecognizer() { - let featConfig = { - sampleRate: 16000, - featureDim: 80, - }; - let modelConfig = { paraformer: { model: './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx', }, tokens: './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt', - numThreads: 1, - debug: 0, - provider: 'cpu', - modelType: 'paraformer', }; let config = { - featConfig: featConfig, modelConfig: modelConfig, - decodingMethod: 'greedy_search', }; return sherpa_onnx.createOfflineRecognizer(config); } - const recognizer = createOfflineRecognizer(); const stream = recognizer.createStream(); const waveFilename = './sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/0.wav'; +const wave = sherpa_onnx.readWave(waveFilename); +stream.acceptWaveform(wave.sampleRate, wave.samples); -const reader = new wav.Reader(); -const readable = new Readable().wrap(reader); -const buf = []; - -reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => { - if (sampleRate != recognizer.config.featConfig.sampleRate) { - throw new Error(`Only support sampleRate ${ - recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`); - } - - if (audioFormat != 1) { - throw new Error(`Only support PCM format. Given ${audioFormat}`); - } - - if (channels != 1) { - throw new Error(`Only a single channel. Given ${channel}`); - } - - if (bitDepth != 16) { - throw new Error(`Only support 16-bit samples. 
Given ${bitDepth}`); - } -}); - -fs.createReadStream(waveFilename, {'highWaterMark': 4096}) - .pipe(reader) - .on('finish', function(err) { - // tail padding - const floatSamples = - new Float32Array(recognizer.config.featConfig.sampleRate * 0.5); - - buf.push(floatSamples); - const flattened = - Float32Array.from(buf.reduce((a, b) => [...a, ...b], [])); - - stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened); - recognizer.decode(stream); - const text = recognizer.getResult(stream).text; - console.log(text); - - stream.free(); - recognizer.free(); - }); - -readable.on('readable', function() { - let chunk; - while ((chunk = readable.read()) != null) { - const int16Samples = new Int16Array( - chunk.buffer, chunk.byteOffset, - chunk.length / Int16Array.BYTES_PER_ELEMENT); - - const floatSamples = new Float32Array(int16Samples.length); - for (let i = 0; i < floatSamples.length; i++) { - floatSamples[i] = int16Samples[i] / 32768.0; - } +recognizer.decode(stream); +const text = recognizer.getResult(stream).text; +console.log(text); - buf.push(floatSamples); - } -}); +stream.free(); +recognizer.free(); diff --git a/nodejs-examples/test-offline-sense-voice.js b/nodejs-examples/test-offline-sense-voice.js index 1c0c8bd01..ff3d829b9 100644 --- a/nodejs-examples/test-offline-sense-voice.js +++ b/nodejs-examples/test-offline-sense-voice.js @@ -1,17 +1,8 @@ // Copyright (c) 2024 Xiaomi Corporation (authors: Fangjun Kuang) -const fs = require('fs'); -const {Readable} = require('stream'); -const wav = require('wav'); - const sherpa_onnx = require('sherpa-onnx'); function createOfflineRecognizer() { - let featConfig = { - sampleRate: 16000, - featureDim: 80, - }; - let modelConfig = { senseVoice: { model: @@ -20,82 +11,26 @@ function createOfflineRecognizer() { useInverseTextNormalization: 1, }, tokens: './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt', - numThreads: 1, - debug: 0, - provider: 'cpu', }; let config = { - featConfig: featConfig, modelConfig: modelConfig, - decodingMethod: 'greedy_search', }; return sherpa_onnx.createOfflineRecognizer(config); } - const recognizer = createOfflineRecognizer(); const stream = recognizer.createStream(); const waveFilename = './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav'; +const wave = sherpa_onnx.readWave(waveFilename); +stream.acceptWaveform(wave.sampleRate, wave.samples); -const reader = new wav.Reader(); -const readable = new Readable().wrap(reader); -const buf = []; - -reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => { - if (sampleRate != recognizer.config.featConfig.sampleRate) { - throw new Error(`Only support sampleRate ${ - recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`); - } - - if (audioFormat != 1) { - throw new Error(`Only support PCM format. Given ${audioFormat}`); - } - - if (channels != 1) { - throw new Error(`Only a single channel. Given ${channel}`); - } - - if (bitDepth != 16) { - throw new Error(`Only support 16-bit samples. 
Given ${bitDepth}`); - } -}); - -fs.createReadStream(waveFilename, {'highWaterMark': 4096}) - .pipe(reader) - .on('finish', function(err) { - // tail padding - const floatSamples = - new Float32Array(recognizer.config.featConfig.sampleRate * 0.5); - - buf.push(floatSamples); - const flattened = - Float32Array.from(buf.reduce((a, b) => [...a, ...b], [])); - - stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened); - recognizer.decode(stream); - const text = recognizer.getResult(stream).text; - console.log(text); - - stream.free(); - recognizer.free(); - }); - -readable.on('readable', function() { - let chunk; - while ((chunk = readable.read()) != null) { - const int16Samples = new Int16Array( - chunk.buffer, chunk.byteOffset, - chunk.length / Int16Array.BYTES_PER_ELEMENT); - - const floatSamples = new Float32Array(int16Samples.length); - for (let i = 0; i < floatSamples.length; i++) { - floatSamples[i] = int16Samples[i] / 32768.0; - } +recognizer.decode(stream); +const text = recognizer.getResult(stream).text; +console.log(text); - buf.push(floatSamples); - } -}); +stream.free(); +recognizer.free(); diff --git a/nodejs-examples/test-offline-transducer.js b/nodejs-examples/test-offline-transducer.js index fddfa5890..a7e101373 100644 --- a/nodejs-examples/test-offline-transducer.js +++ b/nodejs-examples/test-offline-transducer.js @@ -1,17 +1,8 @@ // Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) // -const fs = require('fs'); -const {Readable} = require('stream'); -const wav = require('wav'); - const sherpa_onnx = require('sherpa-onnx'); function createOfflineRecognizer() { - let featConfig = { - sampleRate: 16000, - featureDim: 80, - }; - let modelConfig = { transducer: { encoder: @@ -22,19 +13,11 @@ function createOfflineRecognizer() { './sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.int8.onnx', }, tokens: './sherpa-onnx-zipformer-en-2023-06-26/tokens.txt', - numThreads: 1, - debug: 0, - provider: 'cpu', modelType: 'transducer', }; let config = { - featConfig: featConfig, modelConfig: modelConfig, - decodingMethod: 'greedy_search', - maxActivePaths: 4, - hotwordsFile: '', - hotwordsScore: 1.5, }; return sherpa_onnx.createOfflineRecognizer(config); @@ -43,62 +26,12 @@ const recognizer = createOfflineRecognizer(); const stream = recognizer.createStream(); const waveFilename = './sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav'; +const wave = sherpa_onnx.readWave(waveFilename); +stream.acceptWaveform(wave.sampleRate, wave.samples); -const reader = new wav.Reader(); -const readable = new Readable().wrap(reader); -const buf = []; - -reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => { - if (sampleRate != recognizer.config.featConfig.sampleRate) { - throw new Error(`Only support sampleRate ${ - recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`); - } - - if (audioFormat != 1) { - throw new Error(`Only support PCM format. Given ${audioFormat}`); - } - - if (channels != 1) { - throw new Error(`Only a single channel. Given ${channel}`); - } - - if (bitDepth != 16) { - throw new Error(`Only support 16-bit samples. 
Given ${bitDepth}`); - } -}); - -fs.createReadStream(waveFilename, {'highWaterMark': 4096}) - .pipe(reader) - .on('finish', function(err) { - // tail padding - const floatSamples = - new Float32Array(recognizer.config.featConfig.sampleRate * 0.5); - - buf.push(floatSamples); - const flattened = - Float32Array.from(buf.reduce((a, b) => [...a, ...b], [])); - - stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened); - recognizer.decode(stream); - const text = recognizer.getResult(stream).text; - console.log(text); - - stream.free(); - recognizer.free(); - }); - -readable.on('readable', function() { - let chunk; - while ((chunk = readable.read()) != null) { - const int16Samples = new Int16Array( - chunk.buffer, chunk.byteOffset, - chunk.length / Int16Array.BYTES_PER_ELEMENT); - - const floatSamples = new Float32Array(int16Samples.length); - for (let i = 0; i < floatSamples.length; i++) { - floatSamples[i] = int16Samples[i] / 32768.0; - } +recognizer.decode(stream); +const text = recognizer.getResult(stream).text; +console.log(text); - buf.push(floatSamples); - } -}); +stream.free(); +recognizer.free(); diff --git a/nodejs-examples/test-offline-whisper.js b/nodejs-examples/test-offline-whisper.js index a8a90fb72..c4b08427a 100644 --- a/nodejs-examples/test-offline-whisper.js +++ b/nodejs-examples/test-offline-whisper.js @@ -1,17 +1,8 @@ // Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) // -const fs = require('fs'); -const {Readable} = require('stream'); -const wav = require('wav'); - const sherpa_onnx = require('sherpa-onnx'); function createOfflineRecognizer() { - let featConfig = { - sampleRate: 16000, - featureDim: 80, - }; - let modelConfig = { whisper: { encoder: './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx', @@ -21,83 +12,25 @@ function createOfflineRecognizer() { tailPaddings: -1, }, tokens: './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt', - numThreads: 1, - debug: 0, - provider: 'cpu', - modelType: 'whisper', }; let config = { - featConfig: featConfig, modelConfig: modelConfig, - decodingMethod: 'greedy_search', }; return sherpa_onnx.createOfflineRecognizer(config); } - recognizer = createOfflineRecognizer(); stream = recognizer.createStream(); const waveFilename = './sherpa-onnx-whisper-tiny.en/test_wavs/0.wav'; +const wave = sherpa_onnx.readWave(waveFilename); +stream.acceptWaveform(wave.sampleRate, wave.samples); -const reader = new wav.Reader(); -const readable = new Readable().wrap(reader); -const buf = []; - -reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => { - if (sampleRate != recognizer.config.featConfig.sampleRate) { - throw new Error(`Only support sampleRate ${ - recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`); - } - - if (audioFormat != 1) { - throw new Error(`Only support PCM format. Given ${audioFormat}`); - } - - if (channels != 1) { - throw new Error(`Only a single channel. Given ${channel}`); - } - - if (bitDepth != 16) { - throw new Error(`Only support 16-bit samples. 
Given ${bitDepth}`); - } -}); - -fs.createReadStream(waveFilename, {'highWaterMark': 4096}) - .pipe(reader) - .on('finish', function(err) { - // tail padding - const floatSamples = - new Float32Array(recognizer.config.featConfig.sampleRate * 0.5); - - buf.push(floatSamples); - const flattened = - Float32Array.from(buf.reduce((a, b) => [...a, ...b], [])); - - stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened); - recognizer.decode(stream); - const text = recognizer.getResult(stream).text; - console.log(text); - - stream.free(); - recognizer.free(); - }); - -readable.on('readable', function() { - let chunk; - while ((chunk = readable.read()) != null) { - const int16Samples = new Int16Array( - chunk.buffer, chunk.byteOffset, - chunk.length / Int16Array.BYTES_PER_ELEMENT); - - const floatSamples = new Float32Array(int16Samples.length); - - for (let i = 0; i < floatSamples.length; i++) { - floatSamples[i] = int16Samples[i] / 32768.0; - } +recognizer.decode(stream); +const text = recognizer.getResult(stream).text; +console.log(text); - buf.push(floatSamples); - } -}); +stream.free(); +recognizer.free(); diff --git a/nodejs-examples/test-online-paraformer-microphone.js b/nodejs-examples/test-online-paraformer-microphone.js index a8fb596f6..95ca9fa6b 100644 --- a/nodejs-examples/test-online-paraformer-microphone.js +++ b/nodejs-examples/test-online-paraformer-microphone.js @@ -16,22 +16,10 @@ function createOnlineRecognizer() { let onlineModelConfig = { paraformer: onlineParaformerModelConfig, tokens: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt', - numThreads: 1, - provider: 'cpu', - debug: 1, - modelType: 'paraformer', - }; - - let featureConfig = { - sampleRate: 16000, - featureDim: 80, }; let recognizerConfig = { - featConfig: featureConfig, modelConfig: onlineModelConfig, - decodingMethod: 'greedy_search', - maxActivePaths: 4, enableEndpoint: 1, rule1MinTrailingSilence: 2.4, rule2MinTrailingSilence: 1.2, diff --git a/nodejs-examples/test-online-paraformer.js b/nodejs-examples/test-online-paraformer.js index ff6cdc0f2..65eb91954 100644 --- a/nodejs-examples/test-online-paraformer.js +++ b/nodejs-examples/test-online-paraformer.js @@ -17,26 +17,10 @@ function createOnlineRecognizer() { let onlineModelConfig = { paraformer: onlineParaformerModelConfig, tokens: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt', - numThreads: 1, - provider: 'cpu', - debug: 1, - modelType: 'paraformer', - }; - - let featureConfig = { - sampleRate: 16000, - featureDim: 80, }; let recognizerConfig = { - featConfig: featureConfig, modelConfig: onlineModelConfig, - decodingMethod: 'greedy_search', - maxActivePaths: 4, - enableEndpoint: 1, - rule1MinTrailingSilence: 2.4, - rule2MinTrailingSilence: 1.2, - rule3MinUtteranceLength: 20, }; return sherpa_onnx.createOnlineRecognizer(recognizerConfig); diff --git a/nodejs-examples/test-online-transducer.js b/nodejs-examples/test-online-transducer.js index 96d66a840..5aec27de0 100644 --- a/nodejs-examples/test-online-transducer.js +++ b/nodejs-examples/test-online-transducer.js @@ -20,26 +20,10 @@ function createOnlineRecognizer() { transducer: onlineTransducerModelConfig, tokens: './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt', - numThreads: 1, - provider: 'cpu', - debug: 1, - modelType: 'zipformer', - }; - - let featureConfig = { - sampleRate: 16000, - featureDim: 80, }; let recognizerConfig = { - featConfig: featureConfig, modelConfig: onlineModelConfig, - decodingMethod: 'greedy_search', - 
maxActivePaths: 4,
-    enableEndpoint: 1,
-    rule1MinTrailingSilence: 2.4,
-    rule2MinTrailingSilence: 1.2,
-    rule3MinUtteranceLength: 20,
   };
 
   return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
diff --git a/nodejs-examples/test-vad-with-non-streaming-asr-whisper.js b/nodejs-examples/test-vad-with-non-streaming-asr-whisper.js
new file mode 100644
index 000000000..e84c3ab11
--- /dev/null
+++ b/nodejs-examples/test-vad-with-non-streaming-asr-whisper.js
@@ -0,0 +1,124 @@
+// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
+
+const sherpa_onnx = require('sherpa-onnx');
+
+function createRecognizer() {
+  // Please download test files from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+  const config = {
+    'modelConfig': {
+      'whisper': {
+        'encoder': './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx',
+        'decoder': './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx',
+        'tailPaddings': 2000,
+      },
+      'tokens': './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt',
+      'debug': 0,
+    }
+  };
+
+  return sherpa_onnx.createOfflineRecognizer(config);
+}
+
+function createVad() {
+  // please download silero_vad.onnx from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+  const config = {
+    sileroVad: {
+      model: './silero_vad.onnx',
+      threshold: 0.5,
+      minSpeechDuration: 0.25,
+      minSilenceDuration: 0.5,
+      windowSize: 512,
+    },
+    sampleRate: 16000,
+    debug: true,
+    numThreads: 1,
+    bufferSizeInSeconds: 60,
+  };
+
+  return sherpa_onnx.createVad(config);
+}
+
+const recognizer = createRecognizer();
+const vad = createVad();
+
+// please download ./Obama.wav from
+// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+const waveFilename = './Obama.wav';
+const wave = sherpa_onnx.readWave(waveFilename);
+
+if (wave.sampleRate != recognizer.config.featConfig.sampleRate) {
+  throw new Error(
+      `Expected sample rate: ${recognizer.config.featConfig.sampleRate}. Given: ${wave.sampleRate}`);
+}
+
+console.log('Started')
+let start = Date.now();
+
+const windowSize = vad.config.sileroVad.windowSize;
+for (let i = 0; i < wave.samples.length; i += windowSize) {
+  const thisWindow = wave.samples.subarray(i, i + windowSize);
+  vad.acceptWaveform(thisWindow);
+
+  while (!vad.isEmpty()) {
+    const segment = vad.front();
+    vad.pop();
+
+    let start_time = segment.start / wave.sampleRate;
+    let end_time = start_time + segment.samples.length / wave.sampleRate;
+
+    start_time = start_time.toFixed(2);
+    end_time = end_time.toFixed(2);
+
+    const stream = recognizer.createStream();
+    stream.acceptWaveform(wave.sampleRate, segment.samples);
+
+    recognizer.decode(stream);
+    const r = recognizer.getResult(stream);
+    if (r.text.length > 0) {
+      const text = r.text.toLowerCase().trim();
+      console.log(`${start_time} -- ${end_time}: ${text}`);
+    }
+
+    stream.free();
+  }
+}
+
+vad.flush();
+
+while (!vad.isEmpty()) {
+  const segment = vad.front();
+  vad.pop();
+
+  let start_time = segment.start / wave.sampleRate;
+  let end_time = start_time + segment.samples.length / wave.sampleRate;
+
+  start_time = start_time.toFixed(2);
+  end_time = end_time.toFixed(2);
+
+  const stream = recognizer.createStream();
+  stream.acceptWaveform(wave.sampleRate, segment.samples);
+
+  recognizer.decode(stream);
+  const r = recognizer.getResult(stream);
+  if (r.text.length > 0) {
+    const text = r.text.toLowerCase().trim();
+    console.log(`${start_time} -- ${end_time}: ${text}`);
+  }
+}
+
+let stop = Date.now();
+console.log('Done')
+
+const elapsed_seconds = (stop - start) / 1000;
+const duration = wave.samples.length / wave.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
+
+vad.free();
+recognizer.free();
diff --git a/scripts/nodejs/.gitignore b/scripts/nodejs/.gitignore
index d84de665b..075c52da9 100644
--- a/scripts/nodejs/.gitignore
+++ b/scripts/nodejs/.gitignore
@@ -1,2 +1,4 @@
 node_modules
 jslint.mjs
+sherpa-onnx-*.js
+sherpa-onnx-*.wasm
diff --git a/scripts/nodejs/index.js b/scripts/nodejs/index.js
index 3163c0ab3..3f0789edb 100644
--- a/scripts/nodejs/index.js
+++ b/scripts/nodejs/index.js
@@ -4,6 +4,9 @@ const wasmModule = require('./sherpa-onnx-wasm-nodejs.js')();
 
 const sherpa_onnx_asr = require('./sherpa-onnx-asr.js');
 const sherpa_onnx_tts = require('./sherpa-onnx-tts.js');
+const sherpa_onnx_kws = require('./sherpa-onnx-kws.js');
+const sherpa_onnx_wave = require('./sherpa-onnx-wave.js');
+const sherpa_onnx_vad = require('./sherpa-onnx-vad.js');
 
 function createOnlineRecognizer(config) {
   return sherpa_onnx_asr.createOnlineRecognizer(wasmModule, config);
@@ -17,10 +20,35 @@ function createOfflineTts(config) {
   return sherpa_onnx_tts.createOfflineTts(wasmModule, config);
 }
 
+function createKws(config) {
+  return sherpa_onnx_kws.createKws(wasmModule, config);
+}
+
+function createCircularBuffer(capacity) {
+  return new sherpa_onnx_vad.CircularBuffer(capacity, wasmModule);
+}
+
+function createVad(config) {
+  return sherpa_onnx_vad.createVad(wasmModule, config);
+}
+
+function readWave(filename) {
+  return sherpa_onnx_wave.readWave(filename, wasmModule);
+}
+
+function writeWave(filename, data) {
+  sherpa_onnx_wave.writeWave(filename, data, wasmModule);
+}
+
 // Note: online means streaming and offline means non-streaming here.
// Both of them don't require internet connection. module.exports = { createOnlineRecognizer, createOfflineRecognizer, createOfflineTts, + createKws, + readWave, + writeWave, + createCircularBuffer, + createVad, }; diff --git a/wasm/asr/sherpa-onnx-asr.js b/wasm/asr/sherpa-onnx-asr.js index 71848a7a9..f0b8bb778 100644 --- a/wasm/asr/sherpa-onnx-asr.js +++ b/wasm/asr/sherpa-onnx-asr.js @@ -546,7 +546,7 @@ function initSherpaOnnxOfflineWhisperModelConfig(config, Module) { Module.setValue(ptr + 12, buffer + offset, 'i8*'); offset += taskLen; - Module.setValue(ptr + 16, config.tailPaddings || -1, 'i32'); + Module.setValue(ptr + 16, config.tailPaddings || 2000, 'i32'); return { buffer: buffer, ptr: ptr, len: len, diff --git a/wasm/kws/sherpa-onnx-kws.js b/wasm/kws/sherpa-onnx-kws.js index 23d2bb361..c9fd7cb6f 100644 --- a/wasm/kws/sherpa-onnx-kws.js +++ b/wasm/kws/sherpa-onnx-kws.js @@ -69,13 +69,14 @@ function initModelConfig(config, Module) { const len = transducer.len + paraformer_len + ctc_len + 7 * 4; const ptr = Module._malloc(len); + Module.HEAPU8.fill(0, ptr, ptr + len); let offset = 0; Module._CopyHeap(transducer.ptr, transducer.len, ptr + offset); const tokensLen = Module.lengthBytesUTF8(config.tokens) + 1; - const providerLen = Module.lengthBytesUTF8(config.provider) + 1; - const modelTypeLen = Module.lengthBytesUTF8(config.modelType) + 1; + const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1; + const modelTypeLen = Module.lengthBytesUTF8(config.modelType || '') + 1; const modelingUnitLen = Module.lengthBytesUTF8(config.modelingUnit || '') + 1; const bpeVocabLen = Module.lengthBytesUTF8(config.bpeVocab || '') + 1; const bufferLen = @@ -86,10 +87,10 @@ function initModelConfig(config, Module) { Module.stringToUTF8(config.tokens, buffer, tokensLen); offset += tokensLen; - Module.stringToUTF8(config.provider, buffer + offset, providerLen); + Module.stringToUTF8(config.provider || 'cpu', buffer + offset, providerLen); offset += providerLen; - Module.stringToUTF8(config.modelType, buffer + offset, modelTypeLen); + Module.stringToUTF8(config.modelType || '', buffer + offset, modelTypeLen); offset += modelTypeLen; Module.stringToUTF8( @@ -103,7 +104,7 @@ function initModelConfig(config, Module) { Module.setValue(ptr + offset, buffer, 'i8*'); // tokens offset += 4; - Module.setValue(ptr + offset, config.numThreads, 'i32'); + Module.setValue(ptr + offset, config.numThreads || 1, 'i32'); offset += 4; Module.setValue(ptr + offset, buffer + tokensLen, 'i8*'); // provider @@ -134,14 +135,21 @@ function initModelConfig(config, Module) { function initFeatureExtractorConfig(config, Module) { let ptr = Module._malloc(4 * 2); - Module.setValue(ptr, config.samplingRate, 'i32'); - Module.setValue(ptr + 4, config.featureDim, 'i32'); + Module.setValue(ptr, config.samplingRate || 16000, 'i32'); + Module.setValue(ptr + 4, config.featureDim || 80, 'i32'); return { ptr: ptr, len: 8, } } function initKwsConfig(config, Module) { + if (!('featConfig' in config)) { + config.featConfig = { + sampleRate: 16000, + featureDim: 80, + }; + } + let featConfig = initFeatureExtractorConfig(config.featConfig, Module); let modelConfig = initModelConfig(config.modelConfig, Module); @@ -155,16 +163,16 @@ function initKwsConfig(config, Module) { Module._CopyHeap(modelConfig.ptr, modelConfig.len, ptr + offset) offset += modelConfig.len; - Module.setValue(ptr + offset, config.maxActivePaths, 'i32'); + Module.setValue(ptr + offset, config.maxActivePaths || 4, 'i32'); offset += 4; - Module.setValue(ptr + 
offset, config.numTrailingBlanks, 'i32'); + Module.setValue(ptr + offset, config.numTrailingBlanks || 1, 'i32'); offset += 4; - Module.setValue(ptr + offset, config.keywordsScore, 'float'); + Module.setValue(ptr + offset, config.keywordsScore || 1.0, 'float'); offset += 4; - Module.setValue(ptr + offset, config.keywordsThreshold, 'float'); + Module.setValue(ptr + offset, config.keywordsThreshold || 0.25, 'float'); offset += 4; let keywordsLen = Module.lengthBytesUTF8(config.keywords) + 1; diff --git a/wasm/nodejs/CMakeLists.txt b/wasm/nodejs/CMakeLists.txt index ffc005065..4efc879a1 100644 --- a/wasm/nodejs/CMakeLists.txt +++ b/wasm/nodejs/CMakeLists.txt @@ -49,6 +49,32 @@ set(exported_functions SherpaOnnxDestroyKeywordSpotter SherpaOnnxGetKeywordResult SherpaOnnxIsKeywordStreamReady + # VAD + SherpaOnnxCreateCircularBuffer + SherpaOnnxDestroyCircularBuffer + SherpaOnnxCircularBufferPush + SherpaOnnxCircularBufferGet + SherpaOnnxCircularBufferFree + SherpaOnnxCircularBufferPop + SherpaOnnxCircularBufferSize + SherpaOnnxCircularBufferHead + SherpaOnnxCircularBufferReset + SherpaOnnxCreateVoiceActivityDetector + SherpaOnnxDestroyVoiceActivityDetector + SherpaOnnxVoiceActivityDetectorAcceptWaveform + SherpaOnnxVoiceActivityDetectorEmpty + SherpaOnnxVoiceActivityDetectorDetected + SherpaOnnxVoiceActivityDetectorPop + SherpaOnnxVoiceActivityDetectorClear + SherpaOnnxVoiceActivityDetectorFront + SherpaOnnxDestroySpeechSegment + SherpaOnnxVoiceActivityDetectorReset + SherpaOnnxVoiceActivityDetectorFlush + # + SherpaOnnxFileExists + SherpaOnnxReadWave + SherpaOnnxFreeWave + SherpaOnnxWriteWave ) @@ -82,6 +108,8 @@ install( ${CMAKE_SOURCE_DIR}/wasm/asr/sherpa-onnx-asr.js ${CMAKE_SOURCE_DIR}/wasm/tts/sherpa-onnx-tts.js ${CMAKE_SOURCE_DIR}/wasm/kws/sherpa-onnx-kws.js + ${CMAKE_SOURCE_DIR}/wasm/vad/sherpa-onnx-vad.js + ${CMAKE_SOURCE_DIR}/wasm/nodejs/sherpa-onnx-wave.js "$/sherpa-onnx-wasm-nodejs.js" "$/sherpa-onnx-wasm-nodejs.wasm" DESTINATION diff --git a/wasm/nodejs/sherpa-onnx-wave.js b/wasm/nodejs/sherpa-onnx-wave.js new file mode 100644 index 000000000..af87efc35 --- /dev/null +++ b/wasm/nodejs/sherpa-onnx-wave.js @@ -0,0 +1,57 @@ +// return an object +// { +// samples: a float32 array +// sampleRate: an integer +// } +function readWave(filename, Module) { + const filenameLen = Module.lengthBytesUTF8(filename) + 1; + const pFilename = Module._malloc(filenameLen); + Module.stringToUTF8(filename, pFilename, filenameLen); + + const w = Module._SherpaOnnxReadWave(pFilename); + Module._free(pFilename); + + + const samplesPtr = Module.HEAP32[w / 4] / 4; + const sampleRate = Module.HEAP32[w / 4 + 1]; + const numSamples = Module.HEAP32[w / 4 + 2]; + + const samples = new Float32Array(numSamples); + for (let i = 0; i < numSamples; i++) { + samples[i] = Module.HEAPF32[samplesPtr + i]; + } + + Module._SherpaOnnxFreeWave(w); + + + return {samples: samples, sampleRate: sampleRate}; +} + +// data is an object +// { +// samples: a float32 array +// sampleRate: an integer +// } +function writeWave(filename, data, Module) { + const pSamples = + Module._malloc(data.samples.length * data.samples.BYTES_PER_ELEMENT); + Module.HEAPF32.set(data.samples, pSamples / data.samples.BYTES_PER_ELEMENT); + + const filenameLen = Module.lengthBytesUTF8(filename) + 1; + const pFilename = Module._malloc(filenameLen); + Module.stringToUTF8(filename, pFilename, filenameLen); + + Module._SherpaOnnxWriteWave( + pSamples, data.samples.length, data.sampleRate, pFilename); + + Module._free(pFilename); + Module._free(pSamples); +} + +if 
(typeof process == 'object' && typeof process.versions == 'object' && + typeof process.versions.node == 'string') { + module.exports = { + readWave, + writeWave, + }; +}
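
The new scripts/nodejs/index.js exports both readWave() and writeWave(), but only readWave() is exercised by the examples in this patch. The round-trip sketch below uses just those two exports; it assumes the `sherpa-onnx` npm package built from this patch, and './test.wav' is a placeholder file name, not a file shipped with the package.

// wave-roundtrip-sketch.js -- a minimal sketch; './test.wav' is hypothetical.
const sherpa_onnx = require('sherpa-onnx');

// readWave() returns {samples: Float32Array, sampleRate: number}, the shape
// that sherpa-onnx-wave.js unpacks from the SherpaOnnxReadWave() C struct.
const wave = sherpa_onnx.readWave('./test.wav');
console.log('read', wave.samples.length, 'samples at', wave.sampleRate, 'Hz');

// writeWave() accepts the same shape of object and writes it back to disk.
sherpa_onnx.writeWave(
    './test-copy.wav', {samples: wave.samples, sampleRate: wave.sampleRate});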
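Combining the new createVad() and writeWave() exports gives a small utility that the examples above stop short of: saving each detected speech segment to its own file. This is a sketch under the same assumptions as nodejs-examples/test-vad-with-non-streaming-asr-whisper.js (silero_vad.onnx and Obama.wav downloaded beforehand); the segment-N.wav output names are placeholders.

// vad-segments-sketch.js -- a sketch built on the APIs added in this patch.
const sherpa_onnx = require('sherpa-onnx');

const vad = sherpa_onnx.createVad({
  sileroVad: {
    model: './silero_vad.onnx',
    threshold: 0.5,
    minSpeechDuration: 0.25,
    minSilenceDuration: 0.5,
    windowSize: 512,
  },
  sampleRate: 16000,
  numThreads: 1,
  bufferSizeInSeconds: 60,
});

const wave = sherpa_onnx.readWave('./Obama.wav');

let numSegments = 0;

// Write one wav file per speech segment currently queued inside the VAD.
function drain() {
  while (!vad.isEmpty()) {
    const segment = vad.front();
    vad.pop();
    sherpa_onnx.writeWave(
        `segment-${numSegments}.wav`,
        {samples: segment.samples, sampleRate: wave.sampleRate});
    numSegments += 1;
  }
}

const windowSize = vad.config.sileroVad.windowSize;
for (let i = 0; i < wave.samples.length; i += windowSize) {
  vad.acceptWaveform(wave.samples.subarray(i, i + windowSize));
  drain();
}

vad.flush();  // force out a segment still in progress at the end of the file
drain();

vad.free();
console.log('wrote', numSegments, 'segments');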
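index.js also exports createCircularBuffer(), backed by the SherpaOnnxCircularBuffer* functions listed in wasm/nodejs/CMakeLists.txt above. Since wasm/vad/sherpa-onnx-vad.js itself is not part of this diff, the wrapper methods used below (push, get, pop, size, head, reset, free) are an assumption that the JS class mirrors those exported C functions one-to-one; treat this as a sketch, not a reference.

// circular-buffer-sketch.js -- method names assumed to mirror the exported
// C functions; sherpa-onnx-vad.js is not shown in this patch.
const sherpa_onnx = require('sherpa-onnx');

// enough room for 10 seconds of 16 kHz audio
const buffer = sherpa_onnx.createCircularBuffer(10 * 16000);

const chunk = new Float32Array(512);   // e.g. one VAD window of samples
buffer.push(chunk);                    // append samples to the buffer
console.log('size:', buffer.size());   // samples currently stored

// copy 256 samples starting at the oldest unread index, then discard them
const samples = buffer.get(buffer.head(), 256);
buffer.pop(256);

buffer.reset();  // drop any remaining samples
buffer.free();   // release the underlying C object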