Skip to content

Commit

Permalink
JavaScript API (node-addon) for speaker diarization (#1408)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Oct 10, 2024
1 parent a45e5db commit 67349b5
Show file tree
Hide file tree
Showing 11 changed files with 443 additions and 13 deletions.
22 changes: 16 additions & 6 deletions .github/scripts/node-addon/package-optional.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "sherpa-onnx-PLATFORM2-ARCH",
"version": "SHERPA_ONNX_VERSION",
"description": "Speech-to-text and text-to-speech using Next-gen Kaldi without internet connection",
"description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
Expand All @@ -16,8 +16,18 @@
"transcription",
"real-time speech recognition",
"without internet connection",
"locally",
"local",
"embedded systems",
"open source",
"diarization",
"speaker diarization",
"speaker recognition",
"speaker",
"speaker segmentation",
"speaker verification",
"spoken language identification",
"sherpa",
"zipformer",
"asr",
"tts",
Expand All @@ -30,13 +40,13 @@
"offline",
"privacy",
"open source",
"vad",
"speaker id",
"language id",
"node-addon-api",
"streaming speech recognition",
"speech",
"recognition"
"recognition",
"vad",
"node-addon-api",
"speaker id",
"language id"
],
"author": "The next-gen Kaldi team",
"license": "Apache-2.0",
Expand Down
22 changes: 16 additions & 6 deletions .github/scripts/node-addon/package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "sherpa-onnx-node",
"version": "SHERPA_ONNX_VERSION",
"description": "Speech-to-text and text-to-speech using Next-gen Kaldi without internet connection",
"description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection",
"main": "sherpa-onnx.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
Expand All @@ -16,8 +16,18 @@
"transcription",
"real-time speech recognition",
"without internet connection",
"locally",
"local",
"embedded systems",
"open source",
"diarization",
"speaker diarization",
"speaker recognition",
"speaker",
"speaker segmentation",
"speaker verification",
"spoken language identification",
"sherpa",
"zipformer",
"asr",
"tts",
Expand All @@ -30,13 +40,13 @@
"offline",
"privacy",
"open source",
"vad",
"speaker id",
"language id",
"node-addon-api",
"streaming speech recognition",
"speech",
"recognition"
"recognition",
"vad",
"node-addon-api",
"speaker id",
"language id"
],
"author": "The next-gen Kaldi team",
"license": "Apache-2.0",
Expand Down
14 changes: 14 additions & 0 deletions .github/scripts/test-nodejs-addon-npm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,20 @@ arch=$(node -p "require('os').arch()")
platform=$(node -p "require('os').platform()")
node_version=$(node -p "process.versions.node.split('.')[0]")

# Speaker diarization example: download the models and the test audio, run
# the Node.js example script, then delete everything that was downloaded.
echo "----------non-streaming speaker diarization----------"

# Speaker segmentation model: fetch the archive, unpack it, and remove the
# tarball once it has been extracted.
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2

# Speaker embedding model used by the example script.
# NOTE(review): "recongition" in the URL below is spelled the same way in
# every place this URL appears; presumably it is the actual release tag, so
# verify before "correcting" it.
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx

# Test wave file that the example script processes.
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav

node ./test_offline_speaker_diarization.js

# Clean up the downloaded model files, wave file, and extracted directory so
# they do not linger for the tests that follow.
rm -rfv *.onnx *.wav sherpa-onnx-pyannote-*

echo "----------non-streaming asr + vad----------"
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
Expand Down
21 changes: 21 additions & 0 deletions nodejs-addon-examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@ export LD_LIBRARY_PATH=$PWD/node_modules/.pnpm/sherpa-onnx-node@<REPLACE-THIS-WI

The following tables list the examples in this folder.

## Speaker diarization

|File| Description|
|---|---|
|[./test_offline_speaker_diarization.js](./test_offline_speaker_diarization.js)| It demonstrates how to use the sherpa-onnx JavaScript API for speaker diarization. It supports speaker segmentation models from [pyannote-audio](https://github.com/pyannote/pyannote-audio).|

## Add punctuations to text

|File| Description|
Expand Down Expand Up @@ -130,6 +136,21 @@ The following tables list the examples in this folder.
|[./test_tts_non_streaming_vits_zh_aishell3.js](./test_tts_non_streaming_vits_zh_aishell3.js)| Text-to-speech with a Chinese TTS model|


### Speaker diarization

```bash

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav

node ./test_offline_speaker_diarization.js
```

### Voice Activity detection (VAD)

```bash
Expand Down
62 changes: 62 additions & 0 deletions nodejs-addon-examples/test_offline_speaker_diarization.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Copyright (c) 2024 Xiaomi Corporation
const sherpa_onnx = require('sherpa-onnx-node');

// clang-format off
/* Please use the following commands to download files
used in this script
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
*/
// clang-format on

// Configuration for offline speaker diarization: which segmentation and
// speaker embedding models to load, how to cluster speakers, and how to
// post-process the resulting segments.
const config = {
  segmentation: {
    pyannote: {
      model: './sherpa-onnx-pyannote-segmentation-3-0/model.onnx',
    },
  },
  embedding: {
    model: './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx',
  },
  clustering: {
    // Since we know that the test wave file
    // ./0-four-speakers-zh.wav contains 4 speakers, we use 4 for numClusters
    // here. If you don't have such information, please set numClusters to -1.
    numClusters: 4,

    // If numClusters is not -1, then threshold is ignored.
    //
    // A larger threshold leads to fewer clusters, i.e., fewer speakers.
    // A smaller threshold leads to more clusters, i.e., more speakers.
    // You need to tune it by yourself.
    threshold: 0.5,
  },

  // If a segment is shorter than minDurationOn, we discard it.
  minDurationOn: 0.2,  // in seconds

  // If the gap between two segments is less than minDurationOff, then we
  // merge these two segments into a single one.
  minDurationOff: 0.5,  // in seconds
};

const waveFilename = './0-four-speakers-zh.wav';

const sd = new sherpa_onnx.OfflineSpeakerDiarization(config);
console.log('Started');

const wave = sherpa_onnx.readWave(waveFilename);

// The diarizer assumes the input samples are at its own sample rate, so bail
// out early on a mismatch. Strict inequality avoids any type coercion when
// comparing the two rates.
if (sd.sampleRate !== wave.sampleRate) {
  throw new Error(
      `Expected sample rate: ${sd.sampleRate}, given: ${wave.sampleRate}`);
}

// Run diarization over the whole file and print the returned segments.
const segments = sd.process(wave.samples);
console.log(segments);
1 change: 1 addition & 0 deletions scripts/node-addon-api/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ set(srcs
src/audio-tagging.cc
src/keyword-spotting.cc
src/non-streaming-asr.cc
src/non-streaming-speaker-diarization.cc
src/non-streaming-tts.cc
src/punctuation.cc
src/sherpa-onnx-node-addon-api.cc
Expand Down
32 changes: 32 additions & 0 deletions scripts/node-addon-api/lib/non-streaming-speaker-diarization.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
const addon = require('./addon.js');

/**
 * Thin JavaScript wrapper around the native offline (non-streaming) speaker
 * diarization functions exposed by the addon.
 */
class OfflineSpeakerDiarization {
  /**
   * Creates the native diarization object from `config` and caches the sample
   * rate reported by the addon for it.
   *
   * @param {object} config Configuration passed through unchanged to the
   *     addon; it is also kept on the instance as `this.config`.
   */
  constructor(config) {
    const nativeHandle = addon.createOfflineSpeakerDiarization(config);

    this.handle = nativeHandle;
    this.config = config;
    this.sampleRate =
        addon.getOfflineSpeakerDiarizationSampleRate(nativeHandle);
  }

  /**
   * Runs speaker diarization on a complete utterance.
   *
   * @param samples A 1-d float32 array. Each element should be in the range
   *     [-1, 1]. Its sample rate is assumed to equal this.sampleRate.
   * @returns An array of objects, where each object has the form
   *
   *   {
   *     "start": start_time_in_seconds,
   *     "end": end_time_in_seconds,
   *     "speaker": an_integer,
   *   }
   */
  process(samples) {
    const segments =
        addon.offlineSpeakerDiarizationProcess(this.handle, samples);
    return segments;
  }
}

// Public interface of this module: only the OfflineSpeakerDiarization class
// is exported.
module.exports = {
OfflineSpeakerDiarization,
}
2 changes: 2 additions & 0 deletions scripts/node-addon-api/lib/sherpa-onnx.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ const sid = require('./speaker-identification.js');
const at = require('./audio-tagg.js');
const punct = require('./punctuation.js');
const kws = require('./keyword-spotter.js');
const sd = require('./non-streaming-speaker-diarization.js');

module.exports = {
OnlineRecognizer: streaming_asr.OnlineRecognizer,
Expand All @@ -24,4 +25,5 @@ module.exports = {
AudioTagging: at.AudioTagging,
Punctuation: punct.Punctuation,
KeywordSpotter: kws.KeywordSpotter,
OfflineSpeakerDiarization: sd.OfflineSpeakerDiarization,
}
12 changes: 11 additions & 1 deletion scripts/node-addon-api/package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"main": "lib/sherpa-onnx.js",
"version": "1.0.0",
"description": "Speech-to-text and text-to-speech using Next-gen Kaldi without internet connection",
"description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection",
"dependencies": {
"cmake-js": "^6.0.0",
"node-addon-api": "^1.1.0",
Expand All @@ -21,8 +21,18 @@
"transcription",
"real-time speech recognition",
"without internet connection",
"locally",
"local",
"embedded systems",
"open source",
"diarization",
"speaker diarization",
"speaker recognition",
"speaker",
"speaker segmentation",
"speaker verification",
"spoken language identification",
"sherpa",
"zipformer",
"asr",
"tts",
Expand Down
Loading

0 comments on commit 67349b5

Please sign in to comment.