JavaScript API (node-addon) for speaker diarization (#1408)

k2-fsa · Oct 10, 2024 · 67349b5 · 67349b5
1 parent a45e5db
commit 67349b5
Show file tree

Hide file tree

Showing 11 changed files with 443 additions and 13 deletions.
diff --git a/.github/scripts/node-addon/package-optional.json b/.github/scripts/node-addon/package-optional.json
@@ -1,7 +1,7 @@
 {
   "name": "sherpa-onnx-PLATFORM2-ARCH",
   "version": "SHERPA_ONNX_VERSION",
-  "description": "Speech-to-text and text-to-speech using Next-gen Kaldi without internet connection",
+  "description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection",
   "main": "index.js",
   "scripts": {
     "test": "echo \"Error: no test specified\" && exit 1"
@@ -16,8 +16,18 @@
     "transcription",
     "real-time speech recognition",
     "without internet connection",
+    "locally",
+    "local",
     "embedded systems",
     "open source",
+    "diarization",
+    "speaker diarization",
+    "speaker recognition",
+    "speaker",
+    "speaker segmentation",
+    "speaker verification",
+    "spoken language identification",
+    "sherpa",
     "zipformer",
     "asr",
     "tts",
@@ -30,13 +40,13 @@
     "offline",
     "privacy",
     "open source",
-    "vad",
-    "speaker id",
-    "language id",
-    "node-addon-api",
     "streaming speech recognition",
     "speech",
-    "recognition"
+    "recognition",
+    "vad",
+    "node-addon-api",
+    "speaker id",
+    "language id"
   ],
   "author": "The next-gen Kaldi team",
   "license": "Apache-2.0",

diff --git a/.github/scripts/node-addon/package.json b/.github/scripts/node-addon/package.json
@@ -1,7 +1,7 @@
 {
   "name": "sherpa-onnx-node",
   "version": "SHERPA_ONNX_VERSION",
-  "description": "Speech-to-text and text-to-speech using Next-gen Kaldi without internet connection",
+  "description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection",
   "main": "sherpa-onnx.js",
   "scripts": {
     "test": "echo \"Error: no test specified\" && exit 1"
@@ -16,8 +16,18 @@
     "transcription",
     "real-time speech recognition",
     "without internet connection",
+    "locally",
+    "local",
     "embedded systems",
     "open source",
+    "diarization",
+    "speaker diarization",
+    "speaker recognition",
+    "speaker",
+    "speaker segmentation",
+    "speaker verification",
+    "spoken language identification",
+    "sherpa",
     "zipformer",
     "asr",
     "tts",
@@ -30,13 +40,13 @@
     "offline",
     "privacy",
     "open source",
-    "vad",
-    "speaker id",
-    "language id",
-    "node-addon-api",
     "streaming speech recognition",
     "speech",
-    "recognition"
+    "recognition",
+    "vad",
+    "node-addon-api",
+    "speaker id",
+    "language id"
   ],
   "author": "The next-gen Kaldi team",
   "license": "Apache-2.0",

diff --git a/.github/scripts/test-nodejs-addon-npm.sh b/.github/scripts/test-nodejs-addon-npm.sh
@@ -10,6 +10,20 @@ arch=$(node -p "require('os').arch()")
 platform=$(node -p "require('os').platform()")
 node_version=$(node -p "process.versions.node.split('.')[0]")
 
+echo "----------non-streaming speaker diarization----------"
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
+
+node ./test_offline_speaker_diarization.js
+
+rm -rfv *.onnx *.wav sherpa-onnx-pyannote-*
+
 echo "----------non-streaming asr + vad----------"
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
 tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2

diff --git a/nodejs-addon-examples/README.md b/nodejs-addon-examples/README.md
@@ -43,6 +43,12 @@ export LD_LIBRARY_PATH=$PWD/node_modules/.pnpm/sherpa-onnx-node@<REPLACE-THIS-WI
 
 The following tables list the examples in this folder.
 
+## Speaker diarization
+
+|File| Description|
+|---|---|
+|[./test_offline_speaker_diarization.js](./test_offline_speaker_diarization.js)| It demonstrates how to use the sherpa-onnx JavaScript API for speaker diarization. It supports speaker segmentation models from [pyannote-audio](https://github.com/pyannote/pyannote-audio).|
+
 ## Add punctuations to text
 
 |File| Description|
@@ -130,6 +136,21 @@ The following tables list the examples in this folder.
 |[./test_tts_non_streaming_vits_zh_aishell3.js](./test_tts_non_streaming_vits_zh_aishell3.js)| Text-to-speech with a Chinese TTS model|
 
 
+### Speaker diarization
+
+```bash
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
+
+node ./test_offline_speaker_diarization.js
+```
+
 ### Voice Activity detection (VAD)
 
 ```bash

diff --git a/nodejs-addon-examples/test_offline_speaker_diarization.js b/nodejs-addon-examples/test_offline_speaker_diarization.js
@@ -0,0 +1,62 @@
+// Copyright (c)  2024  Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx-node');
+
+// clang-format off
+/* Please use the following commands to download files
+   used in this script
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
+
+ */
+// clang-format on
+
+const config = {
+  segmentation: {
+    pyannote: {
+      model: './sherpa-onnx-pyannote-segmentation-3-0/model.onnx',
+    },
+  },
+  embedding: {
+    model: './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx',
+  },
+  clustering: {
+    // Since we know that the test wave file
+    // ./0-four-speakers-zh.wav contains 4 speakers, we use 4 for numClusters
+    // here. If you don't have such information, please set numClusters to -1
+    numClusters: 4,
+
+    // If numClusters is not -1, then threshold is ignored.
+    //
+    // A larger threshold leads to fewer clusters, i.e., fewer speakers
+    // A smaller threshold leads to more clusters, i.e., more speakers
+    // You need to tune it by yourself.
+    threshold: 0.5,
+  },
+
+  // If a segment is shorter than minDurationOn, we discard it
+  minDurationOn: 0.2,  // in seconds
+
+  // If the gap between two segments is less than minDurationOff, then we
+  // merge these two segments into a single one
+  minDurationOff: 0.5,  // in seconds
+};
+
+const waveFilename = './0-four-speakers-zh.wav';
+
+const sd = new sherpa_onnx.OfflineSpeakerDiarization(config);
+console.log('Started');
+
+const wave = sherpa_onnx.readWave(waveFilename);
+if (sd.sampleRate !== wave.sampleRate) {  // this script does no resampling
+  throw new Error(
+      `Expected sample rate: ${sd.sampleRate}, given: ${wave.sampleRate}`);
+}
+
+const segments = sd.process(wave.samples);
+console.log(segments);
diff --git a/scripts/node-addon-api/CMakeLists.txt b/scripts/node-addon-api/CMakeLists.txt
@@ -21,6 +21,7 @@ set(srcs
   src/audio-tagging.cc
   src/keyword-spotting.cc
   src/non-streaming-asr.cc
+  src/non-streaming-speaker-diarization.cc
   src/non-streaming-tts.cc
   src/punctuation.cc
   src/sherpa-onnx-node-addon-api.cc

diff --git a/scripts/node-addon-api/lib/non-streaming-speaker-diarization.js b/scripts/node-addon-api/lib/non-streaming-speaker-diarization.js
@@ -0,0 +1,32 @@
+const addon = require('./addon.js');
+
+class OfflineSpeakerDiarization {
+  constructor(config) {
+    this.handle = addon.createOfflineSpeakerDiarization(config);
+    this.config = config;
+
+    this.sampleRate = addon.getOfflineSpeakerDiarizationSampleRate(this.handle);
+  }
+
+  /**
+   * Run speaker diarization on samples, a 1-d float32 array. Each element
+   * of the array should be in the range [-1, 1].
+   *
+   * We assume its sample rate is equal to this.sampleRate.
+   *
+   * Returns an array of objects, where each object is
+   *
+   *  {
+   *    "start": start_time_in_seconds,
+   *    "end": end_time_in_seconds,
+   *    "speaker": an_integer,
+   *  }
+   */
+  process(samples) {
+    return addon.offlineSpeakerDiarizationProcess(this.handle, samples);
+  }
+}
+
+module.exports = {
+  OfflineSpeakerDiarization,
+}
diff --git a/scripts/node-addon-api/lib/sherpa-onnx.js b/scripts/node-addon-api/lib/sherpa-onnx.js
@@ -8,6 +8,7 @@ const sid = require('./speaker-identification.js');
 const at = require('./audio-tagg.js');
 const punct = require('./punctuation.js');
 const kws = require('./keyword-spotter.js');
+const sd = require('./non-streaming-speaker-diarization.js');
 
 module.exports = {
   OnlineRecognizer: streaming_asr.OnlineRecognizer,
@@ -24,4 +25,5 @@ module.exports = {
   AudioTagging: at.AudioTagging,
   Punctuation: punct.Punctuation,
   KeywordSpotter: kws.KeywordSpotter,
+  OfflineSpeakerDiarization: sd.OfflineSpeakerDiarization,
 }
diff --git a/scripts/node-addon-api/package.json b/scripts/node-addon-api/package.json
@@ -1,7 +1,7 @@
 {
   "main": "lib/sherpa-onnx.js",
   "version": "1.0.0",
-  "description": "Speech-to-text and text-to-speech using Next-gen Kaldi without internet connection",
+  "description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection",
   "dependencies": {
     "cmake-js": "^6.0.0",
     "node-addon-api": "^1.1.0",
@@ -21,8 +21,18 @@
     "transcription",
     "real-time speech recognition",
     "without internet connection",
+    "locally",
+    "local",
     "embedded systems",
     "open source",
+    "diarization",
+    "speaker diarization",
+    "speaker recognition",
+    "speaker",
+    "speaker segmentation",
+    "speaker verification",
+    "spoken language identification",
+    "sherpa",
     "zipformer",
     "asr",
     "tts",