Skip to content

Commit

Permalink
Add inverse text normalization for online ASR (#1020)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Jun 17, 2024
1 parent 6e09933 commit 349d957
Show file tree
Hide file tree
Showing 12 changed files with 390 additions and 32 deletions.
11 changes: 11 additions & 0 deletions .github/scripts/test-python.sh
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,18 @@ if [[ x$OS != x'windows-latest' ]]; then
$repo/test_wavs/3.wav \
$repo/test_wavs/8k.wav

ln -s $repo $PWD/

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav

python3 ./python-api-examples/inverse-text-normalization-online-asr.py

python3 sherpa-onnx/python/tests/test_online_recognizer.py --verbose

rm -rfv sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20

rm -rf $repo
fi

log "Test non-streaming transducer models"
Expand Down
91 changes: 91 additions & 0 deletions python-api-examples/inverse-text-normalization-online-asr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#!/usr/bin/env python3
#
# Copyright (c) 2024 Xiaomi Corporation

"""
This script shows how to use inverse text normalization with streaming ASR.
Usage:
(1) Download the test model
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
(2) Download rule fst
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst
Please refer to
https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/itn_zh_number.ipynb
for how itn_zh_number.fst is generated.
(3) Download test wave
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav
(4) Run this script
python3 ./python-api-examples/inverse-text-normalization-online-asr.py
"""
from pathlib import Path

import sherpa_onnx
import soundfile as sf


def create_recognizer():
    """Build a streaming transducer recognizer with inverse text normalization.

    The model files are looked up in
    ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 and the rule
    FST in ./itn_zh_number.fst, both relative to the current directory.

    Returns:
      The object returned by sherpa_onnx.OnlineRecognizer.from_transducer().

    Raises:
      ValueError: if any model file or the rule FST is not a regular file.
    """
    model_files = {
        "encoder": "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx",
        "decoder": "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx",
        "joiner": "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx",
        "tokens": "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt",
    }
    rule_fsts = "./itn_zh_number.fst"

    # Checked in the same order as the arguments are passed below.
    required_paths = [*model_files.values(), rule_fsts]
    everything_present = all(Path(p).is_file() for p in required_paths)
    if not everything_present:
        raise ValueError(
            """Please download model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
"""
        )

    return sherpa_onnx.OnlineRecognizer.from_transducer(
        **model_files,
        debug=True,
        rule_fsts=rule_fsts,
    )


def main():
    """Decode ./itn-zh-number.wav with the streaming recognizer and print the result.

    Raises:
      ValueError: if the test wave file is not a regular file (in addition to
        whatever create_recognizer() raises).
    """
    recognizer = create_recognizer()
    wave_filename = "./itn-zh-number.wav"
    if not Path(wave_filename).is_file():
        # Name the wave file explicitly: at this point the model files have
        # already been found, so only the test wave can be missing.
        raise ValueError(
            f"""Please download the test wave file {wave_filename} from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
"""
        )
    audio, sample_rate = sf.read(wave_filename, dtype="float32", always_2d=True)
    audio = audio[:, 0]  # only use the first channel

    stream = recognizer.create_stream()
    stream.accept_waveform(sample_rate, audio)

    # Append 0.3 seconds of silence after the recording
    # (presumably so the model can emit the final tokens -- TODO confirm).
    tail_padding = [0] * int(0.3 * sample_rate)
    stream.accept_waveform(sample_rate, tail_padding)

    # Decode until the recognizer reports no more frames are ready.
    while recognizer.is_ready(stream):
        recognizer.decode_stream(stream)

    print(wave_filename)
    print(recognizer.get_result_all(stream))


if __name__ == "__main__":
main()
12 changes: 8 additions & 4 deletions sherpa-onnx/csrc/online-recognizer-ctc-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ static OnlineRecognizerResult Convert(const OnlineCtcDecoderResult &src,
class OnlineRecognizerCtcImpl : public OnlineRecognizerImpl {
public:
explicit OnlineRecognizerCtcImpl(const OnlineRecognizerConfig &config)
: config_(config),
: OnlineRecognizerImpl(config),
config_(config),
model_(OnlineCtcModel::Create(config.model_config)),
sym_(config.model_config.tokens),
endpoint_(config_.endpoint_config) {
Expand All @@ -84,7 +85,8 @@ class OnlineRecognizerCtcImpl : public OnlineRecognizerImpl {
#if __ANDROID_API__ >= 9
explicit OnlineRecognizerCtcImpl(AAssetManager *mgr,
const OnlineRecognizerConfig &config)
: config_(config),
: OnlineRecognizerImpl(mgr, config),
config_(config),
model_(OnlineCtcModel::Create(mgr, config.model_config)),
sym_(mgr, config.model_config.tokens),
endpoint_(config_.endpoint_config) {
Expand Down Expand Up @@ -182,8 +184,10 @@ class OnlineRecognizerCtcImpl : public OnlineRecognizerImpl {
// TODO(fangjun): Remember to change these constants if needed
int32_t frame_shift_ms = 10;
int32_t subsampling_factor = 4;
return Convert(decoder_result, sym_, frame_shift_ms, subsampling_factor,
s->GetCurrentSegment(), s->GetNumFramesSinceStart());
auto r = Convert(decoder_result, sym_, frame_shift_ms, subsampling_factor,
s->GetCurrentSegment(), s->GetNumFramesSinceStart());
r.text = ApplyInverseTextNormalization(r.text);
return r;
}

bool IsEndpoint(OnlineStream *s) const override {
Expand Down
117 changes: 117 additions & 0 deletions sherpa-onnx/csrc/online-recognizer-impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,22 @@

#include "sherpa-onnx/csrc/online-recognizer-impl.h"

#include <cstdlib>

#if __ANDROID_API__ >= 9
#include <strstream>

#include "android/asset_manager.h"
#include "android/asset_manager_jni.h"
#endif

#include "fst/extensions/far/far.h"
#include "kaldifst/csrc/kaldi-fst-io.h"
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/online-recognizer-ctc-impl.h"
#include "sherpa-onnx/csrc/online-recognizer-paraformer-impl.h"
#include "sherpa-onnx/csrc/online-recognizer-transducer-impl.h"
#include "sherpa-onnx/csrc/online-recognizer-transducer-nemo-impl.h"
#include "sherpa-onnx/csrc/onnx-utils.h"
#include "sherpa-onnx/csrc/text-utils.h"

namespace sherpa_onnx {

Expand Down Expand Up @@ -78,4 +89,110 @@ std::unique_ptr<OnlineRecognizerImpl> OnlineRecognizerImpl::Create(
}
#endif

// Builds the list of inverse-text-normalization (ITN) rules from config.
//
// config.rule_fsts and config.rule_fars are split on ','. Every rule FST
// file contributes one normalizer; every FST archive contributes one
// normalizer per FST stored in it. Normalizers are appended in the order they
// are read (rule FSTs first, then archives), which is the order in which
// ApplyInverseTextNormalization() applies them.
//
// Terminates the process if an FST archive cannot be opened.
OnlineRecognizerImpl::OnlineRecognizerImpl(const OnlineRecognizerConfig &config)
    : config_(config) {
  if (!config.rule_fsts.empty()) {
    std::vector<std::string> files;
    SplitStringToVector(config.rule_fsts, ",", false, &files);
    itn_list_.reserve(files.size());
    for (const auto &f : files) {
      if (config.model_config.debug) {
        SHERPA_ONNX_LOGE("rule fst: %s", f.c_str());
      }
      itn_list_.push_back(std::make_unique<kaldifst::TextNormalizer>(f));
    }
  }

  if (!config.rule_fars.empty()) {
    if (config.model_config.debug) {
      SHERPA_ONNX_LOGE("Loading FST archives");
    }
    std::vector<std::string> files;
    SplitStringToVector(config.rule_fars, ",", false, &files);

    // Each archive holds at least one FST in the common case, so this is a
    // lower bound that avoids the first few reallocations.
    itn_list_.reserve(files.size() + itn_list_.size());

    for (const auto &f : files) {
      if (config.model_config.debug) {
        SHERPA_ONNX_LOGE("rule far: %s", f.c_str());
      }
      std::unique_ptr<fst::FarReader<fst::StdArc>> reader(
          fst::FarReader<fst::StdArc>::Open(f));
      if (!reader) {
        // Open() yields a null reader when the archive is missing or
        // malformed; stop here instead of dereferencing it below.
        SHERPA_ONNX_LOGE("Failed to open rule far: %s", f.c_str());
        std::exit(-1);
      }

      for (; !reader->Done(); reader->Next()) {
        // Copy the FST out of the reader so that the normalizer owns it.
        std::unique_ptr<fst::StdConstFst> r(
            fst::CastOrConvertToConstFst(reader->GetFst()->Copy()));

        itn_list_.push_back(
            std::make_unique<kaldifst::TextNormalizer>(std::move(r)));
      }
    }

    if (config.model_config.debug) {
      SHERPA_ONNX_LOGE("FST archives loaded!");
    }
  }
}

#if __ANDROID_API__ >= 9
// Android variant of the constructor: builds the same list of
// inverse-text-normalization rules, but reads every rule file through the
// asset manager `mgr` and parses it from memory.
//
// config.rule_fsts and config.rule_fars are split on ','. Every rule FST
// contributes one normalizer; every FST archive contributes one normalizer
// per FST stored in it, appended in the order they are read.
//
// Terminates the process if an FST archive cannot be opened.
OnlineRecognizerImpl::OnlineRecognizerImpl(AAssetManager *mgr,
                                           const OnlineRecognizerConfig &config)
    : config_(config) {
  if (!config.rule_fsts.empty()) {
    std::vector<std::string> files;
    SplitStringToVector(config.rule_fsts, ",", false, &files);
    itn_list_.reserve(files.size());
    for (const auto &f : files) {
      if (config.model_config.debug) {
        SHERPA_ONNX_LOGE("rule fst: %s", f.c_str());
      }
      auto buf = ReadFile(mgr, f);
      std::istrstream is(buf.data(), buf.size());
      itn_list_.push_back(std::make_unique<kaldifst::TextNormalizer>(is));
    }
  }

  if (!config.rule_fars.empty()) {
    if (config.model_config.debug) {
      SHERPA_ONNX_LOGE("Loading FST archives");
    }
    std::vector<std::string> files;
    SplitStringToVector(config.rule_fars, ",", false, &files);
    itn_list_.reserve(files.size() + itn_list_.size());

    for (const auto &f : files) {
      if (config.model_config.debug) {
        SHERPA_ONNX_LOGE("rule far: %s", f.c_str());
      }

      // istrstream reads straight from buf's storage without copying it, so
      // buf is declared before the reader and therefore outlives it.
      auto buf = ReadFile(mgr, f);

      std::unique_ptr<std::istream> s =
          std::make_unique<std::istrstream>(buf.data(), buf.size());

      std::unique_ptr<fst::FarReader<fst::StdArc>> reader(
          fst::FarReader<fst::StdArc>::Open(std::move(s)));
      if (!reader) {
        // Open() yields a null reader when the archive is malformed; stop
        // here instead of dereferencing it below.
        SHERPA_ONNX_LOGE("Failed to open rule far: %s", f.c_str());
        std::exit(-1);
      }

      for (; !reader->Done(); reader->Next()) {
        // Copy the FST out of the reader so that the normalizer owns it.
        std::unique_ptr<fst::StdConstFst> r(
            fst::CastOrConvertToConstFst(reader->GetFst()->Copy()));

        itn_list_.push_back(
            std::make_unique<kaldifst::TextNormalizer>(std::move(r)));
      }  // for (; !reader->Done(); reader->Next())
    }    // for (const auto &f : files)

    if (config.model_config.debug) {
      SHERPA_ONNX_LOGE("FST archives loaded!");
    }
  }  // if (!config.rule_fars.empty())
}
#endif

// Passes `text` through every loaded inverse-text-normalization rule in
// order, feeding the output of one rule into the next, and returns the final
// string. With no rules loaded the input is returned unchanged.
//
// When model_config.debug is set, the intermediate text is logged after each
// rule.
std::string OnlineRecognizerImpl::ApplyInverseTextNormalization(
    std::string text) const {
  const auto verbose = config_.model_config.debug;

  // Iterating an empty list is a no-op, so no separate emptiness check is
  // needed.
  for (const auto &normalizer : itn_list_) {
    text = normalizer->Normalize(text);

    if (verbose) {
      SHERPA_ONNX_LOGE("After inverse text normalization: %s", text.c_str());
    }
  }

  return text;
}

} // namespace sherpa_onnx
20 changes: 20 additions & 0 deletions sherpa-onnx/csrc/online-recognizer-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@
#include <string>
#include <vector>

#if __ANDROID_API__ >= 9
#include "android/asset_manager.h"
#include "android/asset_manager_jni.h"
#endif

#include "kaldifst/csrc/text-normalizer.h"
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/online-recognizer.h"
#include "sherpa-onnx/csrc/online-stream.h"
Expand All @@ -17,10 +23,15 @@ namespace sherpa_onnx {

class OnlineRecognizerImpl {
public:
explicit OnlineRecognizerImpl(const OnlineRecognizerConfig &config);

static std::unique_ptr<OnlineRecognizerImpl> Create(
const OnlineRecognizerConfig &config);

#if __ANDROID_API__ >= 9
OnlineRecognizerImpl(AAssetManager *mgr,
const OnlineRecognizerConfig &config);

static std::unique_ptr<OnlineRecognizerImpl> Create(
AAssetManager *mgr, const OnlineRecognizerConfig &config);
#endif
Expand Down Expand Up @@ -50,6 +61,15 @@ class OnlineRecognizerImpl {
virtual bool IsEndpoint(OnlineStream *s) const = 0;

virtual void Reset(OnlineStream *s) const = 0;

std::string ApplyInverseTextNormalization(std::string text) const;

private:
OnlineRecognizerConfig config_;
// for inverse text normalization. Used only if
// config.rule_fsts is not empty or
// config.rule_fars is not empty
std::vector<std::unique_ptr<kaldifst::TextNormalizer>> itn_list_;
};

} // namespace sherpa_onnx
Expand Down
10 changes: 7 additions & 3 deletions sherpa-onnx/csrc/online-recognizer-paraformer-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,8 @@ static void Scale(const float *x, int32_t n, float scale, float *y) {
class OnlineRecognizerParaformerImpl : public OnlineRecognizerImpl {
public:
explicit OnlineRecognizerParaformerImpl(const OnlineRecognizerConfig &config)
: config_(config),
: OnlineRecognizerImpl(config),
config_(config),
model_(config.model_config),
sym_(config.model_config.tokens),
endpoint_(config_.endpoint_config) {
Expand All @@ -116,7 +117,8 @@ class OnlineRecognizerParaformerImpl : public OnlineRecognizerImpl {
#if __ANDROID_API__ >= 9
explicit OnlineRecognizerParaformerImpl(AAssetManager *mgr,
const OnlineRecognizerConfig &config)
: config_(config),
: OnlineRecognizerImpl(mgr, config),
config_(config),
model_(mgr, config.model_config),
sym_(mgr, config.model_config.tokens),
endpoint_(config_.endpoint_config) {
Expand Down Expand Up @@ -160,7 +162,9 @@ class OnlineRecognizerParaformerImpl : public OnlineRecognizerImpl {
// Returns the recognition result currently held by stream `s`: the stream's
// paraformer decoder result converted via Convert() using sym_, with the
// text passed through ApplyInverseTextNormalization().
OnlineRecognizerResult GetResult(OnlineStream *s) const override {
  auto paraformer_result = s->GetParaformerResult();
  auto result = Convert(paraformer_result, sym_);

  // Rewrite the text with the inverse-text-normalization rules.
  result.text = ApplyInverseTextNormalization(result.text);

  return result;
}

bool IsEndpoint(OnlineStream *s) const override {
Expand Down
12 changes: 8 additions & 4 deletions sherpa-onnx/csrc/online-recognizer-transducer-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ OnlineRecognizerResult Convert(const OnlineTransducerDecoderResult &src,
class OnlineRecognizerTransducerImpl : public OnlineRecognizerImpl {
public:
explicit OnlineRecognizerTransducerImpl(const OnlineRecognizerConfig &config)
: config_(config),
: OnlineRecognizerImpl(config),
config_(config),
model_(OnlineTransducerModel::Create(config.model_config)),
sym_(config.model_config.tokens),
endpoint_(config_.endpoint_config) {
Expand Down Expand Up @@ -124,7 +125,8 @@ class OnlineRecognizerTransducerImpl : public OnlineRecognizerImpl {
#if __ANDROID_API__ >= 9
explicit OnlineRecognizerTransducerImpl(AAssetManager *mgr,
const OnlineRecognizerConfig &config)
: config_(config),
: OnlineRecognizerImpl(mgr, config),
config_(config),
model_(OnlineTransducerModel::Create(mgr, config.model_config)),
sym_(mgr, config.model_config.tokens),
endpoint_(config_.endpoint_config) {
Expand Down Expand Up @@ -332,8 +334,10 @@ class OnlineRecognizerTransducerImpl : public OnlineRecognizerImpl {
// TODO(fangjun): Remember to change these constants if needed
int32_t frame_shift_ms = 10;
int32_t subsampling_factor = 4;
return Convert(decoder_result, sym_, frame_shift_ms, subsampling_factor,
s->GetCurrentSegment(), s->GetNumFramesSinceStart());
auto r = Convert(decoder_result, sym_, frame_shift_ms, subsampling_factor,
s->GetCurrentSegment(), s->GetNumFramesSinceStart());
r.text = ApplyInverseTextNormalization(std::move(r.text));
return r;
}

bool IsEndpoint(OnlineStream *s) const override {
Expand Down
Loading

0 comments on commit 349d957

Please sign in to comment.