From 2c4b952887a002128b9d9156c7b5575984a6bca7 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Fri, 8 Mar 2024 03:18:56 +0000
Subject: [PATCH] Add Python ASR examples with alsa

---
 CMakeLists.txt                                |   1 +
 ...microphone-with-endpoint-detection-alsa.py | 206 ++++++++++++++++++
 sherpa-onnx/python/csrc/CMakeLists.txt        |   2 +-
 3 files changed, 208 insertions(+), 1 deletion(-)
 create mode 100755 python-api-examples/speech-recognition-from-microphone-with-endpoint-detection-alsa.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0d98b00cb..c57ae3598 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -146,6 +146,7 @@ include(CheckIncludeFileCXX)
 if(UNIX AND NOT APPLE AND NOT SHERPA_ONNX_ENABLE_WASM AND NOT CMAKE_SYSTEM_NAME STREQUAL Android)
   check_include_file_cxx(alsa/asoundlib.h SHERPA_ONNX_HAS_ALSA)
   if(SHERPA_ONNX_HAS_ALSA)
+    message(STATUS "With Alsa")
     add_definitions(-DSHERPA_ONNX_ENABLE_ALSA=1)
   else()
     message(WARNING "\
diff --git a/python-api-examples/speech-recognition-from-microphone-with-endpoint-detection-alsa.py b/python-api-examples/speech-recognition-from-microphone-with-endpoint-detection-alsa.py
new file mode 100755
index 000000000..eaf9202b9
--- /dev/null
+++ b/python-api-examples/speech-recognition-from-microphone-with-endpoint-detection-alsa.py
@@ -0,0 +1,206 @@
+#!/usr/bin/env python3
+
+# Real-time speech recognition from a microphone with sherpa-onnx Python API
+# with endpoint detection.
+#
+# Note: This script uses ALSA and works only on Linux systems.
+#
+# Please refer to
+# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
+# to download pre-trained models
+
+import argparse
+import sys
+from pathlib import Path
+import sherpa_onnx
+
+
+def assert_file_exists(filename: str):
+    assert Path(filename).is_file(), (
+        f"{filename} does not exist!\n"
+        "Please refer to "
+        "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
+    )
+
+
+def get_args():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--tokens",
+        type=str,
+        required=True,
+        help="Path to tokens.txt",
+    )
+
+    parser.add_argument(
+        "--encoder",
+        type=str,
+        required=True,
+        help="Path to the encoder model",
+    )
+
+    parser.add_argument(
+        "--decoder",
+        type=str,
+        required=True,
+        help="Path to the decoder model",
+    )
+
+    parser.add_argument(
+        "--joiner",
+        type=str,
+        required=True,
+        help="Path to the joiner model",
+    )
+
+    parser.add_argument(
+        "--decoding-method",
+        type=str,
+        default="greedy_search",
+        help="Valid values are greedy_search and modified_beam_search",
+    )
+
+    parser.add_argument(
+        "--provider",
+        type=str,
+        default="cpu",
+        help="Valid values: cpu, cuda, coreml",
+    )
+
+    parser.add_argument(
+        "--hotwords-file",
+        type=str,
+        default="",
+        help="""
+        The file containing hotwords, one word/phrase per line, and for each
+        phrase the bpe/cjkchar tokens are separated by a space. For example:
+
+        ▁HE LL O ▁WORLD
+        你 好 世 界
+        """,
+    )
+
+    parser.add_argument(
+        "--hotwords-score",
+        type=float,
+        default=1.5,
+        help="""
+        The hotword score of each token for biasing word/phrase. Used only if
+        --hotwords-file is given.
+        """,
+    )
+
+    parser.add_argument(
+        "--blank-penalty",
+        type=float,
+        default=0.0,
+        help="""
+        The penalty applied on blank symbol during decoding.
+        Note: It is a positive value that would be applied to logits like
+        this `logits[:, 0] -= blank_penalty` (suppose logits.shape is
+        [batch_size, vocab] and blank id is 0).
+        """,
+    )
+
+    parser.add_argument(
+        "--device-name",
+        type=str,
+        required=True,
+        help="""
+The device name specifies which microphone to use in case there are several
+on your system. You can use
+
+  arecord -l
+
+to find all available microphones on your computer. For instance, if it outputs
+
+**** List of CAPTURE Hardware Devices ****
+card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
+  Subdevices: 1/1
+  Subdevice #0: subdevice #0
+
+and if you want to select card 3 and device 0 on that card, please use:
+
+  plughw:3,0
+
+as the device_name.
+        """,
+    )
+
+    return parser.parse_args()
+
+
+def create_recognizer(args):
+    assert_file_exists(args.encoder)
+    assert_file_exists(args.decoder)
+    assert_file_exists(args.joiner)
+    assert_file_exists(args.tokens)
+    # Please replace the model files if needed.
+    # See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
+    # for download links.
+    recognizer = sherpa_onnx.OnlineRecognizer.from_transducer(
+        tokens=args.tokens,
+        encoder=args.encoder,
+        decoder=args.decoder,
+        joiner=args.joiner,
+        num_threads=1,
+        sample_rate=16000,
+        feature_dim=80,
+        enable_endpoint_detection=True,
+        rule1_min_trailing_silence=2.4,
+        rule2_min_trailing_silence=1.2,
+        rule3_min_utterance_length=300,  # it essentially disables this rule
+        decoding_method=args.decoding_method,
+        provider=args.provider,
+        hotwords_file=args.hotwords_file,
+        hotwords_score=args.hotwords_score,
+        blank_penalty=args.blank_penalty,
+    )
+    return recognizer
+
+
+def main():
+    args = get_args()
+    device_name = args.device_name
+    print(f"device_name: {device_name}")
+    alsa = sherpa_onnx.Alsa(device_name)
+
+    print("Creating recognizer")
+    recognizer = create_recognizer(args)
+    print("Started! Please speak")
+
+    sample_rate = 16000
+    samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms
+
+    stream = recognizer.create_stream()
+
+    last_result = ""
+    segment_id = 0
+    while True:
+        samples = alsa.read(samples_per_read)  # a blocking read
+        stream.accept_waveform(sample_rate, samples)
+        while recognizer.is_ready(stream):
+            recognizer.decode_stream(stream)
+
+        is_endpoint = recognizer.is_endpoint(stream)
+
+        result = recognizer.get_result(stream)
+
+        if result and (last_result != result):
+            last_result = result
+            print("\r{}:{}".format(segment_id, result), end="", flush=True)
+        if is_endpoint:
+            if result:
+                print("\r{}:{}".format(segment_id, result), flush=True)
+            segment_id += 1
+            recognizer.reset(stream)
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except KeyboardInterrupt:
+        print("\nCaught Ctrl + C. Exiting")
diff --git a/sherpa-onnx/python/csrc/CMakeLists.txt b/sherpa-onnx/python/csrc/CMakeLists.txt
index 4b3dd8146..bba7903a0 100644
--- a/sherpa-onnx/python/csrc/CMakeLists.txt
+++ b/sherpa-onnx/python/csrc/CMakeLists.txt
@@ -38,7 +38,7 @@ set(srcs
   voice-activity-detector.cc
 )
 if(SHERPA_ONNX_HAS_ALSA)
-  list(APPEND srcs ${CMAKE_SOURCE_DIR/sherpa-onnx/csrc/alsa.cc} alsa.cc)
+  list(APPEND srcs ${CMAKE_SOURCE_DIR}/sherpa-onnx/csrc/alsa.cc alsa.cc)
 else()
   list(APPEND srcs faked-alsa.cc)
 endif()
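Note on checking the --device-name value: before running the full example, it can help to confirm that the chosen ALSA device actually captures audio. The minimal sketch below (not part of the patch) uses only the sherpa_onnx.Alsa(device_name) constructor and the blocking alsa.read() call that the example script above relies on; the RMS print-out, the 5-second duration, and the plughw:3,0 placeholder are illustrative assumptions, and it assumes alsa.read() returns float samples at 16 kHz, as the example implies when it calls accept_waveform(16000, samples).

import sherpa_onnx

# Placeholder: pick the card/device reported by `arecord -l` on your machine.
device_name = "plughw:3,0"
alsa = sherpa_onnx.Alsa(device_name)

sample_rate = 16000
samples_per_read = int(0.1 * sample_rate)  # 100 ms per blocking read, as in the example

for _ in range(50):  # roughly 5 seconds of audio
    samples = alsa.read(samples_per_read)  # blocking read of float samples (assumed)
    # A silent or mis-selected capture device keeps the level near zero.
    rms = (sum(s * s for s in samples) / max(len(samples), 1)) ** 0.5
    print(f"RMS: {rms:.4f}")

If the printed level stays near zero while you speak, re-check the card/device numbers from arecord -l before debugging the recognizer itself.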