Add Python ASR example with alsa

k2-fsa · Mar 8, 2024 · 42c81cb · 42c81cb
1 parent 2c37cdb
commit 42c81cb
Showing 1 changed file with 111 additions and 0 deletions.
diff --git a/python-api-examples/speech-recognition-from-microphone-with-endpoint-detection-alsa.py b/python-api-examples/speech-recognition-from-microphone-with-endpoint-detection-alsa.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+
+# Real-time speech recognition from a microphone with sherpa-ncnn Python API
+# with endpoint detection.
+#
+# Note: This script uses ALSA and works only on Linux systems, especially
+# for embedded Linux systems and for running Linux on Windows using WSL.
+#
+# Please refer to
+# https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html
+# to download pre-trained models
+
+import argparse
+import sys
+
+import sherpa_ncnn
+
+
+def get_args():
+    """Parse the command-line arguments of this example.
+
+    Returns:
+      The ``argparse.Namespace`` produced by ``parse_args()``. Its
+      ``device_name`` attribute holds the value of the required
+      ``--device-name`` option.
+    """
+    # The help text below embeds pre-formatted sample output of `arecord -l`.
+    # argparse's default formatters re-wrap help strings into one paragraph
+    # (and ArgumentDefaultsHelpFormatter additionally appends
+    # "(default: None)" to this required option), so use the raw-text
+    # formatter to keep the line breaks exactly as written.
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawTextHelpFormatter
+    )
+
+    # strip() removes the blank first line and the indented last line that
+    # the triple-quoted literal would otherwise contribute to the output.
+    device_name_help = """
+The device name specifies which microphone to use in case there are several
+on your system. You can use
+
+  arecord -l
+
+to find all available microphones on your computer. For instance, if it outputs
+
+**** List of CAPTURE Hardware Devices ****
+card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
+  Subdevices: 1/1
+  Subdevice #0: subdevice #0
+
+and if you want to select card 3 and the device 0 on that card, please use:
+
+  plughw:3,0
+
+as the device_name.
+        """.strip()
+
+    parser.add_argument(
+        "--device-name",
+        type=str,
+        required=True,
+        help=device_name_help,
+    )
+
+    return parser.parse_args()
+
+
+def create_recognizer():
+    """Build the recognizer used by this example.
+
+    The model file paths are hard-coded relative to the current working
+    directory; replace them if you use a different model. See
+    https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html
+    for download links.
+
+    Returns:
+      The object returned by ``sherpa_ncnn.Recognizer(...)``.
+    """
+    # Files that make up the pre-trained model.
+    model_files = dict(
+        tokens="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/tokens.txt",
+        encoder_param="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/encoder_jit_trace-pnnx.ncnn.param",
+        encoder_bin="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/encoder_jit_trace-pnnx.ncnn.bin",
+        decoder_param="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/decoder_jit_trace-pnnx.ncnn.param",
+        decoder_bin="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/decoder_jit_trace-pnnx.ncnn.bin",
+        joiner_param="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/joiner_jit_trace-pnnx.ncnn.param",
+        joiner_bin="./sherpa-ncnn-conv-emformer-transducer-2022-12-06/joiner_jit_trace-pnnx.ncnn.bin",
+    )
+
+    # Decoding, endpoint-detection and hotword settings.
+    # NOTE(review): the rule*_ values look like durations in seconds -
+    # confirm against the sherpa-ncnn documentation before tuning them.
+    settings = dict(
+        num_threads=4,
+        decoding_method="modified_beam_search",
+        enable_endpoint_detection=True,
+        rule1_min_trailing_silence=2.4,
+        rule2_min_trailing_silence=1.2,
+        rule3_min_utterance_length=300,
+        hotwords_file="",
+        hotwords_score=1.5,
+    )
+
+    return sherpa_ncnn.Recognizer(**model_files, **settings)
+
+
+def main():
+    """Recognize speech from an ALSA capture device until interrupted.
+
+    Opens the device named by ``--device-name``, reads audio in 100 ms
+    chunks and feeds it to the recognizer. The current partial result is
+    redrawn in place on one terminal line; when the recognizer reports an
+    endpoint, the line is finalized, the segment counter is advanced and the
+    recognizer is reset for the next segment.
+
+    The loop has no exit condition, so this function only stops when an
+    exception (for example ``KeyboardInterrupt``) propagates out of it.
+    """
+    args = get_args()
+    device_name = args.device_name
+    print(f"device_name: {device_name}")
+    alsa = sherpa_ncnn.Alsa(device_name)
+
+    recognizer = create_recognizer()
+    print("Started! Please speak")
+    # NOTE(review): the samples read from ALSA are passed on as if they were
+    # captured at the recognizer's sample rate - confirm that Alsa delivers
+    # audio at this rate.
+    sample_rate = recognizer.sample_rate
+    samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms
+    last_result = ""
+    segment_id = 0
+
+    while True:
+        samples = alsa.read(samples_per_read)  # a blocking read
+        recognizer.accept_waveform(sample_rate, samples)
+
+        is_endpoint = recognizer.is_endpoint
+
+        result = recognizer.text
+        if result and (last_result != result):
+            last_result = result
+            # "\r" returns to the start of the line so the partial result
+            # overwrites the previous one instead of scrolling.
+            print(f"\r{segment_id}:{result}", end="", flush=True)
+
+        if is_endpoint:
+            if result:
+                # Terminate the line so the finished segment stays visible.
+                print(f"\r{segment_id}:{result}", flush=True)
+                segment_id += 1
+            recognizer.reset()
+            # Forget the finished segment's text. Otherwise a new segment
+            # whose first partial result equals the previous segment's final
+            # text would be treated as "unchanged" and not be displayed.
+            last_result = ""
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except KeyboardInterrupt:
+        # main() loops forever, so Ctrl + C is the expected way to stop the
+        # script; print a short message instead of a traceback.
+        print("\nCaught Ctrl + C. Exiting")