ASR SEAME Recipe #1582

Open · wants to merge 5 commits into master
20 changes: 12 additions & 8 deletions egs/librispeech/ASR/zipformer/export.py
Collaborator:
Hi Amir, thank you for your contribution!

But I'm wondering whether you'd rather use a separate export.py than apply modifications to the one used by librispeech?

Contributor (Author):

Since both are based on the same zipformer architecture, I think they can use the same export script; the difference is only in the parameters. I could also create a separate one if needed.

@@ -29,13 +29,17 @@

(1) Export to torchscript model using torch.jit.script()

- For non-streaming model:

-./zipformer/export.py \
-  --exp-dir ./zipformer/exp \
-  --tokens data/lang_bpe_500/tokens.txt \
-  --epoch 30 \
-  --avg 9 \
+./zipformer_hat_seame/export.py \
+  --exp-dir ./zipformer_hat/exp \
+  --tokens data_seame/lang_bpe_4000/tokens.txt \
+  --epoch 20 \
+  --avg 5 \
+  --num-encoder-layers 2,2,2,2,2,2 \
+  --feedforward-dim 512,768,1024,1024,1024,768 \
+  --encoder-dim 192,256,256,256,256,256 \
+  --encoder-unmasked-dim 192,192,192,192,192,192 \
  --jit 1

It will generate a file `jit_script.pt` in the given `exp_dir`. You can later
@@ -234,7 +238,7 @@ def get_parser():
    parser.add_argument(
        "--tokens",
        type=str,
-        default="data/lang_bpe_500/tokens.txt",
+        default="data_libri/lang_bpe_500/tokens.txt",
        help="Path to the tokens.txt",
    )

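Aside on the docstring change above: once the export command has produced `jit_script.pt` in the chosen `--exp-dir`, it can be loaded back with `torch.jit.load`. A minimal sketch, with a hypothetical path that depends on your `--exp-dir`:

```
import torch

# Hypothetical location; jit_script.pt is written into whatever --exp-dir was used.
model = torch.jit.load("zipformer_hat/exp/jit_script.pt")
model.eval()
```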
23 changes: 23 additions & 0 deletions egs/seame/ASR/README.md
@@ -0,0 +1,23 @@
# Introduction

This recipe includes ASR models (zipformer, zipformer-hat, zipformer-hat-lid) trained and evaluated on the SEAME dataset.
The SEAME corpus contains Singaporean code-switched English and Mandarin speech.

The corpus comes with a predefined training split and two development splits:

- `train` -- a mix of code-switched, Mandarin, and Singaporean English speech
- `dev_sge` -- primarily Singaporean English, though it contains some code-switching
- `dev_man` -- primarily Mandarin, though it also contains some code-switching


[./RESULTS.md](./RESULTS.md) contains the latest results.

# Zipformer-hat

Zipformer trained with the hybrid autoregressive transducer (HAT) loss (<https://arxiv.org/abs/2003.07705>); see <https://github.com/k2-fsa/icefall/pull/1291>.
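
For orientation, a small PyTorch sketch of the HAT output distribution described in the paper above: the blank probability is modelled by a sigmoid gate, and the remaining mass is spread over the non-blank labels by a softmax. This is an illustrative restatement of the factorization, not the code used in this recipe.

```
import torch


def hat_log_probs(logits: torch.Tensor, blank_id: int = 0) -> torch.Tensor:
    """Turn joiner logits into HAT log-probabilities.

    p(blank)      = sigmoid(z_blank)
    p(y != blank) = (1 - p(blank)) * softmax(z_labels)[y]
    """
    blank_logit = logits[..., blank_id : blank_id + 1]
    label_logits = torch.cat(
        [logits[..., :blank_id], logits[..., blank_id + 1 :]], dim=-1
    )
    log_p_blank = torch.nn.functional.logsigmoid(blank_logit)
    log_p_not_blank = torch.nn.functional.logsigmoid(-blank_logit)  # log(1 - p(blank))
    log_p_labels = log_p_not_blank + torch.log_softmax(label_logits, dim=-1)
    # Put the blank back at index `blank_id`.
    return torch.cat(
        [log_p_labels[..., :blank_id], log_p_blank, log_p_labels[..., blank_id:]],
        dim=-1,
    )
```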

# Zipformer-hat-lid

Zipformer-hat with an auxiliary LID encoder and blank sharing for synchronization between ASR and LID, as described here (reference will be shared soon).
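
Since the reference is not public yet, the following is only one possible reading of "blank sharing", written down for illustration and not taken from this PR: the LID stream borrows the ASR blank probability, so both transducers emit blank on exactly the same frames.

```
import torch


def lid_log_probs_with_shared_blank(
    asr_log_probs: torch.Tensor,  # (..., vocab) HAT log-probs, blank assumed at index 0
    lid_logits: torch.Tensor,  # (..., num_lids) raw LID joiner logits
) -> torch.Tensor:
    """Hypothetical sketch: gate the LID labels with the ASR blank probability."""
    log_p_blank = asr_log_probs[..., :1]  # borrowed from the ASR head
    p_blank = log_p_blank.exp().clamp(max=1.0 - 1e-6)
    log_p_not_blank = torch.log1p(-p_blank)
    lid_log_probs = log_p_not_blank + torch.log_softmax(lid_logits, dim=-1)
    return torch.cat([log_p_blank, lid_log_probs], dim=-1)
```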

168 changes: 168 additions & 0 deletions egs/seame/ASR/RESULTS.md
@@ -0,0 +1,168 @@
## Results

### Zipformer

| decoding method      | dev   | test  | comment                                  |
|----------------------|-------|-------|------------------------------------------|
| modified beam search | 21.87 | 29.04 | --epoch 25, --avg 5, --max-duration 500  |
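
The `--avg 5` in the comment column refers to icefall's checkpoint averaging: the weights of the last few epoch checkpoints are averaged before decoding. A rough sketch of plain parameter averaging is shown below; the real logic (including the `--use-averaged-model` variant used in the decoding commands) lives in icefall itself, and the checkpoint paths and the `"model"` key here are assumptions based on the usual icefall checkpoint layout.

```
import torch


def average_checkpoints(paths):
    """Naive parameter averaging over several epoch checkpoints."""
    avg = None
    for p in paths:
        # Assumption: each checkpoint stores its weights under a "model" key.
        state = torch.load(p, map_location="cpu")["model"]
        if avg is None:
            avg = {k: v.clone().float() for k, v in state.items()}
        else:
            for k in avg:
                avg[k] += state[k].float()
    return {k: v / len(paths) for k, v in avg.items()}


# e.g. --epoch 25 --avg 5 corresponds roughly to averaging epochs 21..25:
# averaged = average_checkpoints(
#     [f"zipformer/exp-asr-seame/epoch-{i}.pt" for i in range(21, 26)]
# )
```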

The training command:

```
export CUDA_VISIBLE_DEVICES="0,1,2,3"

./zipformer/train.py \
--world-size 4 \
--num-epochs 25 \
--start-epoch 1 \
--use-fp16 1 \
--exp-dir zipformer/exp-asr-seame \
--causal 0 \
--num-encoder-layers 2,2,2,2,2,2 \
--feedforward-dim 512,768,1024,1024,1024,768 \
--encoder-dim 192,256,256,256,256,256 \
--encoder-unmasked-dim 192,192,192,192,192,192 \
--prune-range 10 \
--max-duration 500
```

The decoding command:

```
./zipformer/decode.py \
--epoch 25 \
--avg 5 \
--beam-size 10 \
--exp-dir ./zipformer/exp-asr-seame \
--max-duration 800 \
--num-encoder-layers 2,2,2,2,2,2 \
--feedforward-dim 512,768,1024,1024,1024,768 \
--encoder-dim 192,256,256,256,256,256 \
--encoder-unmasked-dim 192,192,192,192,192,192 \
--decoding-method modified_beam_search
```

The pretrained model is available at: <https://huggingface.co/AmirHussein/zipformer-seame>
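
To try the checkpoint without retraining, it can be pulled from the Hugging Face Hub, for example with `huggingface_hub`; the file layout inside the repo is not spelled out here, so check the model card before wiring it into `decode.py`:

```
from huggingface_hub import snapshot_download

# Downloads the repo into the local HF cache and returns the directory path.
model_dir = snapshot_download(repo_id="AmirHussein/zipformer-seame")
print(model_dir)
```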


### Zipformer-HAT

| decoding method      | dev   | test  | comment                                  |
|----------------------|-------|-------|------------------------------------------|
| modified beam search | 22.00 | 29.92 | --epoch 20, --avg 5, --max-duration 500  |


The training command to reproduce this result is given below:

```
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5"

./zipformer_hat/train.py \
--world-size 4 \
--num-epochs 25 \
--start-epoch 1 \
--base-lr 0.045 \
--lr-epochs 6 \
--use-fp16 1 \
--exp-dir zipformer_hat/exp \
--causal 0 \
--num-encoder-layers 2,2,2,2,2,2 \
--feedforward-dim 512,768,1024,1024,1024,768 \
--encoder-dim 192,256,256,256,256,256 \
--encoder-unmasked-dim 192,192,192,192,192,192 \
--prune-range 10 \
--max-duration 500
```

The decoding command is:
```
## modified beam search
./zipformer_hat/decode.py \
--epoch 25 --avg 5 --use-averaged-model True \
--beam-size 10 \
--causal 0 \
--exp-dir zipformer_hat/exp \
--bpe-model data_seame/lang_bpe_4000/bpe.model \
--max-duration 1000 \
--num-encoder-layers 2,2,2,2,2,2 \
--feedforward-dim 512,768,1024,1024,1024,768 \
--encoder-dim 192,256,256,256,256,256 \
--encoder-unmasked-dim 192,192,192,192,192,192 \
--decoding-method modified_beam_search
```

A pre-trained model and decoding logs can be found at <https://huggingface.co/AmirHussein/zipformer-hat-seame>

### Zipformer-HAT-LID

| decoding method      | dev   | test  | comment                                  |
|----------------------|-------|-------|------------------------------------------|
| modified beam search | 20.04 | 26.91 | --epoch 15, --avg 5, --max-duration 500  |

The training command to reproduce this result is given below:

```
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5"

./zipformer_hat_lid/train.py \
--world-size 4 \
--lid True \
--num-epochs 25 \
--start-epoch 1 \
--base-lr 0.045 \
--use-fp16 1 \
--lid-loss-scale 0.3 \
--exp-dir zipformer_hat_lid/exp \
--causal 0 \
--lid-output-layer 3 \
--num-encoder-layers 2,2,2,2,2,2 \
--feedforward-dim 512,768,1024,1024,1024,768 \
--encoder-dim 192,256,256,256,256,256 \
--encoder-unmasked-dim 192,192,192,192,192,192 \
--lids "<en>,<zh>" \
--prune-range 10 \
--freeze-main-model False \
--use-lid-encoder True \
--use-lid-joiner True \
--lid-num-encoder-layers 2,2,2 \
--lid-downsampling-factor 2,4,2 \
--lid-feedforward-dim 256,256,256 \
--lid-num-heads 4,4,4 \
--lid-encoder-dim 256,256,256 \
--lid-encoder-unmasked-dim 128,128,128 \
--lid-cnn-module-kernel 31,15,31 \
--max-duration 500

```

The decoding command is:
```
## modified beam search
python zipformer_hat_lid/decode.py \
--epoch $epoch --avg 5 --use-averaged-model True \
--beam-size 10 \
--lid False \
--lids "<en>,<zh>" \
--exp-dir zipformer_hat_lid/exp \
--bpe-model data_seame/lang_bpe_4000/bpe.model \
--max-duration 800 \
--num-encoder-layers 2,2,2,2,2,2 \
--feedforward-dim 512,768,1024,1024,1024,768 \
--encoder-dim 192,256,256,256,256,256 \
--encoder-unmasked-dim 192,192,192,192,192,192 \
--decoding-method modified_beam_search \
--lid-output-layer 3 \
--use-lid-encoder True \
--use-lid-joiner True \
--lid-num-encoder-layers 2,2,2 \
--lid-downsampling-factor 2,4,2 \
--lid-feedforward-dim 256,256,256 \
--lid-num-heads 4,4,4 \
--lid-encoder-dim 256,256,256 \
--lid-encoder-unmasked-dim 128,128,128 \
--lid-cnn-module-kernel 31,15,31
```

A pre-trained model and decoding logs can be found at <https://huggingface.co/AmirHussein/zipformer-hat-lid-seame>


56 changes: 56 additions & 0 deletions egs/seame/ASR/local/cer.py
@@ -0,0 +1,56 @@
#!/usr/bin/env python3
# Johns Hopkins University (authors: Amir Hussein)


"""
This file cer from icefall decoded "recogs" file:
id [ref] xxx
id [hyp] yxy
"""

import argparse

import jiwer


def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--dec-file", type=str, help="Decoded icefall recogs file")

return parser


def cer_(file):
    """Parse the recogs file and print the length-weighted (corpus-level) CER."""
    hyp = []
    ref = []
    cer_results = 0
    ref_lens = 0
    with open(file, "r", encoding="utf-8") as dec:
        for line in dec:
            utt_id, target = line.split("\t", 1)
            utt_id = utt_id[0:-2]  # drop the last two characters of the id (unused below)
            target, txt = target.split("=", 1)
            # txt looks like "['word1', 'word2', ...]"; recover the plain text.
            words = txt.strip().strip("[]").split(", ")
            word_list = [word.strip("'") for word in words]
            if target == "ref":
                ref.append(" ".join(word_list))
            elif target == "hyp":
                hyp.append(" ".join(word_list))
    # Weight each utterance's CER by its reference length so the result
    # is a corpus-level CER rather than a mean of per-utterance CERs.
    for h, r in zip(hyp, ref):
        if r:
            cer_results += jiwer.cer(r, h) * len(r)
            ref_lens += len(r)
    print(cer_results / ref_lens)


def main():
    parser = get_args()
    args = parser.parse_args()
    cer_(args.dec_file)


if __name__ == "__main__":
main()
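
Usage note: the script takes one of the `recogs-*.txt` files written by `decode.py`, e.g. (hypothetical path) `python local/cer.py --dec-file zipformer/exp-asr-seame/modified_beam_search/recogs-dev-epoch-25-avg-5.txt`, and prints a single number: the reference-length-weighted (corpus-level) CER.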