[refactor] use model_class to unify module init #2216

Merged 9 commits on Dec 10, 2023
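This refactor replaces per-model switches in the recipe YAMLs with a single `model:` key (plus a `joint:` key for transducer joint networks), so model construction can be dispatched from one place in `wenet/utils/init_model.py`. Below is a minimal sketch of that kind of dispatch; `MODEL_REGISTRY` and `build_model` are illustrative names for this note, not the PR's actual code.

```python
# Minimal sketch of dispatching on the new 'model' config key.
# MODEL_REGISTRY and build_model are hypothetical names used only for illustration.
from typing import Any, Dict

MODEL_REGISTRY: Dict[str, type] = {
    # 'asr_model':  ASRModel,     # default hybrid CTC/attention
    # 'transducer': Transducer,   # hybrid transducer + CTC + attention
    # 'ctl_model':  CTLModel,
    # 'k2_model':   K2Model,      # LF-MMI training with k2
}


def build_model(configs: Dict[str, Any], **modules):
    """Pick the model class from configs['model'] instead of per-model flags."""
    model_type = configs.get('model', 'asr_model')
    if model_type not in MODEL_REGISTRY:
        raise ValueError(f'unknown model type: {model_type}')
    # Hand over the already-built submodules (encoder, decoder, joint, ...)
    # together with the model-specific options from model_conf.
    return MODEL_REGISTRY[model_type](**modules, **configs.get('model_conf', {}))
```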
3 changes: 2 additions & 1 deletion examples/aishell/rnnt/conf/conformer_rnnt.yaml
@@ -17,7 +17,7 @@ encoder_conf:
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'


joint: transducer_joint
joint_conf:
    enc_output_size: 256
    pred_output_size: 256
@@ -50,6 +50,7 @@ decoder_conf:
    src_attention_dropout_rate: 0.1

# hybrid transducer+ctc+attention
model: transducer
model_conf:
    transducer_weight: 0.75
    ctc_weight: 0.1
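For reference, the snippet below shows what the new keys parse to once such a config is loaded; the inline YAML is a cut-down stand-in for the file above, not the full recipe.

```python
# Sketch: parse a trimmed transducer config and inspect the keys this PR adds.
import yaml

CONFIG_TEXT = """
joint: transducer_joint
joint_conf:
    enc_output_size: 256
    pred_output_size: 256

# hybrid transducer+ctc+attention
model: transducer
model_conf:
    transducer_weight: 0.75
    ctc_weight: 0.1
"""

config = yaml.safe_load(CONFIG_TEXT)
assert config['model'] == 'transducer'
assert config['joint'] == 'transducer_joint'
print(config['model_conf'])  # {'transducer_weight': 0.75, 'ctc_weight': 0.1}
```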
3 changes: 2 additions & 1 deletion examples/aishell/rnnt/conf/conformer_u2pp_rnnt.yaml
@@ -21,7 +21,7 @@ encoder_conf:
    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
    use_dynamic_left_chunk: false


joint: transducer_joint
joint_conf:
    enc_output_size: 256
    pred_output_size: 256
@@ -54,6 +54,7 @@ decoder_conf:
    src_attention_dropout_rate: 0.1

# hybrid transducer+ctc+attention
model: transducer
model_conf:
    transducer_weight: 0.75
    ctc_weight: 0.1
3 changes: 2 additions & 1 deletion examples/aishell/rnnt/conf/example_embedding_predictor.yaml
@@ -15,7 +15,7 @@ encoder_conf:
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'


joint: transducer_joint
joint_conf:
    enc_output_size: 256
    pred_output_size: 320
@@ -46,6 +46,7 @@ decoder_conf:
    src_attention_dropout_rate: 0.1

# hybrid transducer+ctc+attention
model: transducer
model_conf:
    transducer_weight: 0.4
    ctc_weight: 0.2
4 changes: 2 additions & 2 deletions examples/aishell/s0/conf/train_unified_conformer_ctl.yaml
@@ -1,6 +1,6 @@
# network architecture
# encoder related
-encoder: conformer
+encoder: dual_conformer
encoder_conf:
    output_size: 256 # dimension of attention
    attention_heads: 4
@@ -32,8 +32,8 @@ decoder_conf:
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

-ctlmodel: true
# hybrid CTC/attention
+model: ctl_model
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1 # label smoothing option
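This recipe used to flag CTL training with a standalone `ctlmodel: true`; after the change the same information lives in `model: ctl_model`, alongside the switch to the `dual_conformer` encoder. A hypothetical migration helper, not part of the PR and shown only to make the mapping concrete, could rewrite an old config dict like this:

```python
# Hypothetical helper: map the removed 'ctlmodel: true' flag to the new keys.
from typing import Any, Dict


def migrate_ctl_config(config: Dict[str, Any]) -> Dict[str, Any]:
    """Rewrite a pre-refactor CTL config in place and return it."""
    if config.pop('ctlmodel', False):
        config['model'] = 'ctl_model'
        # The CTL recipe also switches the encoder to its dual variant.
        if config.get('encoder') == 'conformer':
            config['encoder'] = 'dual_conformer'
    return config


old = {'encoder': 'conformer', 'ctlmodel': True, 'model_conf': {'ctc_weight': 0.3}}
new = migrate_ctl_config(dict(old))
assert new == {'encoder': 'dual_conformer', 'model': 'ctl_model',
               'model_conf': {'ctc_weight': 0.3}}
```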
6 changes: 4 additions & 2 deletions examples/aishell/s0/run.sh
@@ -304,8 +304,10 @@ if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
  # 9.1 Build token level bigram fst for LF-MMI training
  tools/k2/prepare_mmi.sh data/train/ data/dev data/local/lfmmi

-  # 9.2 Run LF-MMI training from stage 4, with below new args
-  #     --lfmmi_dir data/local/lfmmi
+  # 9.2 Run LF-MMI training from stage 4, modifying the args below in train.yaml
+  #     model: k2_model
+  #     model_conf:
+  #       lfmmi_dir: data/local/lfmmi

  # 9.3 Run HLG decode from stage 8.2
fi
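Concretely, the comment now points at keys in train.yaml rather than extra command-line flags. A small sketch of patching an existing config for LF-MMI follows; the file paths are placeholders, and in practice the YAML may simply be edited by hand.

```python
# Sketch: turn an existing training config into a k2/LF-MMI one.
# The paths below are illustrative, not paths required by the recipe.
import yaml

with open('conf/train_conformer.yaml') as fin:
    config = yaml.safe_load(fin)

config['model'] = 'k2_model'
config.setdefault('model_conf', {})['lfmmi_dir'] = 'data/local/lfmmi'

with open('conf/train_conformer_lfmmi.yaml', 'w') as fout:
    yaml.safe_dump(config, fout)
```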
5 changes: 4 additions & 1 deletion examples/aishell2/rnnt/conf/conformer_rnnt.yaml
@@ -17,8 +17,10 @@ encoder_conf:
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'


joint: transducer_joint
joint_conf:
    enc_output_size: 256
    pred_output_size: 256
    join_dim: 512
    prejoin_linear: True
    postjoin_linear: false
@@ -48,6 +50,7 @@ decoder_conf:
    src_attention_dropout_rate: 0.1

# hybrid transducer+ctc+attention
model: transducer
model_conf:
    transducer_weight: 0.75
    ctc_weight: 0.1
5 changes: 4 additions & 1 deletion examples/aishell2/rnnt/conf/conformer_u2pp_rnnt.yaml
@@ -21,8 +21,10 @@ encoder_conf:
    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
    use_dynamic_left_chunk: false


joint: transducer_joint
joint_conf:
    enc_output_size: 256
    pred_output_size: 256
    join_dim: 512
    prejoin_linear: True
    postjoin_linear: false
@@ -52,6 +54,7 @@ decoder_conf:
    src_attention_dropout_rate: 0.1

# hybrid transducer+ctc+attention
model: transducer
model_conf:
    transducer_weight: 0.75
    ctc_weight: 0.1
5 changes: 4 additions & 1 deletion examples/librispeech/rnnt/conf/conformer_rnnt.yaml
@@ -17,8 +17,10 @@ encoder_conf:
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'


joint: transducer_joint
joint_conf:
    enc_output_size: 256
    pred_output_size: 256
    join_dim: 512
    prejoin_linear: True
    postjoin_linear: false
@@ -48,6 +50,7 @@ decoder_conf:
    src_attention_dropout_rate: 0.1

# hybrid transducer+ctc+attention
model: transducer
model_conf:
    transducer_weight: 0.75
    ctc_weight: 0.1
34 changes: 34 additions & 0 deletions test/wenet/utils/test_init_model.py
@@ -0,0 +1,34 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright [2023-12-10] <[email protected], Xingchen Song>

import glob
import yaml

from wenet.utils.init_model import init_model


class DummyArguments:
    jit = False
    enc_init = None
    checkpoint = None


def test_init_model():
    configs = glob.glob("examples/*/*/conf/*.yaml")
    args = DummyArguments()
    for c in configs:
        with open(c, 'r') as fin:
            config = yaml.load(fin, Loader=yaml.FullLoader)
        if 'fbank_conf' in config['dataset_conf']:
            input_dim = config['dataset_conf']['fbank_conf']['num_mel_bins']
        elif 'log_mel_spectrogram_conf' in config['dataset_conf']:
            input_dim = config['dataset_conf']['log_mel_spectrogram_conf'][
                'num_mel_bins']
        else:
            input_dim = config['dataset_conf']['mfcc_conf']['num_mel_bins']
        config['input_dim'] = input_dim
        # TODO(xcsong): fix vocab_size
        config['output_dim'] = 3000
        print("checking {} {}".format(c, config))
        init_model(args, config)
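Assuming the repository's tests are driven by pytest (the test layout suggests this, but it is an assumption here), the new smoke test can be run on its own from the repository root so that the `examples/*/*/conf/*.yaml` glob resolves:

```python
# Sketch: run only the new init_model smoke test.
# Assumes pytest is installed and the working directory is the repo root.
import pytest

if __name__ == '__main__':
    raise SystemExit(pytest.main(['-q', 'test/wenet/utils/test_init_model.py']))
```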
2 changes: 1 addition & 1 deletion test/wenet/whisper/test_whisper.py
@@ -361,7 +361,7 @@ def test_model(model, audio_path):

    # 6. Forward wenet.decoder
    wenet_tokens, _ = add_whisper_tokens(
-       configs['model_conf']['special_tokens'],
+       configs['tokenizer_conf']['special_tokens'],
        torch.tensor([dummy_tokens], dtype=torch.long),
        ignore_id=-1,
        task=task,
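The Whisper test now reads the special tokens from `tokenizer_conf` instead of `model_conf`. If code elsewhere still has to cope with configs written before this change, a guarded lookup along these lines would work; this is a sketch, not something the PR adds:

```python
# Sketch: prefer the new tokenizer_conf location, fall back to the old model_conf one.
from typing import Any, Dict, Optional


def get_special_tokens(configs: Dict[str, Any]) -> Optional[Dict[str, int]]:
    tok_conf = configs.get('tokenizer_conf', {})
    if 'special_tokens' in tok_conf:
        return tok_conf['special_tokens']
    return configs.get('model_conf', {}).get('special_tokens')


assert get_special_tokens({'tokenizer_conf': {'special_tokens': {'sot': 1}}}) == {'sot': 1}
assert get_special_tokens({'model_conf': {'special_tokens': {'sot': 1}}}) == {'sot': 1}
```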