fix(whisper): support arbitrary ctc blank id (#2157)

wenet-e2e · Nov 24, 2023 · 8f7a8f3 · 8f7a8f3
1 parent e148526
commit 8f7a8f3
Show file tree

Hide file tree

Showing 4 changed files with 19 additions and 2 deletions.
diff --git a/wenet/transformer/ctc.py b/wenet/transformer/ctc.py
@@ -25,21 +25,23 @@ def __init__(
         encoder_output_size: int,
         dropout_rate: float = 0.0,
         reduce: bool = True,
+        blank_id: int = 0,
     ):
         """ Construct CTC module
         Args:
             odim: dimension of outputs
             encoder_output_size: number of encoder projection units
             dropout_rate: dropout rate (0.0 ~ 1.0)
             reduce: reduce the CTC loss into a scalar
+            blank_id: index of the CTC blank label, passed to CTCLoss (default 0)
         """
         super().__init__()
         eprojs = encoder_output_size
         self.dropout_rate = dropout_rate
         self.ctc_lo = torch.nn.Linear(eprojs, odim)
 
         reduction_type = "sum" if reduce else "none"
-        self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type)
+        self.ctc_loss = torch.nn.CTCLoss(blank=blank_id, reduction=reduction_type)
 
     def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor,
                 ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor:

diff --git a/wenet/utils/init_model.py b/wenet/utils/init_model.py
@@ -99,7 +99,8 @@ def init_model(args, configs):
         assert configs['decoder_conf']['r_num_blocks'] > 0
         decoder = BiTransformerDecoder(vocab_size, encoder.output_size(),
                                        **configs['decoder_conf'])
-    ctc = CTC(vocab_size, encoder.output_size())
+    ctc = CTC(vocab_size, encoder.output_size(),
+              blank_id=configs['ctc_conf']['ctc_blank_id'])
 
     # Init joint CTC/Attention or Transducer model
     if 'predictor' in configs:

diff --git a/wenet/utils/train_utils.py b/wenet/utils/train_utils.py
@@ -216,6 +216,17 @@ def check_modify_and_save_config(args, configs):
     symbol_table = read_symbol_table(args.symbol_table)
     vocab_size = len(symbol_table)
 
+    if 'ctc_conf' not in configs:
+        configs['ctc_conf'] = {}
+
+    if '<blank>' in symbol_table:
+        if 'ctc_blank_id' in configs['ctc_conf']:
+            assert configs['ctc_conf']['ctc_blank_id'] == symbol_table['<blank>']
+        else:
+            configs['ctc_conf']['ctc_blank_id'] = symbol_table['<blank>']
+    else:
+        assert 'ctc_blank_id' in configs['ctc_conf'], "PLZ set ctc_blank_id in yaml"
+
     configs['input_dim'] = input_dim
     configs['output_dim'] = configs.get('output_dim', vocab_size)
     configs['cmvn_file'] = args.cmvn

diff --git a/wenet/whisper/convert_whisper_to_wenet_config_and_ckpt.py b/wenet/whisper/convert_whisper_to_wenet_config_and_ckpt.py
@@ -89,6 +89,9 @@ def convert_to_wenet_yaml(tokenizer, dims, wenet_yaml_path: str):
     configs['decoder_conf']['key_bias'] = False
     configs['decoder_conf']['activation_type'] = "gelu"
 
+    configs['ctc_conf'] = {}
+    configs['ctc_conf']['ctc_blank_id'] = 50362  # <nospeech>
+
     configs['model_conf'] = {}
     configs['model_conf']['ctc_weight'] = 0.3
     configs['model_conf']['lsm_weight'] = 0.1