Skip to content

Commit

Permalink
[text] add test unit parallel for bpe and whisper
Browse files Browse the repository at this point in the history
  • Loading branch information
Mddct committed Nov 28, 2023
1 parent ec2d838 commit c70192b
Showing 1 changed file with 43 additions and 0 deletions.
43 changes: 43 additions & 0 deletions test/wenet/text/test_parallel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from functools import partial
from multiprocessing import Pool
from wenet.text.base_tokenizer import BaseTokenizer

from wenet.text.bpe_tokenizer import BpeTokenizer
from wenet.text.whisper_tokenizer import WhisperTokenizer


def consistency(tokenizer: BaseTokenizer, line: str) -> str:
    """Round-trip ``line`` through ``tokenizer`` and return the decoded text.

    The element at index 1 of ``tokenizer.tokenize(line)`` is handed to
    ``tokenizer.detokenize``; the element at index 0 of that result is
    returned.  Kept as a module-level function so it can be pickled and
    dispatched to ``multiprocessing`` workers.
    """
    encoded = tokenizer.tokenize(line)
    decoded = tokenizer.detokenize(encoded[1])
    return decoded[0]


def test_whisper_tokenzier_parallel():
    """Check that a WhisperTokenizer round-trips text inside worker processes.

    Each input string is tokenized and detokenized (via ``consistency``) in a
    separate ``multiprocessing`` worker; the decoded text must equal the
    original input.  This also exercises that the tokenizer can be pickled
    and shipped to the workers.
    """
    inputs = ["it's ok", "wenet is simple", "test for new io"]
    # NOTE(review): the positional ``False`` is presumably the multilingual
    # flag -- confirm against WhisperTokenizer's signature.
    tokenizer = WhisperTokenizer(False)

    partial_tokenize = partial(consistency, tokenizer)
    # Use exactly one pool, owned by the ``with`` block, so the worker
    # processes are always torn down (an extra, never-closed pool would leak).
    with Pool(processes=len(inputs)) as pool:
        results = pool.map(partial_tokenize, inputs)

    # ``Pool.map`` returns results in input order, so compare the lists
    # directly.  A ``zip``-based check would silently stop at the shorter
    # list and pass on missing results.
    assert results == inputs


def test_bpe_tokenzier_parallel():
    """Check that a BpeTokenizer round-trips text inside worker processes.

    Each input string is tokenized and detokenized (via ``consistency``) in a
    separate ``multiprocessing`` worker; the decoded text must equal the
    original input.  The model and symbol-table paths are relative, so the
    test expects to be run from the repository root.
    """
    symbol_table_path = "test/resources/librispeech.words.txt"
    bpe_model = "test/resources/librispeech.train_960_unigram5000.bpemodel"

    inputs = ["WENR IS SIMPLE", "GOOD"]
    tokenizer = BpeTokenizer(bpe_model, symbol_table_path)
    partial_tokenize = partial(consistency, tokenizer)
    with Pool(processes=len(inputs)) as pool:
        results = pool.map(partial_tokenize, inputs)

    # ``Pool.map`` returns results in input order, so compare the lists
    # directly.  A ``zip``-based check would silently stop at the shorter
    # list and pass on missing results.
    assert results == inputs

0 comments on commit c70192b

Please sign in to comment.