From 10bd5d6ba70048e0557c38794e4eba28e1025748 Mon Sep 17 00:00:00 2001 From: Dmitrii Mukhutdinov Date: Thu, 7 Dec 2023 09:56:06 +0000 Subject: [PATCH] Use `attacut` module for Thai word tokenization --- lhotse/workflows/forced_alignment/mms_aligner.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/lhotse/workflows/forced_alignment/mms_aligner.py b/lhotse/workflows/forced_alignment/mms_aligner.py index 4375e1a5c..53330e89f 100644 --- a/lhotse/workflows/forced_alignment/mms_aligner.py +++ b/lhotse/workflows/forced_alignment/mms_aligner.py @@ -143,19 +143,15 @@ def _word_tokenize(text: str, language: Optional[str] = None) -> List[str]: return kss.split_morphemes(text, return_pos=False) elif language == "th": - # `pythainlp` is alive and much better, but it is a huge package bloated with dependencies - if not is_module_available("tltk"): + if not is_module_available("attacut"): raise ImportError( - "MMSForcedAligner requires the 'tltk' module to be installed to align Thai text." - "Please install it with 'pip install tltk'." + "MMSForcedAligner requires the 'attacut' module to be installed to align Thai text." + "Please install it with 'pip install attacut'." ) - from tltk import nlp + import attacut - pieces = nlp.pos_tag(text) - return [ - word if word != "" else " " for piece in pieces for word, _ in piece - ] + return attacut.tokenize(text) elif language == "my": if not is_module_available("pyidaungsu"):