From 10bd5d6ba70048e0557c38794e4eba28e1025748 Mon Sep 17 00:00:00 2001
From: Dmitrii Mukhutdinov <flyingleafe@gmail.com>
Date: Thu, 7 Dec 2023 09:56:06 +0000
Subject: [PATCH] Use `attacut` module for Thai word tokenization

---
 lhotse/workflows/forced_alignment/mms_aligner.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)
diff --git a/lhotse/workflows/forced_alignment/mms_aligner.py b/lhotse/workflows/forced_alignment/mms_aligner.py
index 4375e1a5c..53330e89f 100644
--- a/lhotse/workflows/forced_alignment/mms_aligner.py
+++ b/lhotse/workflows/forced_alignment/mms_aligner.py
@@ -143,19 +143,15 @@ def _word_tokenize(text: str, language: Optional[str] = None) -> List[str]:
         return kss.split_morphemes(text, return_pos=False)
 
     elif language == "th":
-        # `pythainlp` is alive and much better, but it is a huge package bloated with dependencies
-        if not is_module_available("tltk"):
+        if not is_module_available("attacut"):
             raise ImportError(
-                "MMSForcedAligner requires the 'tltk' module to be installed to align Thai text."
-                "Please install it with 'pip install tltk'."
+                "MMSForcedAligner requires the 'attacut' module to be installed to align Thai text."
+                "Please install it with 'pip install attacut'."
             )
 
-        from tltk import nlp
+        import attacut
 
-        pieces = nlp.pos_tag(text)
-        return [
-            word if word != "<s/>" else " " for piece in pieces for word, _ in piece
-        ]
+        return attacut.tokenize(text)
 
     elif language == "my":
         if not is_module_available("pyidaungsu"):