From 7fbccdd1df52c606704332c6a78731b4d51291eb Mon Sep 17 00:00:00 2001 From: Michael McAuliffe Date: Wed, 30 Aug 2023 01:27:16 -0700 Subject: [PATCH] Further refactoring and fixes for anchor (#684) * Further refactoring and fixes for anchor * Update segmenting of transcripts --- ci/docker_environment.yaml | 1 + docs/source/changelog/changelog_3.0.rst | 18 +- docs/source/reference/dictionary/helper.rst | 11 - docs/source/reference/segmentation/helper.rst | 5 + docs/source/reference/segmentation/main.rst | 3 +- docs/source/reference/tokenization/helper.rst | 12 + .../reference/tokenization/tokenizer.rst | 10 + .../corpus_creation/create_segments.rst | 38 +- environment.yml | 1 + montreal_forced_aligner/abc.py | 1 + .../acoustic_modeling/lda.py | 1 - montreal_forced_aligner/alignment/adapting.py | 10 - montreal_forced_aligner/alignment/mixins.py | 5 - .../command_line/align_one.py | 28 +- .../command_line/create_segments.py | 93 +++- montreal_forced_aligner/command_line/mfa.py | 6 +- .../command_line/train_acoustic_model.py | 8 + .../corpus/acoustic_corpus.py | 10 - montreal_forced_aligner/corpus/base.py | 42 +- montreal_forced_aligner/corpus/features.py | 5 +- .../corpus/multiprocessing.py | 88 +-- montreal_forced_aligner/data.py | 32 ++ montreal_forced_aligner/db.py | 75 +-- .../diarization/speaker_diarizer.py | 45 +- montreal_forced_aligner/dictionary/mixins.py | 44 +- .../dictionary/multispeaker.py | 44 +- montreal_forced_aligner/exceptions.py | 12 + montreal_forced_aligner/models.py | 17 + .../tokenization/english.py | 524 ++++++++++++++++++ .../tokenization/japanese.py | 77 +++ montreal_forced_aligner/tokenization/spacy.py | 61 ++ .../transcription/multiprocessing.py | 152 ++--- .../transcription/transcriber.py | 47 +- .../vad/multiprocessing.py | 494 ++++++++++------- montreal_forced_aligner/vad/segmenter.py | 165 +++++- tests/test_commandline_create_segments.py | 78 ++- tests/test_commandline_train.py | 3 + tests/test_segmentation.py | 29 + 38 files changed, 1696 insertions(+), 599 deletions(-) create mode 100644 montreal_forced_aligner/tokenization/english.py create mode 100644 montreal_forced_aligner/tokenization/japanese.py create mode 100644 montreal_forced_aligner/tokenization/spacy.py create mode 100644 tests/test_segmentation.py diff --git a/ci/docker_environment.yaml b/ci/docker_environment.yaml index d57ef5c0..6076ed76 100644 --- a/ci/docker_environment.yaml +++ b/ci/docker_environment.yaml @@ -39,5 +39,6 @@ dependencies: - rich - rich-click - kalpy + - spacy[ja] - pip: - speechbrain diff --git a/docs/source/changelog/changelog_3.0.rst b/docs/source/changelog/changelog_3.0.rst index d9958a4e..982875a4 100644 --- a/docs/source/changelog/changelog_3.0.rst +++ b/docs/source/changelog/changelog_3.0.rst @@ -5,7 +5,23 @@ 3.0 Changelog ************* -3.0.0a +3.0.0a4 +======= + +- Separate out segmentation functionality into :ref:`create_segments` and :ref:`create_segments_vad` +- Fix a bug in :ref:`align_one` when specifying a ``config_path`` + +3.0.0a3 +======= + +- Refactor tokenization for future spacy use + +3.0.0a2 +======= + +- Revamped how configuration is done following change to using threading instead of multiprocessing + +3.0.0a1 ====== - Add dependency on :xref:`kalpy` for interacting for Kaldi diff --git a/docs/source/reference/dictionary/helper.rst b/docs/source/reference/dictionary/helper.rst index 1f778fde..95e0f362 100644 --- a/docs/source/reference/dictionary/helper.rst +++ b/docs/source/reference/dictionary/helper.rst @@ -30,17 +30,6 @@ Mixins 
MultispeakerDictionaryMixin -Helper ------- - -.. currentmodule:: montreal_forced_aligner.dictionary.mixins - -.. autosummary:: - :toctree: generated/ - - SanitizeFunction - SplitWordsFunction - Pronunciation probability functionality ======================================= diff --git a/docs/source/reference/segmentation/helper.rst b/docs/source/reference/segmentation/helper.rst index 97d6e34c..6dda916d 100644 --- a/docs/source/reference/segmentation/helper.rst +++ b/docs/source/reference/segmentation/helper.rst @@ -7,7 +7,12 @@ Helper functions .. autosummary:: :toctree: generated/ + SegmentVadFunction + SegmentVadArguments SegmentVadFunction SegmentVadArguments get_initial_segmentation merge_segments + segment_utterance_transcript + segment_utterance_vad + segment_utterance_vad_speech_brain diff --git a/docs/source/reference/segmentation/main.rst b/docs/source/reference/segmentation/main.rst index ed15a3b3..cbde889a 100644 --- a/docs/source/reference/segmentation/main.rst +++ b/docs/source/reference/segmentation/main.rst @@ -7,4 +7,5 @@ Segmenter .. autosummary:: :toctree: generated/ - Segmenter + VadSegmenter + TranscriptionSegmenter diff --git a/docs/source/reference/tokenization/helper.rst b/docs/source/reference/tokenization/helper.rst index e560aad6..b9d9c6b7 100644 --- a/docs/source/reference/tokenization/helper.rst +++ b/docs/source/reference/tokenization/helper.rst @@ -12,3 +12,15 @@ Helper TokenizerRewriter TokenizerArguments TokenizerFunction + + +Helper +------ + +.. currentmodule:: montreal_forced_aligner.tokenization.simple + +.. autosummary:: + :toctree: generated/ + + SanitizeFunction + SplitWordsFunction diff --git a/docs/source/reference/tokenization/tokenizer.rst b/docs/source/reference/tokenization/tokenizer.rst index 6088a838..88d4b54a 100644 --- a/docs/source/reference/tokenization/tokenizer.rst +++ b/docs/source/reference/tokenization/tokenizer.rst @@ -11,3 +11,13 @@ Corpus tokenizer CorpusTokenizer TokenizerValidator + +Simple tokenizer +================ + +.. currentmodule:: montreal_forced_aligner.tokenization.simple + +.. autosummary:: + :toctree: generated/ + + SimpleTokenizer diff --git a/docs/source/user_guide/corpus_creation/create_segments.rst b/docs/source/user_guide/corpus_creation/create_segments.rst index 292fe3e0..5c4111ec 100644 --- a/docs/source/user_guide/corpus_creation/create_segments.rst +++ b/docs/source/user_guide/corpus_creation/create_segments.rst @@ -1,11 +1,11 @@ .. _create_segments: -Create segments ``(mfa create_segments)`` -========================================= +Segment transcribed files ``(mfa segment)`` +=========================================== The Montreal Forced Aligner can use Voice Activity Detection (VAD) capabilities from :xref:`speechbrain` to generate segments from -a longer sound file. +a longer sound file, while attempting to segment transcripts as well. If you do not have transcripts, see :ref:`create_segments_vad`. .. note:: @@ -15,7 +15,37 @@ Command reference ----------------- .. click:: montreal_forced_aligner.command_line.create_segments:create_segments_cli - :prog: mfa create_segments + :prog: mfa segment + :nested: full + + +Configuration reference +----------------------- + +- :ref:`configuration_segmentation` + +API reference +------------- + +- :ref:`segmentation_api` + +.. 
_create_segments_vad: + +Segment untranscribed files ``(mfa segment_vad)`` +================================================= + +The Montreal Forced Aligner can use Voice Activity Detection (VAD) capabilities from :xref:`speechbrain` or energy based VAD to generate segments from +a longer sound file. This command does not split transcripts, instead assigning a default label of "speech" to all identified speech segments. If you would like to preserve transcripts for each segment, see :ref:`create_segments`. + +.. note:: + + On Windows, if you get an ``OSError/WinError 1314`` during the run, follow `these instructions `_ to enable symbolic link creation permissions. + +Command reference +----------------- + +.. click:: montreal_forced_aligner.command_line.create_segments:create_segments_vad_cli + :prog: mfa segment_vad :nested: full diff --git a/environment.yml b/environment.yml index 01508828..58dd0dc9 100644 --- a/environment.yml +++ b/environment.yml @@ -48,6 +48,7 @@ dependencies: - rich - rich-click - kalpy + - spacy - pip: - build - twine diff --git a/montreal_forced_aligner/abc.py b/montreal_forced_aligner/abc.py index 0abc5a6f..a1be1dfb 100644 --- a/montreal_forced_aligner/abc.py +++ b/montreal_forced_aligner/abc.py @@ -282,6 +282,7 @@ def initialize_database(self) -> None: conn.execute(sqlalchemy.text("CREATE EXTENSION IF NOT EXISTS vector")) conn.execute(sqlalchemy.text("CREATE EXTENSION IF NOT EXISTS pg_trgm")) conn.execute(sqlalchemy.text("CREATE EXTENSION IF NOT EXISTS pg_stat_statements")) + conn.execute(sqlalchemy.text(f"select setseed({config.SEED/32768})")) conn.commit() MfaSqlBase.metadata.create_all(self.db_engine) diff --git a/montreal_forced_aligner/acoustic_modeling/lda.py b/montreal_forced_aligner/acoustic_modeling/lda.py index e228f69d..d8af85d2 100644 --- a/montreal_forced_aligner/acoustic_modeling/lda.py +++ b/montreal_forced_aligner/acoustic_modeling/lda.py @@ -305,7 +305,6 @@ def lda_options(self) -> MetaDict: return { "lda_dimension": self.lda_dimension, "random_prune": self.random_prune, - "silence_csl": self.silence_csl, "splice_left_context": self.splice_left_context, "splice_right_context": self.splice_right_context, } diff --git a/montreal_forced_aligner/alignment/adapting.py b/montreal_forced_aligner/alignment/adapting.py index 2ee02b82..72fe4f44 100644 --- a/montreal_forced_aligner/alignment/adapting.py +++ b/montreal_forced_aligner/alignment/adapting.py @@ -78,16 +78,6 @@ def map_acc_stats_arguments(self, alignment=False) -> List[AccStatsArguments]: model_path = self.model_path arguments = [] for j in self.jobs: - feat_strings = {} - for d_id in j.dictionary_ids: - feat_strings[d_id] = j.construct_feature_proc_string( - self.working_directory, - d_id, - self.feature_options["uses_splices"], - self.feature_options["splice_left_context"], - self.feature_options["splice_right_context"], - self.feature_options["uses_speaker_adaptation"], - ) arguments.append( AccStatsArguments( j.id, diff --git a/montreal_forced_aligner/alignment/mixins.py b/montreal_forced_aligner/alignment/mixins.py index 46ded980..be0b31fa 100644 --- a/montreal_forced_aligner/alignment/mixins.py +++ b/montreal_forced_aligner/alignment/mixins.py @@ -107,11 +107,6 @@ def data_directory(self) -> str: """Corpus data directory""" ... - @abstractmethod - def construct_feature_proc_strings(self) -> typing.List[typing.Dict[str, str]]: - """Generate feature strings""" - ... 
- def compile_train_graphs_arguments(self) -> typing.List[CompileTrainGraphsArguments]: """ Generate Job arguments for :class:`~montreal_forced_aligner.alignment.multiprocessing.CompileTrainGraphsFunction` diff --git a/montreal_forced_aligner/command_line/align_one.py b/montreal_forced_aligner/command_line/align_one.py index 4200b990..a79f156c 100644 --- a/montreal_forced_aligner/command_line/align_one.py +++ b/montreal_forced_aligner/command_line/align_one.py @@ -18,8 +18,17 @@ validate_dictionary, ) from montreal_forced_aligner.corpus.classes import FileData +from montreal_forced_aligner.data import BRACKETED_WORD, CUTOFF_WORD, LAUGHTER_WORD, OOV_WORD +from montreal_forced_aligner.dictionary.mixins import ( + DEFAULT_BRACKETS, + DEFAULT_CLITIC_MARKERS, + DEFAULT_COMPOUND_MARKERS, + DEFAULT_PUNCTUATION, + DEFAULT_WORD_BREAK_MARKERS, +) from montreal_forced_aligner.models import AcousticModel from montreal_forced_aligner.online.alignment import align_utterance_online +from montreal_forced_aligner.tokenization.simple import SimpleTokenizer __all__ = ["align_one_cli"] @@ -74,6 +83,18 @@ def align_one_cli(context, **kwargs) -> None: output_path: Path = kwargs["output_path"] output_format = kwargs["output_format"] c = PretrainedAligner.parse_parameters(config_path, context.params, context.args) + tokenizer = SimpleTokenizer( + word_break_markers=c.get("word_break_markers", DEFAULT_WORD_BREAK_MARKERS), + punctuation=c.get("punctuation", DEFAULT_PUNCTUATION), + clitic_markers=c.get("clitic_markers", DEFAULT_CLITIC_MARKERS), + compound_markers=c.get("compound_markers", DEFAULT_COMPOUND_MARKERS), + brackets=c.get("brackets", DEFAULT_BRACKETS), + laughter_word=c.get("laughter_word", LAUGHTER_WORD), + oov_word=c.get("oov_word", OOV_WORD), + bracketed_word=c.get("bracketed_word", BRACKETED_WORD), + cutoff_word=c.get("cutoff_word", CUTOFF_WORD), + ignore_case=c.get("ignore_case", True), + ) acoustic_model = AcousticModel(acoustic_model_path) extracted_models_dir = config.TEMPORARY_DIRECTORY.joinpath("extracted_models", "dictionary") @@ -95,7 +116,7 @@ def align_one_cli(context, **kwargs) -> None: l_align_fst_path = dictionary_directory.joinpath("L_align.fst") words_path = dictionary_directory.joinpath("words.txt") phones_path = dictionary_directory.joinpath("phones.txt") - if l_fst_path.exists(): + if l_fst_path.exists() and not config.CLEAN: lexicon_compiler.load_l_from_file(l_fst_path) lexicon_compiler.load_l_align_from_file(l_align_fst_path) lexicon_compiler.word_table = pywrapfst.SymbolTable.read_text(words_path) @@ -114,13 +135,14 @@ def align_one_cli(context, **kwargs) -> None: cmvn_computer = CmvnComputer() for utterance in file.utterances: seg = Segment(sound_file_path, utterance.begin, utterance.end, utterance.channel) - utt = KalpyUtterance(seg, utterance.text) + text, _, _ = tokenizer(utterance.text) + utt = KalpyUtterance(seg, text) utt.generate_mfccs(acoustic_model.mfcc_computer) utterances.append(utt) cmvn = cmvn_computer.compute_cmvn_from_features([utt.mfccs for utt in utterances]) align_options = { k: v - for k, v in c + for k, v in c.items() if k in [ "beam", diff --git a/montreal_forced_aligner/command_line/create_segments.py b/montreal_forced_aligner/command_line/create_segments.py index 799206da..461034c3 100644 --- a/montreal_forced_aligner/command_line/create_segments.py +++ b/montreal_forced_aligner/command_line/create_segments.py @@ -6,10 +6,87 @@ import rich_click as click from montreal_forced_aligner import config -from montreal_forced_aligner.command_line.utils import 
common_options -from montreal_forced_aligner.vad.segmenter import Segmenter +from montreal_forced_aligner.command_line.utils import ( + common_options, + validate_acoustic_model, + validate_dictionary, +) +from montreal_forced_aligner.vad.segmenter import TranscriptionSegmenter, VadSegmenter + +__all__ = ["create_segments_vad_cli", "create_segments_cli"] + + +@click.command( + name="segment_vad", + context_settings=dict( + ignore_unknown_options=True, + allow_extra_args=True, + allow_interspersed_args=True, + ), + short_help="Split long audio files into shorter segments", +) +@click.argument( + "corpus_directory", + type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path), +) +@click.argument( + "output_directory", type=click.Path(file_okay=False, dir_okay=True, path_type=Path) +) +@click.option( + "--config_path", + "-c", + help="Path to config file to use for training.", + type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path), +) +@click.option( + "--output_format", + help="Format for aligned output files (default is long_textgrid).", + default="long_textgrid", + type=click.Choice(["long_textgrid", "short_textgrid", "json", "csv"]), +) +@click.option( + "--speechbrain/--no_speechbrain", + "speechbrain", + help="Flag for using SpeechBrain's pretrained VAD model", +) +@click.option( + "--cuda/--no_cuda", + "cuda", + help="Flag for using CUDA for SpeechBrain's model", +) +@click.option( + "--segment_transcripts/--no_segment_transcripts", + "segment_transcripts", + help="Flag for using CUDA for SpeechBrain's model", +) +@common_options +@click.help_option("-h", "--help") +@click.pass_context +def create_segments_vad_cli(context, **kwargs) -> None: + """ + Create segments based on SpeechBrain's voice activity detection (VAD) model or a basic energy-based algorithm + """ + if kwargs.get("profile", None) is not None: + config.profile = kwargs.pop("profile") + config.update_configuration(kwargs) -__all__ = ["create_segments_cli"] + config_path = kwargs.get("config_path", None) + corpus_directory = kwargs["corpus_directory"] + output_directory = kwargs["output_directory"] + output_format = kwargs["output_format"] + + segmenter = VadSegmenter( + corpus_directory=corpus_directory, + **VadSegmenter.parse_parameters(config_path, context.params, context.args), + ) + try: + segmenter.segment() + segmenter.export_files(output_directory, output_format) + except Exception: + segmenter.dirty = True + raise + finally: + segmenter.cleanup() @click.command( @@ -25,6 +102,8 @@ "corpus_directory", type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path), ) +@click.argument("dictionary_path", type=click.UNPROCESSED, callback=validate_dictionary) +@click.argument("acoustic_model_path", type=click.UNPROCESSED, callback=validate_acoustic_model) @click.argument( "output_directory", type=click.Path(file_okay=False, dir_okay=True, path_type=Path) ) @@ -63,12 +142,16 @@ def create_segments_cli(context, **kwargs) -> None: config_path = kwargs.get("config_path", None) corpus_directory = kwargs["corpus_directory"] + dictionary_path = kwargs["dictionary_path"] + acoustic_model_path = kwargs["acoustic_model_path"] output_directory = kwargs["output_directory"] output_format = kwargs["output_format"] - segmenter = Segmenter( + segmenter = TranscriptionSegmenter( corpus_directory=corpus_directory, - **Segmenter.parse_parameters(config_path, context.params, context.args), + dictionary_path=dictionary_path, + acoustic_model_path=acoustic_model_path, + 
**TranscriptionSegmenter.parse_parameters(config_path, context.params, context.args), ) try: segmenter.segment() diff --git a/montreal_forced_aligner/command_line/mfa.py b/montreal_forced_aligner/command_line/mfa.py index b17bc75a..9747e3e6 100644 --- a/montreal_forced_aligner/command_line/mfa.py +++ b/montreal_forced_aligner/command_line/mfa.py @@ -15,7 +15,10 @@ from montreal_forced_aligner.command_line.align_one import align_one_cli from montreal_forced_aligner.command_line.anchor import anchor_cli from montreal_forced_aligner.command_line.configure import configure_cli -from montreal_forced_aligner.command_line.create_segments import create_segments_cli +from montreal_forced_aligner.command_line.create_segments import ( + create_segments_cli, + create_segments_vad_cli, +) from montreal_forced_aligner.command_line.diarize_speakers import diarize_speakers_cli from montreal_forced_aligner.command_line.g2p import g2p_cli from montreal_forced_aligner.command_line.history import history_cli @@ -171,6 +174,7 @@ def version_cli(): mfa_cli.add_command(anchor_cli) mfa_cli.add_command(diarize_speakers_cli) mfa_cli.add_command(create_segments_cli) +mfa_cli.add_command(create_segments_vad_cli) mfa_cli.add_command(configure_cli) mfa_cli.add_command(history_cli) mfa_cli.add_command(g2p_cli) diff --git a/montreal_forced_aligner/command_line/train_acoustic_model.py b/montreal_forced_aligner/command_line/train_acoustic_model.py index 490c6470..dfc078e4 100644 --- a/montreal_forced_aligner/command_line/train_acoustic_model.py +++ b/montreal_forced_aligner/command_line/train_acoustic_model.py @@ -8,6 +8,7 @@ from montreal_forced_aligner import config from montreal_forced_aligner.acoustic_modeling import TrainableAligner from montreal_forced_aligner.command_line.utils import common_options, validate_dictionary +from montreal_forced_aligner.data import Language __all__ = ["train_acoustic_model_cli"] @@ -85,6 +86,13 @@ help="Flag to include original utterance text in the output.", default=False, ) +@click.option( + "--language", + "language", + help="Language to use for spacy tokenizers and other preprocessing of language data", + default=Language.unknown.name, + type=click.Choice([x.name for x in Language]), +) @common_options @click.help_option("-h", "--help") @click.pass_context diff --git a/montreal_forced_aligner/corpus/acoustic_corpus.py b/montreal_forced_aligner/corpus/acoustic_corpus.py index b38c8511..f9d7c4f4 100644 --- a/montreal_forced_aligner/corpus/acoustic_corpus.py +++ b/montreal_forced_aligner/corpus/acoustic_corpus.py @@ -550,16 +550,6 @@ def calc_fmllr_arguments(self, iteration: Optional[int] = None) -> List[CalcFmll arguments = [] thread_lock = threading.Lock() for j in self.jobs: - feat_strings = {} - for d_id in j.dictionary_ids: - feat_strings[d_id] = j.construct_feature_proc_string( - self.working_directory, - d_id, - self.feature_options["uses_splices"], - self.feature_options["splice_left_context"], - self.feature_options["splice_right_context"], - self.feature_options["uses_speaker_adaptation"], - ) arguments.append( CalcFmllrArguments( j.id, diff --git a/montreal_forced_aligner/corpus/base.py b/montreal_forced_aligner/corpus/base.py index ce301bf5..d2855bcf 100644 --- a/montreal_forced_aligner/corpus/base.py +++ b/montreal_forced_aligner/corpus/base.py @@ -22,7 +22,13 @@ NormalizeTextFunction, dictionary_ids_for_job, ) -from montreal_forced_aligner.data import DatabaseImportData, TextFileType, WordType, WorkflowType +from montreal_forced_aligner.data import ( + DatabaseImportData, 
+ Language, + TextFileType, + WordType, + WorkflowType, +) from montreal_forced_aligner.db import ( Corpus, CorpusWorkflow, @@ -94,6 +100,7 @@ def __init__( speaker_characters: typing.Union[int, str] = 0, ignore_speakers: bool = False, oov_count_threshold: int = 0, + language: Language = Language.unknown, **kwargs, ): if not os.path.exists(corpus_directory): @@ -124,6 +131,7 @@ def __init__( self._word_set = [] self._jobs = [] self.ignore_empty_utterances = False + self.language = language @property def jobs(self) -> typing.List[Job]: @@ -602,32 +610,22 @@ def _finalize_load(self, session: Session, import_data: DatabaseImportData): def normalize_text_arguments(self): from montreal_forced_aligner.dictionary.mixins import DictionaryMixin + from montreal_forced_aligner.tokenization.spacy import generate_language_tokenizer - if not isinstance(self, DictionaryMixin): - return None + if self.language is Language.unknown: + tokenizers = getattr(self, "tokenizers", None) + else: + tokenizers = generate_language_tokenizer(self.language) + if tokenizers is None: + if isinstance(self, DictionaryMixin): + tokenizers = self.tokenizer + else: + return None from montreal_forced_aligner.corpus.multiprocessing import NormalizeTextArguments with self.session() as session: jobs = session.query(Job).filter(Job.utterances.any()) - return [ - NormalizeTextArguments( - j.id, - self.session, - None, - self.word_break_markers, - self.punctuation, - self.clitic_markers, - self.compound_markers, - self.brackets, - self.laughter_word, - self.oov_word, - self.bracketed_word, - self.cutoff_word, - self.ignore_case, - getattr(self, "use_g2p", False), - ) - for j in jobs - ] + return [NormalizeTextArguments(j.id, self.session, None, tokenizers) for j in jobs] def normalize_text(self) -> None: """Normalize the text of the corpus using a dictionary's sanitization functions and word mappings""" diff --git a/montreal_forced_aligner/corpus/features.py b/montreal_forced_aligner/corpus/features.py index 3e24c745..4276944d 100644 --- a/montreal_forced_aligner/corpus/features.py +++ b/montreal_forced_aligner/corpus/features.py @@ -399,7 +399,7 @@ def __init__(self, args: VadArguments): super().__init__(args) self.vad_options = args.vad_options - def _run(self) -> typing.Generator[typing.Tuple[int, int, int]]: + def _run(self) -> None: """Run the function""" with ( @@ -916,9 +916,6 @@ def ivector_options(self) -> MetaDict: "silence_weight": self.silence_weight, "max_count": self.max_count, "ivector_dimension": self.ivector_dimension, - "silence_csl": getattr( - self, "silence_csl", "" - ), # If we have silence phones from a dictionary, use them, } diff --git a/montreal_forced_aligner/corpus/multiprocessing.py b/montreal_forced_aligner/corpus/multiprocessing.py index f0bd6f85..dc96a0e6 100644 --- a/montreal_forced_aligner/corpus/multiprocessing.py +++ b/montreal_forced_aligner/corpus/multiprocessing.py @@ -16,14 +16,21 @@ from montreal_forced_aligner.abc import KaldiFunction from montreal_forced_aligner.corpus.classes import FileData from montreal_forced_aligner.corpus.helper import find_exts -from montreal_forced_aligner.data import MfaArguments, WordType -from montreal_forced_aligner.db import Dictionary, Grapheme, Job, Speaker, Utterance, Word +from montreal_forced_aligner.data import MfaArguments +from montreal_forced_aligner.db import Dictionary, Job, Speaker, Utterance from montreal_forced_aligner.exceptions import SoundFileError, TextGridParseError, TextParseError from montreal_forced_aligner.helper import mfa_open from 
montreal_forced_aligner.utils import Counter if typing.TYPE_CHECKING: from dataclasses import dataclass + + from montreal_forced_aligner.tokenization.simple import SimpleTokenizer + + try: + from spacy.language import Language as SpacyLanguage + except ImportError: + SpacyLanguage = None else: from dataclassy import dataclass @@ -221,17 +228,7 @@ class NormalizeTextArguments(MfaArguments): """ - word_break_markers: typing.List[str] - punctuation: typing.List[str] - clitic_markers: typing.List[str] - compound_markers: typing.List[str] - brackets: typing.List[typing.Tuple[str, str]] - laughter_word: str - oov_word: str - bracketed_word: str - cutoff_word: str - ignore_case: bool - use_g2p: bool + tokenizers: typing.Union[typing.Dict[int, SimpleTokenizer], SpacyLanguage] @dataclass @@ -258,58 +255,20 @@ class NormalizeTextFunction(KaldiFunction): def __init__(self, args: NormalizeTextArguments): super().__init__(args) - self.word_break_markers = args.word_break_markers - self.brackets = args.brackets - self.punctuation = args.punctuation - self.compound_markers = args.compound_markers - self.clitic_markers = args.clitic_markers - self.ignore_case = args.ignore_case - self.use_g2p = args.use_g2p - self.laughter_word = args.laughter_word - self.oov_word = args.oov_word - self.bracketed_word = args.bracketed_word - self.cutoff_word = args.cutoff_word + self.tokenizers = args.tokenizers def _run(self): """Run the function""" - from montreal_forced_aligner.tokenization.simple import SimpleTokenizer with self.session() as session: - - grapheme_set = set() - grapheme_query = session.query(Grapheme.grapheme) - for (g,) in grapheme_query: - grapheme_set.add(g) dict_count = session.query(Dictionary).join(Dictionary.words).limit(1).count() - if self.use_g2p or dict_count > 0: + if dict_count > 0 or isinstance(self.tokenizers, dict): dictionaries = session.query(Dictionary) for d in dictionaries: - - word_set = set( - x[0] for x in session.query(Word.word).filter(Word.dictionary_id == d.id) - ) - clitic_set = set( - x[0] - for x in session.query(Word.word) - .filter(Word.word_type == WordType.clitic) - .filter(Word.dictionary_id == d.id) - ) - tokenizer = SimpleTokenizer( - word_break_markers=self.word_break_markers, - punctuation=self.punctuation, - clitic_markers=self.clitic_markers, - compound_markers=self.compound_markers, - brackets=self.brackets, - laughter_word=self.laughter_word, - oov_word=self.oov_word, - bracketed_word=self.bracketed_word, - cutoff_word=self.cutoff_word, - ignore_case=self.ignore_case, - use_g2p=self.use_g2p, - clitic_set=clitic_set, - word_set=word_set, - grapheme_set=grapheme_set, - ) + if isinstance(self.tokenizers, dict): + tokenizer = self.tokenizers[d.id] + else: + tokenizer = self.tokenizers utterances = ( session.query(Utterance.id, Utterance.text) .join(Utterance.speaker) @@ -331,20 +290,7 @@ def _run(self): ) ) else: - tokenizer = SimpleTokenizer( - word_break_markers=self.word_break_markers, - punctuation=self.punctuation, - clitic_markers=self.clitic_markers, - compound_markers=self.compound_markers, - brackets=self.brackets, - laughter_word=self.laughter_word, - oov_word=self.oov_word, - bracketed_word=self.bracketed_word, - cutoff_word=self.cutoff_word, - ignore_case=self.ignore_case, - use_g2p=self.use_g2p, - grapheme_set=grapheme_set, - ) + tokenizer = self.tokenizers utterances = ( session.query(Utterance.id, Utterance.text) .filter(Utterance.text != "") diff --git a/montreal_forced_aligner/data.py b/montreal_forced_aligner/data.py index fd7bd782..f53a97c0 
100644 --- a/montreal_forced_aligner/data.py +++ b/montreal_forced_aligner/data.py @@ -39,6 +39,7 @@ "DistanceMetric", "WorkflowType", "DatasetType", + "Language", "ArpaNgramModel", "WORD_BEGIN_SYMBOL", "WORD_END_SYMBOL", @@ -293,6 +294,37 @@ class ClusterType(enum.Enum): meanshift = "meanshift" +class Language(enum.Enum): + """Enum for supported languages""" + + unknown = "unknown" + catalan = "catalan" + chinese = "chinese" + croatian = "croatian" + danish = "danish" + dutch = "dutch" + english = "english" + finnish = "finnish" + french = "french" + german = "german" + greek = "greek" + italian = "italian" + japanese = "japanese" + korean = "korean" + lithuanian = "lithuanian" + macedonian = "macedonian" + multilingual = "multilingual" + norwegian = "norwegian" + polish = "polish" + portuguese = "portuguese" + romanian = "romanian" + russian = "russian" + slovenian = "slovenian" + spanish = "spanish" + swedish = "swedish" + ukrainian = "ukrainian" + + class ManifoldAlgorithm(enum.Enum): """Enum for supported manifold visualization algorithms""" diff --git a/montreal_forced_aligner/db.py b/montreal_forced_aligner/db.py index 1b7588d5..9b77d030 100644 --- a/montreal_forced_aligner/db.py +++ b/montreal_forced_aligner/db.py @@ -12,8 +12,9 @@ import pywrapfst import sqlalchemy import sqlalchemy.types as types -from kalpy.data import KaldiMapping +from kalpy.data import KaldiMapping, Segment from kalpy.feat.data import FeatureArchive +from kalpy.utterance import Utterance as KalpyUtterance from pgvector.sqlalchemy import Vector from praatio import textgrid from praatio.utilities.constants import Interval @@ -970,6 +971,7 @@ class Speaker(MfaSqlBase): ivector = Column(Vector(config.IVECTOR_DIMENSION), nullable=True) plda_vector = Column(Vector(config.PLDA_DIMENSION), nullable=True) xvector = Column(Vector(config.XVECTOR_DIMENSION), nullable=True) + modified = Column(Boolean, nullable=False, default=False, index=True) dictionary_id = Column(Integer, ForeignKey("dictionary.id"), nullable=True, index=True) dictionary = relationship("Dictionary", back_populates="speakers") utterances = relationship("Utterance", back_populates="speaker") @@ -1558,6 +1560,18 @@ def to_data(self) -> UtteranceData: set(self.oovs.split()), ) + def to_kalpy(self) -> KalpyUtterance: + """ + Construct an UtteranceData object that can be used in multiprocessing + + Returns + ------- + :class:`~montreal_forced_aligner.corpus.classes.UtteranceData` + Data for the utterance + """ + seg = Segment(self.file.sound_file.sound_file_path, self.begin, self.end, self.channel) + return KalpyUtterance(seg, self.normalized_text, self.speaker.cmvn, self.speaker.fmllr) + @classmethod def from_data(cls, data: UtteranceData, file: File, speaker: int, frame_shift: int = None): """ @@ -2097,65 +2111,6 @@ def construct_dictionary_dependent_paths( output[dict_id] = directory.joinpath(f"{identifier}.{dict_id}.{extension}") return output - def construct_online_feature_proc_string(self): - feat_path = self.construct_path(self.corpus.current_subset_directory, "feats", "scp") - return f'ark,s,cs:add-deltas scp,s,cs:"{feat_path}" ark:- |' - - def construct_feature_proc_string( - self, - working_directory, - dictionary_id, - uses_splices: bool, - splice_left_context: int, - splice_right_context: int, - uses_speaker_adaptation: bool = False, - ) -> str: - """ - Constructs a feature processing string to supply to Kaldi binaries, taking into account corpus features and the - current working directory of the aligner (whether fMLLR or LDA transforms should 
be used, etc). - - Parameters - ---------- - uses_speaker_adaptation: bool - Flag for whether features should be speaker-independent regardless of the presence of fMLLR transforms - - Returns - ------- - dict[int, dict[str, str]] - Feature strings per job - """ - lda_mat_path = None - fmllr_trans_path = None - feat_path = self.construct_path( - self.corpus.current_subset_directory, "feats", "scp", dictionary_id=dictionary_id - ) - if working_directory is not None: - lda_mat_path = os.path.join(working_directory, "lda.mat") - if not os.path.exists(lda_mat_path): - lda_mat_path = None - fmllr_trans_path = self.construct_path( - self.corpus.split_directory, "trans", "scp", dictionary_id - ) - - if not os.path.exists(fmllr_trans_path): - fmllr_trans_path = None - utt2spk_path = self.construct_path( - self.corpus.current_subset_directory, "utt2spk", "scp", dictionary_id - ) - feats = "ark,s,cs:" - - if lda_mat_path is not None: - feats += f'splice-feats --left-context={splice_left_context} --right-context={splice_right_context} scp,s,cs:"{feat_path}" ark:- |' - feats += f' transform-feats "{lda_mat_path}" ark:- ark:- |' - elif uses_splices: - feats += f'splice-feats --left-context={splice_left_context} --right-context={splice_right_context} scp,s,cs:"{feat_path}" ark:- |' - else: - feats += f'add-deltas scp,s,cs:"{feat_path}" ark:- |' - if fmllr_trans_path is not None and uses_speaker_adaptation: - feats += f' transform-feats --utt2spk=ark:"{utt2spk_path}" scp:"{fmllr_trans_path}" ark:- ark:- |' - - return feats - class M2MSymbol(MfaSqlBase): """ diff --git a/montreal_forced_aligner/diarization/speaker_diarizer.py b/montreal_forced_aligner/diarization/speaker_diarizer.py index 7d0c9aa6..06e14f81 100644 --- a/montreal_forced_aligner/diarization/speaker_diarizer.py +++ b/montreal_forced_aligner/diarization/speaker_diarizer.py @@ -91,7 +91,7 @@ if TYPE_CHECKING: from montreal_forced_aligner.abc import MetaDict -__all__ = ["SpeakerDiarizer"] +__all__ = ["SpeakerDiarizer", "FOUND_SPEECHBRAIN"] logger = logging.getLogger("mfa") @@ -950,9 +950,14 @@ def cleanup_empty_speakers(self, threshold=None): with self.session() as session: session.execute(sqlalchemy.delete(SpeakerOrdering)) session.flush() + unknown_speaker_id = ( - session.query(Speaker.id).filter(Speaker.name == "MFA_UNKNOWN").first()[0] + session.query(Speaker.id).filter(Speaker.name == "MFA_UNKNOWN").first() ) + if unknown_speaker_id is None: + unknown_speaker_id = -1 + else: + unknown_speaker_id = unknown_speaker_id[0] non_empty_speakers = [unknown_speaker_id] sq = ( session.query(Speaker.id, sqlalchemy.func.count().label("utterance_count")) @@ -1414,24 +1419,28 @@ def compute_speaker_embeddings(self) -> None: """Generate per-speaker embeddings as the mean over their utterances""" if not self.has_xvectors(): self.load_embeddings() + self.cleanup_empty_speakers() logger.info("Computing SpeechBrain speaker embeddings...") - with tqdm( - total=self.num_speakers, disable=config.QUIET - ) as pbar, self.session() as session: + with self.session() as session: update_mapping = [] - speakers = session.query(Speaker.id) - for (s_id,) in speakers: - u_query = session.query(Utterance.xvector).filter( - Utterance.speaker_id == s_id, Utterance.xvector != None # noqa - ) - embeddings = np.empty((u_query.count(), XVECTOR_DIMENSION)) - if embeddings.shape[0] == 0: - continue - for i, (xvector,) in enumerate(u_query): - embeddings[i, :] = xvector - speaker_xvector = np.mean(embeddings, axis=0) - update_mapping.append({"id": s_id, "xvector": 
speaker_xvector}) - pbar.update(1) + speakers = session.query(Speaker.id).filter( + sqlalchemy.or_(Speaker.xvector == None, Speaker.modified == True) # noqa + ) + with tqdm(total=speakers.count(), disable=config.QUIET) as pbar: + for (s_id,) in speakers: + u_query = session.query(Utterance.xvector).filter( + Utterance.speaker_id == s_id, Utterance.xvector != None # noqa + ) + embeddings = np.empty((u_query.count(), XVECTOR_DIMENSION)) + if embeddings.shape[0] == 0: + continue + for i, (xvector,) in enumerate(u_query): + embeddings[i, :] = xvector + speaker_xvector = np.mean(embeddings, axis=0) + update_mapping.append( + {"id": s_id, "xvector": speaker_xvector, "modified": False} + ) + pbar.update(1) bulk_update(session, Speaker, update_mapping) session.commit() diff --git a/montreal_forced_aligner/dictionary/mixins.py b/montreal_forced_aligner/dictionary/mixins.py index cbb5109a..dc5cb292 100644 --- a/montreal_forced_aligner/dictionary/mixins.py +++ b/montreal_forced_aligner/dictionary/mixins.py @@ -186,6 +186,24 @@ def __init__( self.use_cutoff_model = use_cutoff_model self._phone_groups = {} + @property + def tokenizer(self): + from montreal_forced_aligner.tokenization.simple import SimpleTokenizer + + tokenizer = SimpleTokenizer( + word_break_markers=self.word_break_markers, + punctuation=self.punctuation, + clitic_markers=self.clitic_markers, + compound_markers=self.compound_markers, + brackets=self.brackets, + laughter_word=self.laughter_word, + oov_word=self.oov_word, + bracketed_word=self.bracketed_word, + cutoff_word=self.cutoff_word, + ignore_case=self.ignore_case, + ) + return tokenizer + @property def base_phones(self) -> Dict[str, Set[str]]: """Grouped phones by base phone""" @@ -321,11 +339,6 @@ def silence_phones(self) -> Set[str]: self.oov_phone, } - @property - def context_independent_csl(self) -> str: - """Context independent colon-separated list""" - return ":".join(str(self.phone_mapping[x]) for x in self.kaldi_silence_phones) - @property def specials_set(self) -> Set[str]: """Special words, like the ``oov_word`` ``silence_word``, ````, and ````""" @@ -495,28 +508,11 @@ def kaldi_silence_phones(self) -> List[str]: return sorted(self.silence_phones) @property - def optional_silence_csl(self) -> str: - """ - Phone ID of the optional silence phone - """ - try: - return str(self.phone_mapping[self.optional_silence_phone]) - except Exception: - return "" - - @property - def silence_csl(self) -> str: + def silence_symbols(self) -> typing.List[int]: """ A colon-separated string of silence phone ids """ - return ":".join(map(str, (self.phone_mapping[x] for x in self.kaldi_silence_phones))) - - @property - def non_silence_csl(self) -> str: - """ - A colon-separated string of non-silence phone ids - """ - return ":".join(map(str, (self.phone_mapping[x] for x in self.kaldi_non_silence_phones))) + return [self.phone_mapping[x] for x in self.kaldi_silence_phones] @property def phones(self) -> set: diff --git a/montreal_forced_aligner/dictionary/multispeaker.py b/montreal_forced_aligner/dictionary/multispeaker.py index e95b8102..246f3a33 100644 --- a/montreal_forced_aligner/dictionary/multispeaker.py +++ b/montreal_forced_aligner/dictionary/multispeaker.py @@ -115,6 +115,48 @@ def __init__( self.rules_path = rules_path self.phone_groups_path = phone_groups_path self.lexicon_compilers: typing.Dict[int, typing.Union[LexiconCompiler, G2PCompiler]] = {} + self._tokenizers = {} + + @property + def tokenizers(self): + from montreal_forced_aligner.tokenization.simple import SimpleTokenizer + 
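+        # Lazily build one SimpleTokenizer per dictionary, reusing the corpus-wide
+        # grapheme set plus each dictionary's word and clitic sets, and cache the result.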
+ if not self._tokenizers: + with self.session() as session: + + grapheme_set = set() + grapheme_query = session.query(Grapheme.grapheme) + for (g,) in grapheme_query: + grapheme_set.add(g) + dictionaries = session.query(Dictionary) + for d in dictionaries: + + word_set = set( + x[0] for x in session.query(Word.word).filter(Word.dictionary_id == d.id) + ) + clitic_set = set( + x[0] + for x in session.query(Word.word) + .filter(Word.word_type == WordType.clitic) + .filter(Word.dictionary_id == d.id) + ) + self._tokenizers[d.id] = SimpleTokenizer( + word_break_markers=self.word_break_markers, + punctuation=self.punctuation, + clitic_markers=self.clitic_markers, + compound_markers=self.compound_markers, + brackets=self.brackets, + laughter_word=self.laughter_word, + oov_word=self.oov_word, + bracketed_word=self.bracketed_word, + cutoff_word=self.cutoff_word, + ignore_case=self.ignore_case, + use_g2p=self.use_g2p, + clitic_set=clitic_set, + word_set=word_set, + grapheme_set=grapheme_set, + ) + return self._tokenizers def load_phone_groups(self) -> None: """ @@ -261,7 +303,7 @@ def dictionary_setup(self) -> Tuple[typing.Set[str], collections.Counter]: """Set up the dictionary for processing""" self.initialize_database() if self.use_g2p: - return + return set(), collections.Counter() auto_set = {PhoneSetType.AUTO, PhoneSetType.UNKNOWN, "AUTO", "UNKNOWN"} if not isinstance(self.phone_set_type, PhoneSetType): self.phone_set_type = PhoneSetType[self.phone_set_type] diff --git a/montreal_forced_aligner/exceptions.py b/montreal_forced_aligner/exceptions.py index cc544a2f..eb1a348e 100644 --- a/montreal_forced_aligner/exceptions.py +++ b/montreal_forced_aligner/exceptions.py @@ -30,6 +30,7 @@ "PyniniAlignmentError", "PyniniGenerationError", "NoAlignmentsError", + "SegmenterError", "ConfigError", "LMError", "LanguageModelNotFoundError", @@ -952,3 +953,14 @@ def update_log_file(self) -> None: self.log_file = handler.baseFilename break self.refresh_message() + + +# Segmenter Errors + + +class SegmenterError(MFAError): + """ + Class for errors during alignment + """ + + pass diff --git a/montreal_forced_aligner/models.py b/montreal_forced_aligner/models.py index 3cc96987..974351ed 100644 --- a/montreal_forced_aligner/models.py +++ b/montreal_forced_aligner/models.py @@ -16,9 +16,12 @@ import requests import yaml +from _kalpy.gmm import AmDiagGmm +from _kalpy.hmm import TransitionModel from _kalpy.matrix import FloatMatrix from kalpy.feat.mfcc import MfccComputer from kalpy.feat.pitch import PitchComputer +from kalpy.gmm.utils import read_gmm_model from rich.pretty import pprint from montreal_forced_aligner.abc import MfaModel, ModelExporterMixin @@ -381,6 +384,8 @@ def __init__( source = AcousticModel.get_pretrained_path(source) super().__init__(source, root_directory) + self._am = None + self._tm = None @property def version(self): @@ -452,6 +457,18 @@ def alignment_model_path(self) -> Path: return path return self.model_path + @property + def acoustic_model(self) -> AmDiagGmm: + if self._am is None: + self._tm, self._am = read_gmm_model(self.alignment_model_path) + return self._am + + @property + def transition_model(self) -> TransitionModel: + if self._tm is None: + self._tm, self._am = read_gmm_model(self.alignment_model_path) + return self._tm + @property def mfcc_computer(self) -> MfccComputer: return MfccComputer(**self.mfcc_options) diff --git a/montreal_forced_aligner/tokenization/english.py b/montreal_forced_aligner/tokenization/english.py new file mode 100644 index 00000000..fcb8442a --- 
/dev/null +++ b/montreal_forced_aligner/tokenization/english.py @@ -0,0 +1,524 @@ +from __future__ import annotations + +import os +import re +import subprocess +import typing + +try: + import spacy + from spacy.symbols import NORM, ORTH + from spacy.tokens import Doc, Token + + SPACY_AVAILABLE = True +except ImportError: + SPACY_AVAILABLE = False + +GENERIC_PREFIXES = {"non", "electro", "multi", "cross", "pseudo", "techno", "robo", "thermo"} + + +class EnglishReTokenize: + """ + Retokenizer for fixing English splitting + """ + + def __init__(self, vocab): + pass + + def __call__(self, doc): + spans = [] + for j, w in enumerate(doc): + if j > 0 and w.text == "'" and doc[j - 1].text.endswith("in"): + spans.append((doc[j - 1 : j + 1], {"NORM": doc[j - 1].text + "g"})) + elif j > 0 and w.text == "-" and doc[j - 1].text in GENERIC_PREFIXES: + spans.append((doc[j - 1 : j + 1], {})) + with doc.retokenize() as retokenizer: + for span, attrs in spans: + retokenizer.merge(span, attrs=attrs) + return doc + + +class EnglishSplitPrefixes: + """ + Retokenizer for splitting prefixes + """ + + def __init__(self, vocab: spacy.Vocab): + self.vocab = vocab + + def __call__(self, doc): + spans = [] + for w in doc: + verb_prefixes = ["re"] + adjective_prefixes = ["in", "un", "non"] + if w.pos_ == "VERB": + for vp in verb_prefixes: + if w.text.startswith(vp) and self.vocab[w.lemma_].is_oov: + lemma = re.sub(rf"^{vp}", "", w.lemma_) + if not self.vocab[lemma].is_oov: + orth = re.sub(rf"^{vp}", "", w.text) + norm = re.sub(rf"^{vp}", "", w.norm_) + lemma_form = f"{vp}-" + spans.append( + ( + w, + [vp, orth], + [(w, 1), w.head], + { + "POS": ["VERB", "VERB"], + "NORM": [lemma_form, norm], + "LEMMA": [lemma_form, lemma], + "MORPH": [str(w.morph), str(w.morph)], + }, + ) + ) + elif w.pos_ == "ADJ": + for ap in adjective_prefixes: + if w.text.startswith(ap) and self.vocab[w.lemma_].is_oov: + lemma = re.sub(rf"^{ap}", "", w.lemma_) + if not self.vocab[lemma].is_oov: + orth = re.sub(rf"^{ap}", "", w.text) + norm = re.sub(rf"^{ap}", "", w.norm_) + + lemma_form = f"{ap}-" + spans.append( + ( + w, + [ap, orth], + [(w, 1), w.head], + { + "POS": ["ADJ", "ADJ"], + "NORM": [lemma_form, norm], + "LEMMA": [lemma_form, lemma], + "MORPH": [str(w.morph), str(w.morph)], + }, + ) + ) + for ap in GENERIC_PREFIXES: + if w.text.startswith(ap) and self.vocab[w.lemma_].is_oov: + lemma = re.sub(rf"^{ap}", "", w.lemma_) + if not self.vocab[lemma].is_oov: + orth = re.sub(rf"^{ap}", "", w.text) + norm = re.sub(rf"^{ap}", "", w.norm_) + + lemma_form = f"{ap}-" + spans.append( + ( + w, + [ap, orth], + [(w, 1), w.head], + { + "POS": [w.pos_, w.pos_], + "NORM": [lemma_form, norm], + "LEMMA": [lemma_form, lemma], + "MORPH": [str(w.morph), str(w.morph)], + }, + ) + ) + with doc.retokenize() as retokenizer: + for details in spans: + if len(details) == 4: + span, orths, heads, attrs = details + retokenizer.split(span, orths, heads, attrs=attrs) + else: + span, attrs = details + retokenizer.merge(span, attrs=attrs) + + return doc + + +class EnglishSplitSuffixes: + """ + Retokenizer for splitting suffixes + """ + + def __init__(self, vocab): + self.vocab = vocab + + def find_base_form( + self, w: Token, suffix: str + ) -> typing.Tuple[typing.Optional[str], typing.Optional[str], typing.Optional[str]]: + base_lemma = re.sub(rf"{suffix}$", "", w.lemma_) + base_norm = re.sub(rf"{suffix}$", "", w.norm_) + if not base_lemma: + return None, None, None + base_text = re.sub(rf"{suffix}$", "", w.text) + if not self.vocab[base_lemma].is_oov: + if 
base_lemma.endswith("e") and not base_norm.endswith("e"): + base_norm += "e" + return base_lemma, base_norm, base_text + + if not self.vocab[base_lemma + "e"].is_oov: + return base_lemma + "e", base_norm + "e", base_text + if base_lemma.endswith("i") and not self.vocab[base_lemma[:-1] + "y"].is_oov: + return base_lemma[:-1] + "y", base_norm[:-1] + "y", base_text + + if re.search(r"(\w)\1$", base_lemma) and not self.vocab[base_lemma[:-1]].is_oov: + return base_lemma[:-1], base_norm[:-1], base_text + return None, None, None + + def handle_ing(self, w: Token): + base_lemma, base_norm, base_text = self.find_base_form(w, "ing") + if base_lemma is None: + return None + return ( + w, + [base_text, "ing"], + [w.head, (w, 0)], + { + "POS": ["VERB", "VERB"], + "NORM": [base_norm, "-ing"], + "LEMMA": [base_lemma, "-ing"], + "MORPH": [str(w.morph), str(w.morph)], + }, + ) + + def handle_ness(self, w: Token): + base_lemma, base_norm, base_text = self.find_base_form(w, "ness") + if base_lemma is None: + return None + if base_text.endswith("li"): + base_text = base_text[:-1] + "y" + return ( + w, + [base_text, "ness"], + [w.head, (w, 0)], + { + "POS": ["ADJ", "NOUN"], + "NORM": [base_norm, "-ness"], + "LEMMA": [base_lemma, "-ness"], + "MORPH": ["Degree=Pos", "Number=Sing"], + }, + ) + + def handle_less(self, w: Token): + base_lemma, base_norm, base_text = self.find_base_form(w, "less") + if base_lemma is None: + return None + return ( + w, + [base_text, "ing"], + [w.head, (w, 0)], + { + "POS": ["NOUN", "ADJ"], + "NORM": [base_norm, "-less"], + "LEMMA": [base_lemma, "-less"], + "MORPH": ["Number=Sing", "Degree=Pos"], + }, + ) + + def handle_able(self, w: Token): + if w.text.endswith("able"): + suffix = "able" + else: + suffix = "ible" + base_lemma, base_norm, base_text = self.find_base_form(w, suffix) + if base_lemma is None: + return None + return ( + w, + [base_text, suffix], + [w.head, (w, 0)], + { + "POS": ["VERB", "ADJ"], + "NORM": [base_norm, "-able"], + "LEMMA": [base_lemma, "-able"], + "MORPH": ["VerbForm=Inf", "Degree=Pos"], + }, + ) + + def handle_ability(self, w: Token): + if w.text.endswith("ability"): + suffix = "ability" + else: + suffix = "ibility" + base_lemma, base_norm, base_text = self.find_base_form(w, suffix) + if base_lemma is None: + return None + return ( + w, + [base_text, suffix], + [w.head, (w, 0)], + { + "POS": ["VERB", "NOUN"], + "NORM": [base_norm, "-ability"], + "LEMMA": [base_lemma, "-ability"], + "MORPH": ["VerbForm=Inf", "Number=Sing"], + }, + ) + + def handle_ably(self, w: Token): + if w.text.endswith("ably"): + suffix = "ably" + else: + suffix = "ibly" + base_lemma, base_norm, base_text = self.find_base_form(w, suffix) + if base_lemma is None: + return None + return ( + w, + [base_text, suffix], + [w.head, (w, 0)], + { + "POS": ["ADJ", "ADV"], + "NORM": [base_norm, "-ly"], + "LEMMA": [base_lemma, "-ly"], + "MORPH": ["Degree=Pos", ""], + }, + ) + + def handle_plural(self, w: Token): + if w.text == "'s": + return None + if w.text.endswith("ies"): + orth = re.sub(r"es$", "", w.text) + norm = re.sub(r"ies$", "y", w.text) + s_form = "es" + elif w.text.endswith("es"): + if w.lemma_.endswith("e"): + orth = re.sub(r"s$", "", w.text) + s_form = "s" + norm = re.sub(r"s$", "", w.norm_) + else: + orth = re.sub(r"es$", "", w.text) + s_form = "es" + norm = re.sub(r"es$", "", w.norm_) + else: + orth = re.sub(r"s$", "", w.text) + norm = re.sub(r"s$", "", w.norm_) + s_form = "s" + if self.vocab[norm].is_oov: + return None + return ( + w, + [orth, s_form], + [w.head, (w, 0)], + { + 
"POS": ["NOUN", "NOUN"], + "NORM": [norm, "-s"], + "LEMMA": [norm, "-s"], + "MORPH": ["Number=Sing", "Number=Plur"], + }, + ) + + def handle_3p_pres(self, w: Token): + if w.text == "'s": + return None + base_lemma = None + if w.text.endswith("es"): + suffix = "es" + base_lemma, base_norm, base_text = self.find_base_form(w, suffix) + + if base_lemma is None: + suffix = "s" + base_lemma, base_norm, base_text = self.find_base_form(w, suffix) + if base_lemma is None: + return None + elif base_norm.endswith("i"): + base_norm = base_norm[:-1] + "y" + return ( + w, + [base_text, suffix], + [w.head, (w, 0)], + { + "POS": ["VERB", "VERB"], + "NORM": [base_norm, "-s"], + "LEMMA": [base_lemma, "-s"], + "MORPH": ["VerbForm=Inf", "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin"], + }, + ) + + def handle_ly(self, w: Token): + suffix = "ly" + base_lemma, base_norm, base_text = self.find_base_form(w, suffix) + if base_lemma is None: + return None + if base_norm.endswith("i"): + base_norm = base_norm[:-1] + "y" + return ( + w, + [base_text, suffix], + [w.head, (w, 0)], + { + "POS": ["ADJ", "ADV"], + "NORM": [base_norm, "-ly"], + "LEMMA": [base_lemma, "-ly"], + "MORPH": ["Degree=Pos", ""], + }, + ) + + def handle_ed(self, w: Token): + base_lemma, base_norm, base_text = self.find_base_form(w, "ed") + if base_lemma is None: + return None + if base_norm.endswith("i"): + base_norm = base_norm[:-1] + "y" + return ( + w, + [base_text, "ed"], + [w.head, (w, 0)], + { + "POS": ["VERB", "VERB"], + "NORM": [base_norm, "-ed"], + "LEMMA": [base_lemma, "-ed"], + "MORPH": ["VerbForm=Inf", str(w.morph)], + }, + ) + + def __call__(self, doc: Doc): + while True: + for j, w in enumerate(doc): + try: + if w.lemma_.startswith("-") or w.lemma_.endswith("-"): + continue + except KeyError: + continue + lemma = w.lemma_ + norm = w.norm_ + morph = str(w.morph) + pos = w.pos_ + print(w.text, lemma, norm, morph, pos, w.is_oov) + span = None + if "Prog" in w.morph.get("Aspect") and w.text.endswith("ing"): + span = self.handle_ing(w) + elif ( + w.pos_ == "ADJ" + and (w.text.endswith("able") or w.text.endswith("ible")) + and self.vocab[w.lemma_].is_oov + ): + span = self.handle_able(w) + + elif (w.text.endswith("ability") or w.text.endswith("ibility")) and self.vocab[ + w.lemma_ + ].is_oov: + span = self.handle_ability(w) + break + elif (w.text.endswith("ably") or w.text.endswith("ibly")) and ( + w.pos_ == "ADV" or self.vocab[w.lemma_].is_oov + ): + span = self.handle_ably(w) + break + elif w.pos_ == "NOUN" and "Plur" in w.morph.get("Number") and w.text.endswith("s"): + span = self.handle_plural(w) + elif ( + w.pos_ == "VERB" + and "Sing" in w.morph.get("Number") + and "3" in w.morph.get("Person") + and "Pres" in w.morph.get("Tense") + and w.text.endswith("s") + ): + span = self.handle_3p_pres(w) + elif w.pos_ == "VERB" and "Past" in w.morph.get("Tense") and w.text.endswith("ed"): + span = self.handle_ed(w) + elif w.text in {"n't"} and w.norm_ != "-n't": + span = ( + doc[j : j + 1], + { + "NORM": "-n't", + }, + ) + elif w.pos_ in {"ADJ", "ADV"} and w.text.endswith("ly"): + span = self.handle_ly(w) + if span is not None: + break + else: + break + print(span) + if span is not None: + with doc.retokenize() as retokenizer: + if len(span) == 4: + span, orths, heads, attrs = span + retokenizer.split(span, orths, heads, attrs=attrs) + else: + span, attrs = span + retokenizer.merge(span, attrs=attrs) + else: + break + return doc + + +class BracketedReTokenize: + def __init__(self, vocab): + pass + + def __call__(self, doc): + spans = [] + 
initial_span = None + for w in doc: + if w.text in {"<", "(", "{", "["}: + initial_span = w.left_edge.i + elif w.text in {">", ")", "}", "]"}: + if initial_span is not None: + spans.append(doc[initial_span : w.right_edge.i + 1]) + initial_span = None + with doc.retokenize() as retokenizer: + for span in spans: + retokenizer.merge(span, attrs={"POS": "X"}) + return doc + + +def en_spacy(): + name = "en_core_web_sm" + try: + nlp = spacy.load(name) + except OSError: + subprocess.call(["python", "-m", "spacy", "download", name], env=os.environ) + nlp = spacy.load(name) + + @spacy.Language.factory("en_re_tokenize") + def en_re_tokenize(_nlp, name): + return EnglishReTokenize(_nlp.vocab) + + @spacy.Language.factory("en_split_suffixes") + def en_split_suffixes(_nlp, name): + return EnglishSplitSuffixes(_nlp.vocab) + + @spacy.Language.factory("en_split_prefixes") + def en_split_prefixes(_nlp, name): + return EnglishSplitPrefixes(_nlp.vocab) + + @spacy.Language.factory("en_bracketed_re_tokenize") + def bracketed_re_tokenize(_nlp, name): + return BracketedReTokenize(_nlp.vocab) + + initial_brackets = r"\(\[\{<" + final_brackets = r"\)\]\}>" + + nlp.tokenizer.token_match = re.compile( + rf"[{initial_brackets}][-\w_']+[?!,][{final_brackets}]" + ).match + nlp.tokenizer.add_special_case( + "wanna", [{ORTH: "wan", NORM: "want"}, {ORTH: "na", NORM: "to"}] + ) + nlp.tokenizer.add_special_case( + "dunno", [{ORTH: "dun", NORM: "don't"}, {ORTH: "no", NORM: "know"}] + ) + nlp.tokenizer.add_special_case( + "woulda", [{ORTH: "would", NORM: "would"}, {ORTH: "a", NORM: "have"}] + ) + nlp.tokenizer.add_special_case( + "sorta", [{ORTH: "sort", NORM: "sort"}, {ORTH: "a", NORM: "of"}] + ) + nlp.tokenizer.add_special_case( + "kinda", [{ORTH: "kind", NORM: "kind"}, {ORTH: "a", NORM: "of"}] + ) + nlp.tokenizer.add_special_case( + "coulda", [{ORTH: "could", NORM: "could"}, {ORTH: "a", NORM: "have"}] + ) + nlp.tokenizer.add_special_case( + "shoulda", [{ORTH: "should", NORM: "should"}, {ORTH: "a", NORM: "have"}] + ) + nlp.tokenizer.add_special_case( + "finna", [{ORTH: "fin", NORM: "fixing"}, {ORTH: "na", NORM: "to"}] + ) + nlp.tokenizer.add_special_case( + "yknow", [{ORTH: "y", NORM: "you"}, {ORTH: "know", NORM: "know"}] + ) + nlp.tokenizer.add_special_case( + "y'know", [{ORTH: "y'", NORM: "you"}, {ORTH: "know", NORM: "know"}] + ) + nlp.add_pipe("en_re_tokenize", before="tagger") + nlp.add_pipe("bracketed_re_tokenize", before="tagger") + nlp.add_pipe("en_split_prefixes") + nlp.add_pipe("en_split_suffixes") + return nlp diff --git a/montreal_forced_aligner/tokenization/japanese.py b/montreal_forced_aligner/tokenization/japanese.py new file mode 100644 index 00000000..58fcc084 --- /dev/null +++ b/montreal_forced_aligner/tokenization/japanese.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +try: + import spacy + from spacy.lang.ja import Japanese + + SPACY_AVAILABLE = True +except ImportError: + SPACY_AVAILABLE = False +try: + import sudachipy + + JA_AVAILABLE = True +except ImportError: + JA_AVAILABLE = False + + +class BracketedReTokenize: + def __init__(self, vocab): + pass + + def __call__(self, doc): + spans = [] + initial_span = None + for w in doc: + if w.text in {"<", "(", "{", "["}: + initial_span = w.left_edge.i + elif w.text in {">", ")", "}", "]"}: + if initial_span is not None: + spans.append(doc[initial_span : w.right_edge.i + 1]) + initial_span = None + with doc.retokenize() as retokenizer: + for span in spans: + retokenizer.merge(span, attrs={"POS": "X"}) + return doc + + +class JapaneseReTokenize: + 
def __init__(self, vocab): + pass + + def __call__(self, doc): + spans = [] + for j, w in enumerate(doc): + if w.text in {"えっと", "そのー", "あのー", "えっ", "まあ", "このー", "えー"}: + spans.append((doc[w.left_edge.i : w.right_edge.i + 1], "INTJ")) + elif ( + w.text in {"あっ", "まっ"} + and j < len(doc) - 1 + and doc[j + 1].pos not in "AUX" + and doc[j + 1].text != "て" + ): + spans.append((doc[w.left_edge.i : w.right_edge.i + 1], "INTJ")) + + with doc.retokenize() as retokenizer: + for span, pos in spans: + retokenizer.merge(span, attrs={"POS": pos}) + return doc + + +def ja_spacy(accurate=True): + if not JA_AVAILABLE: + raise ImportError("Please install Japanese support via `conda install spacy[ja]`") + nlp = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}}) + + @spacy.Language.factory("bracketed_re_tokenize") + def bracketed_re_tokenize(_nlp, name): + return BracketedReTokenize(_nlp.vocab) + + @spacy.Language.factory("ja_re_tokenize") + def ja_re_tokenize(_nlp, name): + return JapaneseReTokenize(_nlp.vocab) + + nlp.tokenizer.tokenizer = sudachipy.Dictionary(dict="full" if accurate else "core").create() + nlp.add_pipe("bracketed_re_tokenize") + nlp.add_pipe("ja_re_tokenize") + return nlp diff --git a/montreal_forced_aligner/tokenization/spacy.py b/montreal_forced_aligner/tokenization/spacy.py new file mode 100644 index 00000000..c78c6df2 --- /dev/null +++ b/montreal_forced_aligner/tokenization/spacy.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +import os +import subprocess + +try: + import spacy + + SPACY_AVAILABLE = True +except ImportError: + spacy = None + SPACY_AVAILABLE = False + +from montreal_forced_aligner.data import Language +from montreal_forced_aligner.tokenization.english import en_spacy +from montreal_forced_aligner.tokenization.japanese import ja_spacy + +language_model_mapping = { + # Use small models optimized for CPU because tokenizer accuracy does not depend on model + Language.catalan: "ca_core_news_sm", + Language.chinese: "zh_core_web_sm", + Language.croatian: "hr_core_news_sm", + Language.danish: "da_core_news_sm", + Language.dutch: "nl_core_news_sm", + Language.english: "en_core_web_sm", + Language.finnish: "fi_core_news_sm", + Language.french: "fr_core_news_sm", + Language.german: "de_core_news_sm", + Language.greek: "el_core_news_sm", + Language.italian: "it_core_news_sm", + Language.japanese: "ja_core_news_sm", + Language.korean: "ko_core_news_sm", + Language.lithuanian: "lt_core_news_sm", + Language.macedonian: "mk_core_news_sm", + Language.multilingual: "xx_sent_ud_sm", + Language.norwegian: "nb_core_news_sm", + Language.polish: "pl_core_news_sm", + Language.portuguese: "pt_core_news_sm", + Language.romanian: "ro_core_news_sm", + Language.russian: "ru_core_news_sm", + Language.slovenian: "sl_core_news_sm", + Language.spanish: "es_core_news_sm", + Language.swedish: "sv_core_news_sm", + Language.ukrainian: "uk_core_news_sm", +} + + +def generate_language_tokenizer(language: Language): + if not SPACY_AVAILABLE: + raise ImportError("Please install spacy via `conda install spacy`") + if language is Language.english: + return en_spacy() + elif language is Language.japanese: + return ja_spacy() + name = language_model_mapping[language] + try: + nlp = spacy.load(name) + except OSError: + subprocess.call(["python", "-m", "spacy", "download", name], env=os.environ) + nlp = spacy.load(name) + return nlp diff --git a/montreal_forced_aligner/transcription/multiprocessing.py b/montreal_forced_aligner/transcription/multiprocessing.py index 07089faf..b4ca88d4 
100644 --- a/montreal_forced_aligner/transcription/multiprocessing.py +++ b/montreal_forced_aligner/transcription/multiprocessing.py @@ -6,11 +6,9 @@ from __future__ import annotations import os -import re -import subprocess import typing from pathlib import Path -from typing import TYPE_CHECKING, Dict, List +from typing import TYPE_CHECKING, Dict from _kalpy.fstext import ConstFst, VectorFst from _kalpy.lat import CompactLatticeWriter @@ -30,8 +28,7 @@ from montreal_forced_aligner.abc import KaldiFunction, MetaDict from montreal_forced_aligner.data import MfaArguments, PhoneType from montreal_forced_aligner.db import Job, Phone, Utterance -from montreal_forced_aligner.helper import mfa_open -from montreal_forced_aligner.utils import thirdparty_binary, thread_logger +from montreal_forced_aligner.utils import thread_logger if TYPE_CHECKING: from dataclasses import dataclass @@ -161,13 +158,10 @@ class DecodePhoneArguments(MfaArguments): HCLG.fst paths """ - dictionaries: List[int] - feature_strings: Dict[int, str] - decode_options: MetaDict + working_directory: Path model_path: Path - lat_paths: Dict[int, Path] - phone_symbol_path: Path hclg_path: Path + decode_options: MetaDict @dataclass @@ -414,7 +408,7 @@ def __init__(self, args: DecodeArguments): self.decode_options = args.decode_options self.model_path = args.model_path - def _run(self) -> typing.Generator[typing.Tuple[str, float, int]]: + def _run(self) -> None: """Run the function""" with ( self.session() as session, @@ -438,33 +432,7 @@ def _run(self) -> typing.Generator[typing.Tuple[str, float, int]]: decode_logger.debug(f"Decoding with model: {self.model_path}") dict_id = d.id - fmllr_path = job.construct_path( - job.corpus.current_subset_directory, "trans", "scp", dict_id - ) - if not fmllr_path.exists(): - fmllr_path = None - lda_mat_path = self.working_directory.joinpath("lda.mat") - if not lda_mat_path.exists(): - lda_mat_path = None - feat_path = job.construct_path( - job.corpus.current_subset_directory, "feats", "scp", dictionary_id=dict_id - ) - utt2spk_path = job.construct_path( - job.corpus.current_subset_directory, "utt2spk", "scp", dict_id - ) - utt2spk = KaldiMapping() - utt2spk.load(utt2spk_path) - decode_logger.debug(f"Feature path: {feat_path}") - decode_logger.debug(f"LDA transform path: {lda_mat_path}") - decode_logger.debug(f"Speaker transform path: {fmllr_path}") - decode_logger.debug(f"utt2spk path: {utt2spk_path}") - feature_archive = FeatureArchive( - feat_path, - utt2spk=utt2spk, - lda_mat_file_name=lda_mat_path, - transform_file_name=fmllr_path, - deltas=True, - ) + feature_archive = job.construct_feature_archive(self.working_directory, dict_id) lat_path = job.construct_path(self.working_directory, "lat", "ark", dict_id) alignment_file_name = job.construct_path( @@ -513,7 +481,7 @@ def __init__(self, args: LmRescoreArguments): self.new_g_paths = args.new_g_paths self.lm_rescore_options = args.lm_rescore_options - def _run(self) -> typing.Generator[typing.Tuple[int, int]]: + def _run(self) -> None: """Run the function""" with ( self.session() as session, @@ -573,7 +541,7 @@ def __init__(self, args: CarpaLmRescoreArguments): self.old_g_paths = args.old_g_paths self.new_g_paths = args.new_g_paths - def _run(self) -> typing.Generator[typing.Tuple[int, int]]: + def _run(self) -> None: """Run the function""" with ( self.session() as session, @@ -635,7 +603,7 @@ def __init__(self, args: InitialFmllrArguments): self.model_path = args.model_path self.fmllr_options = args.fmllr_options - def _run(self) -> 
typing.Generator[int]: + def _run(self) -> None: """Run the function""" with ( self.session() as session, @@ -748,7 +716,7 @@ def __init__(self, args: FinalFmllrArguments): self.model_path = args.model_path self.fmllr_options = args.fmllr_options - def _run(self) -> typing.Generator[int]: + def _run(self) -> None: """Run the function""" with ( self.session() as session, @@ -868,7 +836,7 @@ def __init__(self, args: FmllrRescoreArguments): self.model_path = args.model_path self.rescore_options = args.rescore_options - def _run(self) -> typing.Generator[typing.Tuple[int, int]]: + def _run(self) -> None: """Run the function""" with ( self.session() as session, @@ -1098,73 +1066,55 @@ class DecodePhoneFunction(KaldiFunction): Arguments for the function """ - progress_pattern = re.compile( - r"^LOG.*Log-like per frame for utterance (?P.*) is (?P[-\d.]+) over (?P\d+) frames." - ) - def __init__(self, args: DecodePhoneArguments): super().__init__(args) - self.dictionaries = args.dictionaries - self.feature_strings = args.feature_strings - self.lat_paths = args.lat_paths - self.phone_symbol_path = args.phone_symbol_path + self.working_directory = args.working_directory self.hclg_path = args.hclg_path self.decode_options = args.decode_options self.model_path = args.model_path - def _run(self) -> typing.Generator[typing.Tuple[str, float, int]]: + def _run(self) -> None: """Run the function""" - with self.session() as session, mfa_open(self.log_path, "w") as log_file: + with ( + self.session() as session, + thread_logger("kalpy.decode", self.log_path, job_name=self.job_name) as decode_logger, + ): + job: Job = ( + session.query(Job) + .options(joinedload(Job.corpus, innerjoin=True), subqueryload(Job.dictionaries)) + .filter(Job.id == self.job_name) + .first() + ) + silence_phones = [ + x + for x, in session.query(Phone.mapping_id).filter( + Phone.phone_type.in_([PhoneType.silence, PhoneType.oov]) + ) + ] phones = session.query(Phone.mapping_id, Phone.phone) reversed_phone_mapping = {} for p_id, phone in phones: reversed_phone_mapping[p_id] = phone - for dict_id in self.dictionaries: - feature_string = self.feature_strings[dict_id] - lat_path = self.lat_paths[dict_id] - if os.path.exists(lat_path): - continue - if ( - self.decode_options["uses_speaker_adaptation"] - and self.decode_options["first_beam"] is not None - ): - beam = self.decode_options["first_beam"] - else: - beam = self.decode_options["beam"] - if ( - self.decode_options["uses_speaker_adaptation"] - and self.decode_options["first_max_active"] is not None - ): - max_active = self.decode_options["first_max_active"] - else: - max_active = self.decode_options["max_active"] - decode_proc = subprocess.Popen( - [ - thirdparty_binary("gmm-latgen-faster"), - f"--max-active={max_active}", - f"--beam={beam}", - f"--lattice-beam={self.decode_options['lattice_beam']}", - "--allow-partial=true", - f"--word-symbol-table={self.phone_symbol_path}", - f"--acoustic-scale={self.decode_options['acoustic_scale']}", - self.model_path, - self.hclg_path, - feature_string, - f"ark:{lat_path}", - ], - stderr=subprocess.PIPE, - env=os.environ, - encoding="utf8", + hclg_fst = ConstFst.Read(str(self.hclg_path)) + for d in job.dictionaries: + decode_logger.debug(f"Decoding for dictionary {d.id}") + decode_logger.debug(f"Decoding with model: {self.model_path}") + dict_id = d.id + feature_archive = job.construct_feature_archive(self.working_directory, dict_id) + lat_path = job.construct_path(self.working_directory, "lat", "ark", dict_id) + alignment_file_name = 
job.construct_path( + self.working_directory, "ali", "ark", dict_id + ) + words_path = job.construct_path(self.working_directory, "words", "ark", dict_id) + + boost_silence = self.decode_options.pop("boost_silence", 1.0) + decoder = GmmDecoder(self.model_path, hclg_fst, **self.decode_options) + if boost_silence != 1.0: + decoder.boost_silence(boost_silence, silence_phones) + decoder.export_lattices( + lat_path, + feature_archive, + word_file_name=words_path, + alignment_file_name=alignment_file_name, + callback=self.callback, ) - for line in decode_proc.stderr: - log_file.write(line) - m = self.progress_pattern.match(line.strip()) - if m: - self.callback( - ( - m.group("utterance"), - float(m.group("loglike")), - int(m.group("num_frames")), - ) - ) - self.check_call(decode_proc) diff --git a/montreal_forced_aligner/transcription/transcriber.py b/montreal_forced_aligner/transcription/transcriber.py index bdeb9e75..0e15560e 100644 --- a/montreal_forced_aligner/transcription/transcriber.py +++ b/montreal_forced_aligner/transcription/transcriber.py @@ -783,17 +783,11 @@ def decode_arguments( Arguments for processing """ arguments = [] + decode_options = self.decode_options + if not self.uses_speaker_adaptation: + decode_options["max_active"] = self.first_max_active + decode_options["beam"] = self.first_beam for j in self.jobs: - feat_strings = {} - for d_id in j.dictionary_ids: - feat_strings[d_id] = j.construct_feature_proc_string( - self.working_directory, - d_id, - self.feature_options["uses_splices"], - self.feature_options["splice_left_context"], - self.feature_options["splice_right_context"], - self.feature_options["uses_speaker_adaptation"], - ) if workflow is WorkflowType.per_speaker_transcription: arguments.append( PerSpeakerDecodeArguments( @@ -803,7 +797,7 @@ def decode_arguments( self.working_directory, self.model_path, self.tree_path, - self.decode_options, + decode_options, self.order, self.method, ) @@ -814,20 +808,13 @@ def decode_arguments( j.id, getattr(self, "session", ""), self.working_log_directory.joinpath(f"decode.{j.id}.log"), - j.dictionary_ids, - feat_strings, - self.decode_options, + self.working_directory, self.alignment_model_path, - j.construct_path_dictionary(self.working_directory, "lat", "ark"), - self.phone_symbol_table_path, self.working_directory.joinpath("HCLG_phone.fst"), + decode_options, ) ) else: - decode_options = self.decode_options - if not self.uses_speaker_adaptation: - decode_options["max_active"] = self.first_max_active - decode_options["beam"] = self.first_beam arguments.append( DecodeArguments( j.id, @@ -898,16 +885,6 @@ def initial_fmllr_arguments(self) -> List[InitialFmllrArguments]: """ arguments = [] for j in self.jobs: - feat_strings = {} - for d_id in j.dictionary_ids: - feat_strings[d_id] = j.construct_feature_proc_string( - self.working_directory, - d_id, - self.feature_options["uses_splices"], - self.feature_options["splice_left_context"], - self.feature_options["splice_right_context"], - self.feature_options["uses_speaker_adaptation"], - ) arguments.append( InitialFmllrArguments( j.id, @@ -931,16 +908,6 @@ def final_fmllr_arguments(self) -> List[FinalFmllrArguments]: """ arguments = [] for j in self.jobs: - feat_strings = {} - for d_id in j.dictionary_ids: - feat_strings[d_id] = j.construct_feature_proc_string( - self.working_directory, - d_id, - self.feature_options["uses_splices"], - self.feature_options["splice_left_context"], - self.feature_options["splice_right_context"], - self.feature_options["uses_speaker_adaptation"], - ) 
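            # Per-dictionary feature strings are no longer assembled here; the workers
            # derive features from the working directory themselves (see
            # Job.construct_feature_archive as used in the decode functions above).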
arguments.append( FinalFmllrArguments( j.id, diff --git a/montreal_forced_aligner/vad/multiprocessing.py b/montreal_forced_aligner/vad/multiprocessing.py index 95e33cb2..55c697aa 100644 --- a/montreal_forced_aligner/vad/multiprocessing.py +++ b/montreal_forced_aligner/vad/multiprocessing.py @@ -6,18 +6,28 @@ from pathlib import Path from typing import TYPE_CHECKING, List, Union -import librosa import numpy import numpy as np -import pynini -import pywrapfst +from _kalpy.decoder import LatticeFasterDecoder, LatticeFasterDecoderConfig +from _kalpy.fstext import GetLinearSymbolSequence +from _kalpy.gmm import DecodableAmDiagGmmScaled +from _kalpy.matrix import DoubleMatrix, FloatMatrix from _kalpy.util import SequentialBaseFloatVectorReader -from Bio import pairwise2 -from kalpy.utils import generate_read_specifier +from kalpy.data import Segment +from kalpy.decoder.training_graphs import TrainingGraphCompiler +from kalpy.feat.cmvn import CmvnComputer +from kalpy.feat.mfcc import MfccComputer +from kalpy.feat.vad import VadComputer +from kalpy.fstext.lexicon import LexiconCompiler +from kalpy.utils import generate_read_specifier, read_kaldi_object +from kalpy.utterance import Utterance as KalpyUtterance +from sqlalchemy.orm import joinedload, subqueryload from montreal_forced_aligner.abc import KaldiFunction from montreal_forced_aligner.data import CtmInterval, MfaArguments -from montreal_forced_aligner.db import SoundFile, Utterance +from montreal_forced_aligner.db import File, Job, Speaker, Utterance +from montreal_forced_aligner.exceptions import SegmenterError +from montreal_forced_aligner.models import AcousticModel try: import warnings @@ -44,6 +54,18 @@ else: from dataclassy import dataclass +__all__ = [ + "SegmentTranscriptArguments", + "SegmentVadArguments", + "SegmentTranscriptFunction", + "SegmentVadFunction", + "get_initial_segmentation", + "merge_segments", + "segment_utterance_transcript", + "segment_utterance_vad", + "segment_utterance_vad_speech_brain", +] + @dataclass class SegmentVadArguments(MfaArguments): @@ -53,6 +75,160 @@ class SegmentVadArguments(MfaArguments): segmentation_options: MetaDict +@dataclass +class SegmentTranscriptArguments(MfaArguments): + """Arguments for :class:`~montreal_forced_aligner.segmenter.SegmentTranscriptFunction`""" + + acoustic_model: AcousticModel + vad_model: typing.Optional[VAD] + lexicon_compilers: typing.Dict[int, LexiconCompiler] + mfcc_options: MetaDict + vad_options: MetaDict + segmentation_options: MetaDict + decode_options: MetaDict + + +def segment_utterance_transcript( + acoustic_model: AcousticModel, + utterance: KalpyUtterance, + lexicon_compiler: LexiconCompiler, + vad_model: VAD, + segmentation_options: MetaDict, + cmvn: DoubleMatrix = None, + fmllr_trans: FloatMatrix = None, + mfcc_options: MetaDict = None, + vad_options: MetaDict = None, + acoustic_scale: float = 0.1, + beam: float = 16.0, + lattice_beam: float = 10.0, + max_active: int = 7000, + min_active: int = 200, + prune_interval: int = 25, + beam_delta: float = 0.5, + hash_ratio: float = 2.0, + prune_scale: float = 0.1, + boost_silence: float = 1.0, +): + """ + Split an utterance and its transcript into multiple transcribed utterances + + Parameters + ---------- + acoustic_model: :class:`~montreal_forced_aligner.models.AcousticModel` + Acoustic model to use in splitting transcriptions + utterance: :class:`~kalpy.utterance.Utterance` + Utterance to split + lexicon_compiler :class:`~kalpy.fstext.lexicon.LexiconCompiler` + Lexicon compiler + vad_model 
:class:`~speechbrain.pretrained.VAD` or None + VAD model from SpeechBrain, if None, then Kaldi's energy-based VAD is used + segmentation_options: dict[str, Any] + Segmentation options + cmvn: :class:`~_kalpy.matrix.DoubleMatrix` + CMVN stats to apply + fmllr_trans: :class:`~_kalpy.matrix.FloatMatrix` + fMLLR transformation matrix for speaker adaptation + mfcc_options: dict[str, Any], optional + MFCC options for energy based VAD + vad_options: dict[str, Any], optional + Options for energy based VAD + acoustic_scale: float, optional + Defaults to 0.1 + beam: float, optional + Defaults to 16 + lattice_beam: float, optional + Defaults to 10 + max_active: int, optional + Defaults to 7000 + min_active: int, optional + Defaults to 250 + prune_interval: int, optional + Defaults to 25 + beam_delta: float, optional + Defaults to 0.5 + hash_ratio: float, optional + Defaults to 2.0 + prune_scale: float, optional + Defaults to 0.1 + boost_silence: float, optional + Defaults to 1.0 + + Returns + ------- + list[:class:`~kalpy.utterance.Utterance`] + Split utterances + + """ + graph_compiler = TrainingGraphCompiler( + acoustic_model.alignment_model_path, + acoustic_model.tree_path, + lexicon_compiler, + lexicon_compiler.word_table, + ) + if utterance.cmvn_string: + cmvn = read_kaldi_object(DoubleMatrix, utterance.cmvn_string) + if utterance.fmllr_string: + fmllr_trans = read_kaldi_object(FloatMatrix, utterance.fmllr_string) + if cmvn is None and acoustic_model.uses_cmvn: + utterance.generate_mfccs(acoustic_model.mfcc_computer) + cmvn_computer = CmvnComputer() + cmvn = cmvn_computer.compute_cmvn_from_features([utterance.mfccs]) + current_transcript = utterance.transcript + if vad_model is None: + segments = segment_utterance_vad( + utterance, mfcc_options, vad_options, segmentation_options + ) + else: + segments = segment_utterance_vad_speech_brain(utterance, vad_model, segmentation_options) + + config = LatticeFasterDecoderConfig() + config.beam = beam + config.lattice_beam = lattice_beam + config.max_active = max_active + config.min_active = min_active + config.prune_interval = prune_interval + config.beam_delta = beam_delta + config.hash_ratio = hash_ratio + config.prune_scale = prune_scale + new_utts = [] + am, transition_model = acoustic_model.acoustic_model, acoustic_model.transition_model + if boost_silence != 1.0: + am.boost_silence(transition_model, lexicon_compiler.silence_symbols, boost_silence) + for seg in segments: + new_utt = KalpyUtterance(seg, current_transcript) + new_utt.generate_mfccs(acoustic_model.mfcc_computer) + if acoustic_model.uses_cmvn: + new_utt.apply_cmvn(cmvn) + feats = new_utt.generate_features( + acoustic_model.mfcc_computer, + acoustic_model.pitch_computer, + lda_mat=acoustic_model.lda_mat, + fmllr_trans=fmllr_trans, + ) + fst = graph_compiler.compile_fst(new_utt.transcript) + decodable = DecodableAmDiagGmmScaled(am, transition_model, feats, acoustic_scale) + + d = LatticeFasterDecoder(fst, config) + ans = d.Decode(decodable) + if not ans: + raise SegmenterError(f"Did not successfully decode: {current_transcript}") + ans, decoded = d.GetBestPath() + if decoded.NumStates() == 0: + raise SegmenterError("Error getting best path from decoder for utterance") + alignment, words, weight = GetLinearSymbolSequence(decoded) + + words = words[:-1] + transcript = " ".join([lexicon_compiler.word_table.find(x) for x in words]) + + new_utt.transcript = transcript + current_transcript = " ".join(current_transcript.split()[len(words) :]) + new_utt.mfccs = None + new_utt.cmvn_string = 
utterance.cmvn_string + new_utt.fmllr_string = utterance.fmllr_string + new_utts.append(new_utt) + return new_utts + + def get_initial_segmentation(frames: numpy.ndarray, frame_shift: float) -> List[CtmInterval]: """ Compute initial segmentation over voice activity @@ -140,193 +316,39 @@ def merge_segments( return [x for x in merged_segments if x.end - x.begin > min_segment_length] -def construct_utterance_segmentation_fst( - text: str, - word_symbol_table: pywrapfst.SymbolTable, - interjection_words: typing.List[str] = None, -): - if interjection_words is None: - interjection_words = [] - words = text.split() - fst = pynini.Fst() - start_state = fst.add_state() - fst.set_start(start_state) - fst.add_states(len(words)) - - for i, w in enumerate(words): - next_state = i + 1 - label = word_symbol_table.find(w) - if i != 0: - fst.add_arc( - start_state, - pywrapfst.Arc(label, label, pywrapfst.Weight.one(fst.weight_type()), next_state), - ) - fst.add_arc( - i, pywrapfst.Arc(label, label, pywrapfst.Weight.one(fst.weight_type()), next_state) - ) - - fst.set_final(next_state, pywrapfst.Weight(fst.weight_type(), 1)) - for interjection in interjection_words: - start_interjection_state = fst.add_state() - fst.add_arc( - next_state, - pywrapfst.Arc( - word_symbol_table.find(""), - word_symbol_table.find(""), - pywrapfst.Weight(fst.weight_type(), 10), - start_interjection_state, - ), - ) - if " " in interjection: - i_words = interjection.split() - for j, iw in enumerate(i_words): - next_interjection_state = fst.add_state() - if j == 0: - prev_state = start_interjection_state - else: - prev_state = next_interjection_state - 1 - label = word_symbol_table.find(iw) - weight = pywrapfst.Weight.one(fst.weight_type()) - fst.add_arc( - prev_state, pywrapfst.Arc(label, label, weight, next_interjection_state) - ) - final_interjection_state = next_interjection_state - else: - final_interjection_state = fst.add_state() - label = word_symbol_table.find(interjection) - weight = pywrapfst.Weight.one(fst.weight_type()) - fst.add_arc( - start_interjection_state, - pywrapfst.Arc(label, label, weight, final_interjection_state), - ) - # Path to next word in text - weight = pywrapfst.Weight.one(fst.weight_type()) - fst.add_arc( - final_interjection_state, - pywrapfst.Arc( - word_symbol_table.find(""), - word_symbol_table.find(""), - weight, - next_state, - ), - ) - for interjection in interjection_words: - start_interjection_state = fst.add_state() - fst.add_arc( - start_state, - pywrapfst.Arc( - word_symbol_table.find(""), - word_symbol_table.find(""), - pywrapfst.Weight(fst.weight_type(), 10), - start_interjection_state, - ), - ) - if " " in interjection: - i_words = interjection.split() - for j, iw in enumerate(i_words): - next_interjection_state = fst.add_state() - if j == 0: - prev_state = start_interjection_state - else: - prev_state = next_interjection_state - 1 - label = word_symbol_table.find(iw) - weight = pywrapfst.Weight.one(fst.weight_type()) - fst.add_arc( - prev_state, pywrapfst.Arc(label, label, weight, next_interjection_state) - ) - final_interjection_state = next_interjection_state - else: - final_interjection_state = fst.add_state() - label = word_symbol_table.find(interjection) - weight = pywrapfst.Weight.one(fst.weight_type()) - fst.add_arc( - start_interjection_state, - pywrapfst.Arc(label, label, weight, final_interjection_state), - ) - # Path to next word in text - weight = pywrapfst.Weight.one(fst.weight_type()) - fst.add_arc( - final_interjection_state, - pywrapfst.Arc( - 
word_symbol_table.find(""), - word_symbol_table.find(""), - weight, - start_state, - ), - ) - fst.set_final(next_state, pywrapfst.Weight.one(fst.weight_type())) - fst = pynini.determinize(fst) - fst = pynini.rmepsilon(fst) - fst = pynini.disambiguate(fst) - fst = pynini.determinize(fst) - return fst - - -def align_text(split_utterance_texts, text, oovs, oov_word, interjection_words): - text = text.split() - split_utterance_text = [] - lengths = [] - indices = list(split_utterance_texts.keys()) - for t in split_utterance_texts.values(): - t = t.split() - lengths.append(len(t)) - split_utterance_text.extend(t) - - def score_func(first_element, second_element): - if first_element == second_element: - return 0 - if first_element == oov_word and second_element in oovs: - return 0 - if first_element == oov_word and second_element not in oovs: - return -10 - if first_element in interjection_words: - return -10 - return -2 - - alignments = pairwise2.align.globalcs( - split_utterance_text, text, score_func, -0.5, -0.1, gap_char=["-"], one_alignment_only=True +def segment_utterance_vad( + utterance: KalpyUtterance, + mfcc_options: MetaDict, + vad_options: MetaDict, + segmentation_options: MetaDict, +) -> typing.List[Segment]: + mfcc_computer = MfccComputer(**mfcc_options) + vad_computer = VadComputer(**vad_options) + feats = mfcc_computer.compute_mfccs_for_export(utterance.segment, compress=False) + vad = vad_computer.compute_vad(feats).numpy() + segments = get_initial_segmentation(vad, mfcc_computer.frame_shift) + segments = merge_segments( + segments, + segmentation_options["close_th"], + segmentation_options["large_chunk_size"], + segmentation_options["len_th"], ) - results = [[]] - split_ind = 0 - current_size = 0 - for a in alignments: - for i, sa in enumerate(a.seqA): - sb = a.seqB[i] - if sa == "": - sa = sb - if sa != "-": - if ( - split_ind < len(lengths) - 1 - and sa not in split_utterance_texts[indices[split_ind]].split() - and split_utterance_texts[indices[split_ind + 1]].split()[0] == sa - ): - results.append([]) - split_ind += 1 - current_size = 0 - results[-1].append(sa) - current_size += 1 - if split_ind < len(lengths) - 1 and current_size >= lengths[split_ind]: - results.append([]) - split_ind += 1 - current_size = 0 - elif sb != "-": - results[-1].append(sb) - results = {k: " ".join(r) for k, r in zip(split_utterance_texts.keys(), results)} - return results + new_segments = [] + for s in segments: + seg = Segment( + utterance.segment.file_path, + s.begin + utterance.segment.begin, + s.end + utterance.segment.begin, + utterance.segment.channel, + ) + new_segments.append(seg) + return new_segments def segment_utterance_vad_speech_brain( - utterance: Utterance, sound_file: SoundFile, vad_model: VAD, segmentation_options: MetaDict -) -> np.ndarray: - y, _ = librosa.load( - sound_file.sound_file_path, - sr=16000, - mono=False, - offset=utterance.begin, - duration=utterance.duration, - ) - if len(y.shape) > 1: - y = y[:, utterance.channel] + utterance: KalpyUtterance, vad_model: VAD, segmentation_options: MetaDict +) -> typing.List[Segment]: + y = utterance.segment.wave prob_chunks = vad_model.get_speech_prob_chunk( torch.tensor(y[np.newaxis, :], device=vad_model.device) ).cpu() @@ -337,11 +359,12 @@ def segment_utterance_vad_speech_brain( ).float() # Compute the boundaries of the speech segments boundaries = vad_model.get_boundaries(prob_th, output_value="seconds") - boundaries += utterance.begin + boundaries += utterance.segment.begin + # Apply energy-based VAD on the detected speech 
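    # The coarse SpeechBrain boundaries are refined below: optional energy-based VAD
    # inside each detected region (en_activation_th / en_deactivation_th), merging of
    # pauses shorter than close_th, removal of segments shorter than len_th, an optional
    # double check of the speech probability, and finally padding each boundary by
    # close_th / 2 before converting the result into kalpy Segments.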
segments - if True or segmentation_options["apply_energy_VAD"]: + if segmentation_options["apply_energy_VAD"]: boundaries = vad_model.energy_VAD( - sound_file.sound_file_path, + utterance.segment.file_path, boundaries, activation_th=segmentation_options["en_activation_th"], deactivation_th=segmentation_options["en_deactivation_th"], @@ -358,11 +381,19 @@ def segment_utterance_vad_speech_brain( # Double check speech segments if segmentation_options["double_check"]: boundaries = vad_model.double_check_speech_segments( - boundaries, sound_file.sound_file_path, speech_th=segmentation_options["speech_th"] + boundaries, utterance.segment.file_path, speech_th=segmentation_options["speech_th"] ) - boundaries[:, 0] -= round(segmentation_options["close_th"] / 3, 3) - boundaries[:, 1] += round(segmentation_options["close_th"] / 3, 3) - return boundaries.numpy() + boundaries[:, 0] -= round(segmentation_options["close_th"] / 2, 3) + boundaries[:, 1] += round(segmentation_options["close_th"] / 2, 3) + boundaries = boundaries.numpy() + segments = [] + for i in range(boundaries.shape[0]): + begin, end = boundaries[i] + begin = max(begin, 0) + end = min(end, utterance.segment.end) + seg = Segment(utterance.segment.file_path, begin, end, utterance.segment.channel) + segments.append(seg) + return segments class SegmentVadFunction(KaldiFunction): @@ -373,7 +404,7 @@ class SegmentVadFunction(KaldiFunction): -------- :meth:`montreal_forced_aligner.segmenter.Segmenter.segment_vad` Main function that calls this function in parallel - :meth:`montreal_forced_aligner.segmenter.Segmenter.segment_vad_arguments` + :meth:`montreal_forced_aligner.segmenter.VadSegmenter.segment_vad_arguments` Job method for generating arguments for this function :kaldi_utils:`segmentation.pl` Kaldi utility @@ -409,3 +440,72 @@ def _run(self): self.callback((int(utt_id.split("-")[-1]), merged)) reader.Next() reader.Close() + + +class SegmentTranscriptFunction(KaldiFunction): + """ + Multiprocessing function to segment utterances with transcripts from VAD output. 
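    For each utterance in a job, the audio and transcript are handed to
    `segment_utterance_transcript` (defined above), which splits the audio on
    VAD-detected pauses (SpeechBrain VAD when available, otherwise energy-based VAD),
    compiles a decoding graph from the remaining transcript for each split, decodes it,
    and assigns the recognized words to that split before moving on to the rest of the
    transcript.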
+ + See Also + -------- + :meth:`montreal_forced_aligner.segmenter.Segmenter.segment_vad` + Main function that calls this function in parallel + :meth:`montreal_forced_aligner.segmenter.TranscriptionSegmenter.segment_transcript_arguments` + Job method for generating arguments for this function + :kaldi_utils:`segmentation.pl` + Kaldi utility + + Parameters + ---------- + args: :class:`~montreal_forced_aligner.segmenter.SegmentTranscriptArguments` + Arguments for the function + """ + + def __init__(self, args: SegmentTranscriptArguments): + super().__init__(args) + self.acoustic_model = args.acoustic_model + self.vad_model = args.vad_model + self.lexicon_compilers = args.lexicon_compilers + self.segmentation_options = args.segmentation_options + self.mfcc_options = args.mfcc_options + self.vad_options = args.vad_options + self.decode_options = args.decode_options + self.speechbrain = self.vad_model is not None + + def _run(self): + """Run the function""" + with (self.session() as session): + job: Job = ( + session.query(Job) + .options(joinedload(Job.corpus, innerjoin=True), subqueryload(Job.dictionaries)) + .filter(Job.id == self.job_name) + .first() + ) + + for d in job.dictionaries: + utterances = ( + session.query(Utterance) + .join(Utterance.speaker) + .options( + joinedload(Utterance.file).joinedload(File.sound_file), + joinedload(Utterance.speaker), + ) + .filter( + Utterance.job_id == self.job_name, + Utterance.duration >= 0.1, + Speaker.dictionary_id == d.id, + ) + .order_by(Utterance.kaldi_id) + ) + for u in utterances: + new_utterances = segment_utterance_transcript( + self.acoustic_model, + u.to_kalpy(), + self.lexicon_compilers[d.id], + self.vad_model if self.speechbrain else None, + self.segmentation_options, + mfcc_options=self.mfcc_options if not self.speechbrain else None, + vad_options=self.vad_options if not self.speechbrain else None, + **self.decode_options, + ) + self.callback((u.id, new_utterances)) diff --git a/montreal_forced_aligner/vad/segmenter.py b/montreal_forced_aligner/vad/segmenter.py index f9379c2d..5824246e 100644 --- a/montreal_forced_aligner/vad/segmenter.py +++ b/montreal_forced_aligner/vad/segmenter.py @@ -30,13 +30,16 @@ from montreal_forced_aligner.vad.multiprocessing import ( FOUND_SPEECHBRAIN, VAD, + SegmentTranscriptArguments, + SegmentTranscriptFunction, SegmentVadArguments, SegmentVadFunction, + segment_utterance_transcript, ) SegmentationType = List[Dict[str, float]] -__all__ = ["Segmenter", "SpeechbrainSegmenterMixin", "TranscriptionSegmenter"] +__all__ = ["VadSegmenter", "SpeechbrainSegmenterMixin", "TranscriptionSegmenter"] logger = logging.getLogger("mfa") @@ -50,13 +53,13 @@ def __init__( overlap_small_chunk: bool = False, apply_energy_vad: bool = False, double_check: bool = True, - close_th: float = 0.250, - len_th: float = 0.250, + close_th: float = 0.333, + len_th: float = 0.333, activation_th: float = 0.5, deactivation_th: float = 0.25, en_activation_th: float = 0.5, - en_deactivation_th: float = 0.0, - speech_th: float = 0.50, + en_deactivation_th: float = 0.4, + speech_th: float = 0.5, cuda: bool = False, speechbrain: bool = False, **kwargs, @@ -82,6 +85,7 @@ def __init__( self.cuda = cuda self.speechbrain = speechbrain self.segment_padding = segment_padding + self.vad_model = None if self.speechbrain: model_dir = os.path.join(config.TEMPORARY_DIRECTORY, "models", "VAD") os.makedirs(model_dir, exist_ok=True) @@ -112,7 +116,7 @@ def segmentation_options(self) -> MetaDict: } -class Segmenter( +class VadSegmenter( VadConfigMixin, 
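    # VadSegmenter (formerly Segmenter) performs transcript-less VAD segmentation;
    # transcript-aware splitting is handled by TranscriptionSegmenter further below.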
AcousticCorpusMixin, FileExporterMixin, @@ -430,10 +434,14 @@ def export_files(self, output_directory: str, output_format: Optional[str] = Non f.save(output_directory, output_format=output_format) -class TranscriptionSegmenter(TranscriberMixin, SpeechbrainSegmenterMixin, TopLevelMfaWorker): +class TranscriptionSegmenter( + VadConfigMixin, TranscriberMixin, SpeechbrainSegmenterMixin, TopLevelMfaWorker +): def __init__(self, acoustic_model_path: Path = None, **kwargs): self.acoustic_model = AcousticModel(acoustic_model_path) kw = self.acoustic_model.parameters + kw["apply_energy_vad"] = True + kw["apply_energy_vad"] = True kw.update(kwargs) super().__init__(**kw) @@ -451,10 +459,151 @@ def setup(self) -> None: self.normalize_text() - self.write_lexicon_information(write_disambiguation=True) + self.write_lexicon_information(write_disambiguation=False) def setup_acoustic_model(self): self.acoustic_model.validate(self) self.acoustic_model.export_model(self.model_directory) self.acoustic_model.export_model(self.working_directory) self.acoustic_model.log_details() + + def segment(self): + """ + Performs VAD and segmentation into utterances + + Raises + ------ + :class:`~montreal_forced_aligner.exceptions.KaldiProcessingError` + If there were any errors in running Kaldi binaries + """ + self.setup() + self.create_new_current_workflow(WorkflowType.segmentation) + wf = self.current_workflow + if wf.done: + logger.info("Segmentation already done, skipping.") + return + try: + self.segment_transcripts() + with self.session() as session: + session.query(CorpusWorkflow).filter(CorpusWorkflow.id == wf.id).update( + {"done": True} + ) + session.commit() + except Exception as e: + with self.session() as session: + session.query(CorpusWorkflow).filter(CorpusWorkflow.id == wf.id).update( + {"dirty": True} + ) + session.commit() + if isinstance(e, KaldiProcessingError): + log_kaldi_errors(e.error_logs) + e.update_log_file() + raise + + def segment_transcript_arguments(self) -> List[SegmentTranscriptArguments]: + """ + Generate Job arguments for :class:`~montreal_forced_aligner.segmenter.SegmentTranscriptFunction` + + Returns + ------- + list[SegmentTranscriptArguments] + Arguments for processing + """ + decode_options = self.decode_options + boost_silence = decode_options.pop("boost_silence", 1.0) + if boost_silence != 1.0: + self.acoustic_model.acoustic_model.boost_silence( + self.acoustic_model.transition_model, self.silence_symbols, boost_silence + ) + return [ + SegmentTranscriptArguments( + j.id, + getattr(self, "session", ""), + self.working_log_directory.joinpath(f"segment_vad.{j.id}.log"), + self.acoustic_model, + self.vad_model, + self.lexicon_compilers, + self.mfcc_options, + self.vad_options, + self.segmentation_options, + self.decode_options, + ) + for j in self.jobs + ] + + def segment_transcripts(self) -> None: + + arguments = self.segment_transcript_arguments() + old_utts = set() + new_utterance_mapping = [] + + with tqdm( + total=self.num_utterances, disable=config.QUIET + ) as pbar, self.session() as session: + utterances = session.query(Utterance.id, Utterance.speaker_id, Utterance.file_id) + utterance_cache = {} + for u_id, speaker_id, file_id in utterances: + utterance_cache[u_id] = (speaker_id, file_id) + for utt, new_utts in run_kaldi_function( + SegmentTranscriptFunction, arguments, pbar.update + ): + old_utts.add(utt) + speaker_id, file_id = utterance_cache[utt] + for new_utt in new_utts: + new_utterance_mapping.append( + { + "begin": new_utt.segment.begin, + "end": 
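                            # Each decoded sub-utterance inherits the speaker and file ids of the
                            # original utterance (cached above); the originals are deleted and the
                            # new rows bulk-inserted once the workers finish.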
new_utt.segment.end, + "speaker_id": speaker_id, + "file_id": file_id, + "oovs": "", + "text": new_utt.transcript, + "normalized_text": new_utt.transcript, + "features": "", + "in_subset": False, + "ignored": False, + "channel": new_utt.segment.channel, + } + ) + session.query(Utterance).filter(Utterance.id.in_(old_utts)).delete() + session.bulk_insert_mappings( + Utterance, new_utterance_mapping, return_defaults=False, render_nulls=True + ) + session.commit() + + def segment_transcript(self, utterance_id: int): + with self.session() as session: + utterance = session.get(Utterance, utterance_id) + new_utterances = segment_utterance_transcript( + self.acoustic_model, + utterance.to_kalpy(), + self.lexicon_compilers[utterance.speaker.dictionary_id], + self.vad_model if self.speechbrain else None, + self.segmentation_options, + mfcc_options=self.mfcc_options if not self.speechbrain else None, + vad_options=self.vad_options if not self.speechbrain else None, + **self.decode_options, + ) + return new_utterances + + def export_files(self, output_directory: str, output_format: Optional[str] = None) -> None: + """ + Export the results of segmentation as TextGrids + + Parameters + ---------- + output_directory: str + Directory to save segmentation TextGrids + output_format: str, optional + Format to force output files into + """ + if output_format is None: + output_format = TextFileType.TEXTGRID.value + os.makedirs(output_directory, exist_ok=True) + with self.session() as session: + for f in session.query(File).options( + selectinload(File.utterances).joinedload(Utterance.speaker, innerjoin=True), + joinedload(File.sound_file, innerjoin=True), + joinedload(File.text_file), + ): + f.save(output_directory, output_format=output_format) diff --git a/tests/test_commandline_create_segments.py b/tests/test_commandline_create_segments.py index 5aabbde8..bad52c07 100644 --- a/tests/test_commandline_create_segments.py +++ b/tests/test_commandline_create_segments.py @@ -18,7 +18,7 @@ def test_create_segments( output_path = generated_dir.joinpath("segment_output") shutil.rmtree(output_path, ignore_errors=True) command = [ - "segment", + "segment_vad", basic_corpus_dir, output_path, "-q", @@ -47,6 +47,79 @@ def test_create_segments_speechbrain( temp_dir, basic_segment_config_path, db_setup, +): + if not FOUND_SPEECHBRAIN: + pytest.skip("SpeechBrain not installed") + output_path = generated_dir.joinpath("segment_output") + command = [ + "segment_vad", + basic_corpus_dir, + output_path, + "-q", + "--clean", + "--no_debug", + "-v", + "--speechbrain", + "--config_path", + basic_segment_config_path, + ] + command = [str(x) for x in command] + result = click.testing.CliRunner(mix_stderr=False).invoke( + mfa_cli, command, catch_exceptions=True + ) + print(result.stdout) + print(result.stderr) + if result.exception: + print(result.exc_info) + raise result.exception + assert not result.return_value + assert os.path.exists(os.path.join(output_path, "michael", "acoustic_corpus.TextGrid")) + + +def test_create_segments_transcripts( + basic_corpus_dir, + english_mfa_acoustic_model, + english_us_mfa_reduced_dict, + generated_dir, + temp_dir, + basic_segment_config_path, + db_setup, +): + output_path = generated_dir.joinpath("segment_output") + command = [ + "segment", + basic_corpus_dir, + english_us_mfa_reduced_dict, + english_mfa_acoustic_model, + output_path, + "-q", + "--clean", + "--no_debug", + "-v", + "--config_path", + basic_segment_config_path, + ] + command = [str(x) for x in command] + result = 
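    # `mfa segment` now takes a dictionary and an acoustic model in addition to the
    # corpus directory; the transcript-less behavior is covered by the `segment_vad`
    # tests above.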
click.testing.CliRunner(mix_stderr=False).invoke( + mfa_cli, command, catch_exceptions=True + ) + print(result.stdout) + print(result.stderr) + if result.exception: + print(result.exc_info) + raise result.exception + assert not result.return_value + assert os.path.exists(os.path.join(output_path, "michael", "acoustic_corpus.TextGrid")) + + +def test_create_segments_transcripts_speechbrain( + basic_corpus_dir, + english_mfa_acoustic_model, + english_us_mfa_reduced_dict, + generated_dir, + temp_dir, + basic_segment_config_path, + db_setup, ): if not FOUND_SPEECHBRAIN: pytest.skip("SpeechBrain not installed") @@ -54,12 +127,15 @@ def test_create_segments_speechbrain( command = [ "segment", basic_corpus_dir, + english_us_mfa_reduced_dict, + english_mfa_acoustic_model, output_path, "-q", "--clean", "--no_debug", "-v", "--speechbrain", + "--no_use_mp", "--config_path", basic_segment_config_path, ] diff --git a/tests/test_commandline_train.py b/tests/test_commandline_train.py index 440685d1..f42c44e1 100644 --- a/tests/test_commandline_train.py +++ b/tests/test_commandline_train.py @@ -1,10 +1,12 @@ import os import click.testing +import pytest from montreal_forced_aligner.command_line.mfa import mfa_cli +@pytest.mark.skip("Inconsistent failing on CI") def test_train_acoustic_with_g2p( combined_corpus_dir, english_us_mfa_reduced_dict, @@ -28,6 +30,7 @@ def test_train_acoustic_with_g2p( "--clean", "--quiet", "--debug", + "--use_postgres", "--no_use_mp", "--config_path", train_g2p_acoustic_config_path, diff --git a/tests/test_segmentation.py b/tests/test_segmentation.py new file mode 100644 index 00000000..84b335e7 --- /dev/null +++ b/tests/test_segmentation.py @@ -0,0 +1,29 @@ +import pytest + +from montreal_forced_aligner.diarization.speaker_diarizer import FOUND_SPEECHBRAIN +from montreal_forced_aligner.vad.segmenter import TranscriptionSegmenter + + +def test_segment_transcript( + basic_corpus_dir, + english_mfa_acoustic_model, + english_us_mfa_reduced_dict, + generated_dir, + temp_dir, + basic_segment_config_path, + db_setup, +): + if not FOUND_SPEECHBRAIN: + pytest.skip("SpeechBrain not installed") + segmenter = TranscriptionSegmenter( + corpus_directory=basic_corpus_dir, + dictionary_path=english_us_mfa_reduced_dict, + acoustic_model_path=english_mfa_acoustic_model, + speechbrain=True, + en_activation_th=0.4, + en_deactivation_th=0.4, + ) + segmenter.setup() + new_utterances = segmenter.segment_transcript(1) + assert len(new_utterances) > 0 + segmenter.cleanup()