Further refactoring and fixes for anchor (#684)

* Further refactoring and fixes for anchor * Update segmenting of transcripts
MontrealCorpusTools · Aug 30, 2023 · 7fbccdd · 7fbccdd
1 parent 68e7244
commit 7fbccdd
Show file tree

Hide file tree

Showing 38 changed files with 1,696 additions and 599 deletions.
diff --git a/ci/docker_environment.yaml b/ci/docker_environment.yaml
@@ -39,5 +39,6 @@ dependencies:
   - rich
   - rich-click
   - kalpy
+  - spacy[ja]
   - pip:
       - speechbrain
diff --git a/docs/source/changelog/changelog_3.0.rst b/docs/source/changelog/changelog_3.0.rst
@@ -5,7 +5,23 @@
 3.0 Changelog
 *************
 
-3.0.0a
+3.0.0a4
+=======
+
+- Separate out segmentation functionality into :ref:`create_segments` and :ref:`create_segments_vad`
+- Fix a bug in :ref:`align_one` when specifying a ``config_path``
+
+3.0.0a3
+=======
+
+- Refactor tokenization for future spacy use
+
+3.0.0a2
+=======
+
+- Revamped how configuration is done following change to using threading instead of multiprocessing
+
+3.0.0a1
 =======
 
 - Add dependency on :xref:`kalpy` for interacting with Kaldi

diff --git a/docs/source/reference/dictionary/helper.rst b/docs/source/reference/dictionary/helper.rst
@@ -30,17 +30,6 @@ Mixins
 
    MultispeakerDictionaryMixin
 
-Helper
-------
-
-.. currentmodule:: montreal_forced_aligner.dictionary.mixins
-
-.. autosummary::
-   :toctree: generated/
-
-   SanitizeFunction
-   SplitWordsFunction
-
 
 Pronunciation probability functionality
 =======================================

diff --git a/docs/source/reference/segmentation/helper.rst b/docs/source/reference/segmentation/helper.rst
@@ -7,7 +7,12 @@ Helper functions
 .. autosummary::
    :toctree: generated/
 
+   SegmentTranscriptFunction
+   SegmentTranscriptArguments
    SegmentVadFunction
    SegmentVadArguments
    get_initial_segmentation
    merge_segments
+   segment_utterance_transcript
+   segment_utterance_vad
+   segment_utterance_vad_speech_brain
diff --git a/docs/source/reference/segmentation/main.rst b/docs/source/reference/segmentation/main.rst
@@ -7,4 +7,5 @@ Segmenter
 .. autosummary::
    :toctree: generated/
 
-   Segmenter
+   VadSegmenter
+   TranscriptionSegmenter
diff --git a/docs/source/reference/tokenization/helper.rst b/docs/source/reference/tokenization/helper.rst
@@ -12,3 +12,15 @@ Helper
    TokenizerRewriter
    TokenizerArguments
    TokenizerFunction
+
+
+Helper
+------
+
+.. currentmodule:: montreal_forced_aligner.tokenization.simple
+
+.. autosummary::
+   :toctree: generated/
+
+   SanitizeFunction
+   SplitWordsFunction
diff --git a/docs/source/reference/tokenization/tokenizer.rst b/docs/source/reference/tokenization/tokenizer.rst
@@ -11,3 +11,13 @@ Corpus tokenizer
 
    CorpusTokenizer
    TokenizerValidator
+
+Simple tokenizer
+================
+
+.. currentmodule:: montreal_forced_aligner.tokenization.simple
+
+.. autosummary::
+   :toctree: generated/
+
+   SimpleTokenizer
diff --git a/docs/source/user_guide/corpus_creation/create_segments.rst b/docs/source/user_guide/corpus_creation/create_segments.rst
@@ -1,11 +1,11 @@
 
 .. _create_segments:
 
-Create segments ``(mfa create_segments)``
-=========================================
+Segment transcribed files ``(mfa segment)``
+===========================================
 
 The Montreal Forced Aligner can use Voice Activity Detection (VAD) capabilities from :xref:`speechbrain` to generate segments from
-a longer sound file.
+a longer sound file, while attempting to segment transcripts as well.  If you do not have transcripts, see :ref:`create_segments_vad`.
 
 .. note::
 
@@ -15,7 +15,37 @@ Command reference
 -----------------
 
 .. click:: montreal_forced_aligner.command_line.create_segments:create_segments_cli
-   :prog: mfa create_segments
+   :prog: mfa segment
+   :nested: full
+
+
+Configuration reference
+-----------------------
+
+- :ref:`configuration_segmentation`
+
+API reference
+-------------
+
+- :ref:`segmentation_api`
+
+.. _create_segments_vad:
+
+Segment untranscribed files ``(mfa segment_vad)``
+=================================================
+
+The Montreal Forced Aligner can use Voice Activity Detection (VAD) capabilities from :xref:`speechbrain` or energy based VAD to generate segments from
+a longer sound file.  This command does not split transcripts, instead assigning a default label of "speech" to all identified speech segments.  If you would like to preserve transcripts for each segment, see :ref:`create_segments`.
+
+.. note::
+
+   On Windows, if you get an ``OSError/WinError 1314`` during the run, follow `these instructions <https://www.scivision.dev/windows-symbolic-link-permission-enable/>`_ to enable symbolic link creation permissions.
+
+Command reference
+-----------------
+
+.. click:: montreal_forced_aligner.command_line.create_segments:create_segments_vad_cli
+   :prog: mfa segment_vad
    :nested: full
 
 

diff --git a/environment.yml b/environment.yml
@@ -48,6 +48,7 @@ dependencies:
   - rich
   - rich-click
   - kalpy
+  - spacy
   - pip:
       - build
       - twine

diff --git a/montreal_forced_aligner/abc.py b/montreal_forced_aligner/abc.py
@@ -282,6 +282,7 @@ def initialize_database(self) -> None:
                 conn.execute(sqlalchemy.text("CREATE EXTENSION IF NOT EXISTS vector"))
                 conn.execute(sqlalchemy.text("CREATE EXTENSION IF NOT EXISTS pg_trgm"))
                 conn.execute(sqlalchemy.text("CREATE EXTENSION IF NOT EXISTS pg_stat_statements"))
+                conn.execute(sqlalchemy.text(f"select setseed({config.SEED/32768})"))
                 conn.commit()
 
         MfaSqlBase.metadata.create_all(self.db_engine)

diff --git a/montreal_forced_aligner/acoustic_modeling/lda.py b/montreal_forced_aligner/acoustic_modeling/lda.py
@@ -305,7 +305,6 @@ def lda_options(self) -> MetaDict:
         return {
             "lda_dimension": self.lda_dimension,
             "random_prune": self.random_prune,
-            "silence_csl": self.silence_csl,
             "splice_left_context": self.splice_left_context,
             "splice_right_context": self.splice_right_context,
         }

diff --git a/montreal_forced_aligner/alignment/adapting.py b/montreal_forced_aligner/alignment/adapting.py
@@ -78,16 +78,6 @@ def map_acc_stats_arguments(self, alignment=False) -> List[AccStatsArguments]:
             model_path = self.model_path
         arguments = []
         for j in self.jobs:
-            feat_strings = {}
-            for d_id in j.dictionary_ids:
-                feat_strings[d_id] = j.construct_feature_proc_string(
-                    self.working_directory,
-                    d_id,
-                    self.feature_options["uses_splices"],
-                    self.feature_options["splice_left_context"],
-                    self.feature_options["splice_right_context"],
-                    self.feature_options["uses_speaker_adaptation"],
-                )
             arguments.append(
                 AccStatsArguments(
                     j.id,

diff --git a/montreal_forced_aligner/alignment/mixins.py b/montreal_forced_aligner/alignment/mixins.py
@@ -107,11 +107,6 @@ def data_directory(self) -> str:
         """Corpus data directory"""
         ...
 
-    @abstractmethod
-    def construct_feature_proc_strings(self) -> typing.List[typing.Dict[str, str]]:
-        """Generate feature strings"""
-        ...
-
     def compile_train_graphs_arguments(self) -> typing.List[CompileTrainGraphsArguments]:
         """
         Generate Job arguments for :class:`~montreal_forced_aligner.alignment.multiprocessing.CompileTrainGraphsFunction`

diff --git a/montreal_forced_aligner/command_line/align_one.py b/montreal_forced_aligner/command_line/align_one.py
@@ -18,8 +18,17 @@
     validate_dictionary,
 )
 from montreal_forced_aligner.corpus.classes import FileData
+from montreal_forced_aligner.data import BRACKETED_WORD, CUTOFF_WORD, LAUGHTER_WORD, OOV_WORD
+from montreal_forced_aligner.dictionary.mixins import (
+    DEFAULT_BRACKETS,
+    DEFAULT_CLITIC_MARKERS,
+    DEFAULT_COMPOUND_MARKERS,
+    DEFAULT_PUNCTUATION,
+    DEFAULT_WORD_BREAK_MARKERS,
+)
 from montreal_forced_aligner.models import AcousticModel
 from montreal_forced_aligner.online.alignment import align_utterance_online
+from montreal_forced_aligner.tokenization.simple import SimpleTokenizer
 
 __all__ = ["align_one_cli"]
 
@@ -74,6 +83,18 @@ def align_one_cli(context, **kwargs) -> None:
     output_path: Path = kwargs["output_path"]
     output_format = kwargs["output_format"]
     c = PretrainedAligner.parse_parameters(config_path, context.params, context.args)
+    tokenizer = SimpleTokenizer(
+        word_break_markers=c.get("word_break_markers", DEFAULT_WORD_BREAK_MARKERS),
+        punctuation=c.get("punctuation", DEFAULT_PUNCTUATION),
+        clitic_markers=c.get("clitic_markers", DEFAULT_CLITIC_MARKERS),
+        compound_markers=c.get("compound_markers", DEFAULT_COMPOUND_MARKERS),
+        brackets=c.get("brackets", DEFAULT_BRACKETS),
+        laughter_word=c.get("laughter_word", LAUGHTER_WORD),
+        oov_word=c.get("oov_word", OOV_WORD),
+        bracketed_word=c.get("bracketed_word", BRACKETED_WORD),
+        cutoff_word=c.get("cutoff_word", CUTOFF_WORD),
+        ignore_case=c.get("ignore_case", True),
+    )
 
     acoustic_model = AcousticModel(acoustic_model_path)
     extracted_models_dir = config.TEMPORARY_DIRECTORY.joinpath("extracted_models", "dictionary")
@@ -95,7 +116,7 @@ def align_one_cli(context, **kwargs) -> None:
     l_align_fst_path = dictionary_directory.joinpath("L_align.fst")
     words_path = dictionary_directory.joinpath("words.txt")
     phones_path = dictionary_directory.joinpath("phones.txt")
-    if l_fst_path.exists():
+    if l_fst_path.exists() and not config.CLEAN:
         lexicon_compiler.load_l_from_file(l_fst_path)
         lexicon_compiler.load_l_align_from_file(l_align_fst_path)
         lexicon_compiler.word_table = pywrapfst.SymbolTable.read_text(words_path)
@@ -114,13 +135,14 @@ def align_one_cli(context, **kwargs) -> None:
     cmvn_computer = CmvnComputer()
     for utterance in file.utterances:
         seg = Segment(sound_file_path, utterance.begin, utterance.end, utterance.channel)
-        utt = KalpyUtterance(seg, utterance.text)
+        text, _, _ = tokenizer(utterance.text)
+        utt = KalpyUtterance(seg, text)
         utt.generate_mfccs(acoustic_model.mfcc_computer)
         utterances.append(utt)
     cmvn = cmvn_computer.compute_cmvn_from_features([utt.mfccs for utt in utterances])
     align_options = {
         k: v
-        for k, v in c
+        for k, v in c.items()
         if k
         in [
             "beam",

diff --git a/montreal_forced_aligner/command_line/create_segments.py b/montreal_forced_aligner/command_line/create_segments.py
@@ -6,10 +6,87 @@
 import rich_click as click
 
 from montreal_forced_aligner import config
-from montreal_forced_aligner.command_line.utils import common_options
-from montreal_forced_aligner.vad.segmenter import Segmenter
+from montreal_forced_aligner.command_line.utils import (
+    common_options,
+    validate_acoustic_model,
+    validate_dictionary,
+)
+from montreal_forced_aligner.vad.segmenter import TranscriptionSegmenter, VadSegmenter
+
+__all__ = ["create_segments_vad_cli", "create_segments_cli"]
+
+
+@click.command(
+    name="segment_vad",
+    context_settings=dict(
+        ignore_unknown_options=True,
+        allow_extra_args=True,
+        allow_interspersed_args=True,
+    ),
+    short_help="Split long audio files into shorter segments",
+)
+@click.argument(
+    "corpus_directory",
+    type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path),
+)
+@click.argument(
+    "output_directory", type=click.Path(file_okay=False, dir_okay=True, path_type=Path)
+)
+@click.option(
+    "--config_path",
+    "-c",
+    help="Path to config file to use for training.",
+    type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path),
+)
+@click.option(
+    "--output_format",
+    help="Format for aligned output files (default is long_textgrid).",
+    default="long_textgrid",
+    type=click.Choice(["long_textgrid", "short_textgrid", "json", "csv"]),
+)
+@click.option(
+    "--speechbrain/--no_speechbrain",
+    "speechbrain",
+    help="Flag for using SpeechBrain's pretrained VAD model",
+)
+@click.option(
+    "--cuda/--no_cuda",
+    "cuda",
+    help="Flag for using CUDA for SpeechBrain's model",
+)
+@click.option(
+    "--segment_transcripts/--no_segment_transcripts",
+    "segment_transcripts",
+    help="Flag for using CUDA for SpeechBrain's model",
+)
+@common_options
+@click.help_option("-h", "--help")
+@click.pass_context
+def create_segments_vad_cli(context, **kwargs) -> None:
+    """
+    Create segments based on SpeechBrain's voice activity detection (VAD) model or a basic energy-based algorithm
+    """
+    if kwargs.get("profile", None) is not None:
+        config.profile = kwargs.pop("profile")
+    config.update_configuration(kwargs)
 
-__all__ = ["create_segments_cli"]
+    config_path = kwargs.get("config_path", None)
+    corpus_directory = kwargs["corpus_directory"]
+    output_directory = kwargs["output_directory"]
+    output_format = kwargs["output_format"]
+
+    segmenter = VadSegmenter(
+        corpus_directory=corpus_directory,
+        **VadSegmenter.parse_parameters(config_path, context.params, context.args),
+    )
+    try:
+        segmenter.segment()
+        segmenter.export_files(output_directory, output_format)
+    except Exception:
+        segmenter.dirty = True
+        raise
+    finally:
+        segmenter.cleanup()
 
 
 @click.command(
@@ -25,6 +102,8 @@
     "corpus_directory",
     type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path),
 )
+@click.argument("dictionary_path", type=click.UNPROCESSED, callback=validate_dictionary)
+@click.argument("acoustic_model_path", type=click.UNPROCESSED, callback=validate_acoustic_model)
 @click.argument(
     "output_directory", type=click.Path(file_okay=False, dir_okay=True, path_type=Path)
 )
@@ -63,12 +142,16 @@ def create_segments_cli(context, **kwargs) -> None:
 
     config_path = kwargs.get("config_path", None)
     corpus_directory = kwargs["corpus_directory"]
+    dictionary_path = kwargs["dictionary_path"]
+    acoustic_model_path = kwargs["acoustic_model_path"]
     output_directory = kwargs["output_directory"]
     output_format = kwargs["output_format"]
 
-    segmenter = Segmenter(
+    segmenter = TranscriptionSegmenter(
         corpus_directory=corpus_directory,
-        **Segmenter.parse_parameters(config_path, context.params, context.args),
+        dictionary_path=dictionary_path,
+        acoustic_model_path=acoustic_model_path,
+        **TranscriptionSegmenter.parse_parameters(config_path, context.params, context.args),
     )
     try:
         segmenter.segment()
-Original file line number
+Diff line change
@@ Expand Up / @@ -48,6 +48,7 @@ dependencies: @@
       - rich
       - rich-click
       - kalpy
+      - spacy
       - pip:
           - build
           - twine
@@ Expand Down @@