Skip to content

Commit

Permalink
2.2.8 (#615)
Browse files Browse the repository at this point in the history
* Bug fixes

* Fix bug in filter training utterances
  • Loading branch information
mmcauliffe authored Apr 17, 2023
1 parent 2f5cdd0 commit 2569caa
Show file tree
Hide file tree
Showing 28 changed files with 284 additions and 341 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ RUN useradd -ms /bin/bash mfauser
RUN chown -R mfauser /mfa
RUN chown -R mfauser /env
USER mfauser
ENV MFA_ROOT_ENVIRONMENT_VARIABLE=/mfa
ENV MFA_ROOT_DIR=/mfa
RUN conda run -p /env mfa server init

RUN echo "source activate /env && mfa server start" > ~/.bashrc
Expand Down
7 changes: 7 additions & 0 deletions docs/source/changelog/changelog_2.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@
2.2 Changelog
*************

2.2.8
=====
- Fixed a bug introduced in 2.2.4 that made segments overlap with silence intervals when using textgrid cleanup
- Changed databases to always use the root MFA directory rather than relying on temporary directories, making it more consistent where database files and sockets are placed. This root directory can be changed via the environment variable :code:`MFA_ROOT_DIR`
- Optimized training graph and collecting alignments after changes to how unknown words were represented internally
- Changed feature generation to use piped audio loaded via PySoundFile rather than via calls to sox/ffmpeg directly

2.2.7
=====

Expand Down
4 changes: 2 additions & 2 deletions docs/source/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ A simple Dockerfile for installing MFA would be:
RUN chown -R mfauser /mfa
RUN chown -R mfauser /env
USER mfauser
ENV MFA_ROOT_ENVIRONMENT_VARIABLE=/mfa
ENV MFA_ROOT_DIR=/mfa
RUN conda run -p /env mfa server init
RUN echo "source activate /env && mfa server start" > ~/.bashrc
Expand All @@ -84,7 +84,7 @@ Crucially, note the useradd and subsequent user commands:
RUN chown -R mfauser /mfa
RUN chown -R mfauser /env
USER mfauser
ENV MFA_ROOT_ENVIRONMENT_VARIABLE=/mfa
ENV MFA_ROOT_DIR=/mfa
RUN conda run -p /env mfa server init
These lines ensure that the database is initialized without using Docker's default root user, avoiding a permissions error thrown by PostgreSQL.
Expand Down
3 changes: 3 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,18 @@ dependencies:
- python>=3.8
- numpy
- librosa
- pysoundfile
- tqdm
- requests
- pyyaml
- dataclassy
- kaldi=*=*cpu*
- sox
- ffmpeg
- scipy
- pynini
- openfst
- scikit-learn
- hdbscan
- baumwelch
- ngram
Expand Down
32 changes: 31 additions & 1 deletion montreal_forced_aligner/acoustic_modeling/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,14 @@
from montreal_forced_aligner.abc import KaldiFunction, ModelExporterMixin, TopLevelMfaWorker
from montreal_forced_aligner.config import GLOBAL_CONFIG
from montreal_forced_aligner.data import MfaArguments, WorkflowType
from montreal_forced_aligner.db import CorpusWorkflow, Dictionary, Job
from montreal_forced_aligner.db import (
CorpusWorkflow,
Dictionary,
Job,
Speaker,
Utterance,
bulk_update,
)
from montreal_forced_aligner.exceptions import ConfigError, KaldiProcessingError
from montreal_forced_aligner.helper import load_configuration, mfa_open, parse_old_features
from montreal_forced_aligner.models import AcousticModel, DictionaryModel
Expand Down Expand Up @@ -333,6 +340,28 @@ def setup_trainers(self):
wf.current = True
session.commit()

def filter_training_utterances(self):
    """Flag utterances that contain no dictionary words as ignored.

    For every dictionary returned by the session, the non-ignored
    utterances whose speaker is assigned to that dictionary are inspected.
    An utterance is marked ``ignored`` when none of the whitespace-separated
    tokens of its normalized text is present in the dictionary's word
    mapping. The flags are written back through ``bulk_update``.
    """
    logger.info("Filtering utterances with only unknown words...")
    with self.session() as session:
        for dictionary in session.query(Dictionary):
            known_words = dictionary.word_mapping
            candidates = (
                session.query(Utterance.id, Utterance.normalized_text)
                .join(Utterance.speaker)
                .filter(Utterance.ignored == False)  # noqa
                .filter(Speaker.dictionary_id == dictionary.id)
            )
            # Keep only utterances in which not a single token is known.
            to_ignore = [
                {"id": utterance_id, "ignored": True}
                for utterance_id, normalized in candidates
                if not any(token in known_words for token in normalized.split())
            ]
            if to_ignore:
                bulk_update(session, Utterance, to_ignore)
                # NOTE(review): the source's indentation was lost; the commit is
                # assumed to sit inside this branch — confirm against the repo.
                session.commit()

def setup(self) -> None:
"""Setup for acoustic model training"""
super().setup()
Expand All @@ -342,6 +371,7 @@ def setup(self) -> None:
try:
self.load_corpus()
self.setup_trainers()
self.filter_training_utterances()
except Exception as e:
if isinstance(e, KaldiProcessingError):
log_kaldi_errors(e.error_logs)
Expand Down
82 changes: 59 additions & 23 deletions montreal_forced_aligner/alignment/multiprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import pynini
import pywrapfst
import sqlalchemy
from pynini.lib import rewrite
from sqlalchemy.orm import Session, joinedload, selectinload, subqueryload

from montreal_forced_aligner.corpus.features import (
Expand Down Expand Up @@ -53,7 +54,7 @@
Word,
)
from montreal_forced_aligner.exceptions import AlignmentExportError, FeatureGenerationError
from montreal_forced_aligner.helper import mfa_open, split_phone_position
from montreal_forced_aligner.helper import align_pronunciations, mfa_open, split_phone_position
from montreal_forced_aligner.textgrid import (
construct_output_path,
construct_output_tiers,
Expand Down Expand Up @@ -95,6 +96,8 @@
"GeneratePronunciationsFunction",
]

logger = logging.getLogger("mfa")


def phones_to_prons(
text: str,
Expand All @@ -104,17 +107,23 @@ def phones_to_prons(
phone_symbol_table: pywrapfst.SymbolTableView,
optional_silence_phone: str,
transcription: bool = False,
clitic_marker=None,
clitic_marker: str = None,
oov_word: str = None,
use_g2p: bool = False,
):
if "<space>" in text:
if use_g2p:
words = [x.replace(" ", "") for x in text.split("<space>")]
else:
words = text.split()
word_begin = "#1"
word_end = "#2"
word_begin_symbol = phone_symbol_table.find(word_begin)
word_end_symbol = phone_symbol_table.find(word_end)
acceptor = pynini.accep(text, token_type=word_symbol_table)
if use_g2p:
kaldi_text = text
else:
kaldi_text = " ".join([x if word_symbol_table.member(x) else oov_word for x in words])
acceptor = pynini.accep(kaldi_text, token_type=word_symbol_table)
phone_to_word = pynini.compose(align_lexicon_fst, acceptor)
phone_fst = pynini.Fst()
current_state = phone_fst.add_state()
Expand Down Expand Up @@ -183,25 +192,27 @@ def phones_to_prons(
try:
path_string = pynini.shortestpath(lattice).project("input").string(phone_symbol_table)
except Exception:
logging.debug("For the text and intervals:")
logging.debug(text)
logging.debug([x.label for x in intervals])
logging.debug("There was an issue composing word and phone FSTs")
logging.debug("PHONE FST:")
logger.debug("For the text and intervals:")
logger.debug(text)
logger.debug(kaldi_text)
logger.debug([x.label for x in intervals])
logger.debug("There was an issue composing word and phone FSTs")
logger.debug("PHONE FST:")
phone_fst.set_input_symbols(phone_symbol_table)
phone_fst.set_output_symbols(phone_symbol_table)
logging.debug(phone_fst)
logging.debug("PHONE_TO_WORD FST:")
logger.debug(phone_fst)
logger.debug("PHONE_TO_WORD FST:")
phone_to_word.set_input_symbols(phone_symbol_table)
phone_to_word.set_output_symbols(word_symbol_table)
logging.debug(phone_to_word)
logger.debug(phone_to_word)
raise
path_string = path_string.replace(f"{word_end} {word_begin}", word_begin)
path_string = path_string.replace(f"{word_end}", word_begin)
path_string = re.sub(f"^{word_begin} ", "", path_string)
word_splits = re.split(rf" ?{word_begin} ?", path_string)
word_splits = [x.split() for x in word_splits if x != optional_silence_phone and x]

return list(zip(words, word_splits))
word_splits = [x.split() for x in word_splits if x != optional_silence_phone]
pronunciations = align_pronunciations(words, list(zip(words, word_splits)), oov_word)
return pronunciations


@dataclass
Expand Down Expand Up @@ -568,20 +579,27 @@ def _run(self) -> typing.Generator[typing.Tuple[int, int]]:
workflow.working_directory, f"{self.job_name}.ha_out_disambig.temp"
)
text_int_paths = job.per_dictionary_text_int_scp_paths
batch_size = 1000
if self.use_g2p:
import pynini
from pynini.lib import rewrite

from montreal_forced_aligner.g2p.generator import threshold_lattice_to_dfa

for d in job.dictionaries:
log_file.write(f"Compiling graphs for {d.name} ({d.id})...\n")
fst = pynini.Fst.read(d.lexicon_fst_path)
token_type = pynini.SymbolTable.read_text(d.grapheme_symbol_table_path)
words = d.word_mapping
if self.use_g2p:
token_type = pywrapfst.SymbolTable.read_text(d.grapheme_symbol_table_path)
text_column = Utterance.normalized_character_text
else:
token_type = pywrapfst.SymbolTable.read_text(d.words_symbol_path)
text_column = Utterance.normalized_text
fst.invert()
utterances = (
session.query(Utterance.kaldi_id, Utterance.normalized_character_text)
session.query(Utterance.kaldi_id, text_column)
.join(Utterance.speaker)
.filter(Utterance.ignored == False) # noqa
.filter(Utterance.normalized_character_text != "")
.filter(text_column != "")
.filter(Utterance.job_id == self.job_name)
.filter(Speaker.dictionary_id == d.id)
.order_by(Utterance.kaldi_id)
Expand All @@ -593,8 +611,19 @@ def _run(self) -> typing.Generator[typing.Tuple[int, int]]:
with mfa_open(fst_ark_path, "wb") as fst_output_file:
for utt_id, full_text in utterances:
try:
lattice = rewrite.rewrite_lattice(full_text, fst, token_type)
lattice = threshold_lattice_to_dfa(lattice, 2.0)
if self.use_g2p:
lattice = rewrite.rewrite_lattice(full_text, fst, token_type)
lattice = threshold_lattice_to_dfa(lattice, 2.0)
else:
text = " ".join(
[
x if x in words else d.oov_word
for x in full_text.split()
]
)
a = pynini.accep(text, token_type=token_type)
lattice = rewrite.rewrite_lattice(a, fst)
lattice.invert()
input = lattice.write_to_string()
except pynini.lib.rewrite.Error:
log_file.write(f'Error composing "{full_text}"\n')
Expand Down Expand Up @@ -703,6 +732,7 @@ def _run(self) -> typing.Generator[typing.Tuple[int, int]]:

else:
for d in job.dictionaries:
log_file.write(f"Compiling graphs for {d}")
fst_ark_path = job.construct_path(
workflow.working_directory, "fsts", "ark", d.id
)
Expand All @@ -711,6 +741,7 @@ def _run(self) -> typing.Generator[typing.Tuple[int, int]]:
[
thirdparty_binary("compile-train-graphs"),
f"--read-disambig-syms={d.disambiguation_symbols_int_path}",
f"--batch-size={batch_size}",
self.tree_path,
self.model_path,
d.lexicon_fst_path,
Expand All @@ -723,6 +754,7 @@ def _run(self) -> typing.Generator[typing.Tuple[int, int]]:
)
for line in proc.stderr:
log_file.write(line)
log_file.flush()
m = self.progress_pattern.match(line.strip())
if m:
yield int(m.group("succeeded")), int(m.group("failed"))
Expand Down Expand Up @@ -1766,6 +1798,7 @@ def _run(self) -> typing.Generator[typing.Tuple[int, int, str]]:
self.word_symbol_table,
self.phone_symbol_table,
self.optional_silence_phone,
oov_word=self.oov_word,
)
if d.position_dependent_phones:
word_pronunciations = [
Expand Down Expand Up @@ -1837,7 +1870,7 @@ def compile_information_func(
if decode_error_match:
data["unaligned"].append(decode_error_match.group("utt"))
continue
log_like_match = re.match(log_like_pattern, line)
log_like_match = re.search(log_like_pattern, line)
if log_like_match:
log_like = log_like_match.group("log_like")
frames = log_like_match.group("frames")
Expand Down Expand Up @@ -1923,6 +1956,7 @@ def cleanup_intervals(
self.phone_symbol_table,
self.optional_silence_phone,
self.transcription,
oov_word=self.oov_word,
)
actual_phone_intervals = []
actual_word_intervals = []
Expand Down Expand Up @@ -2018,6 +2052,8 @@ def cleanup_g2p_intervals(
self.phone_symbol_table,
self.optional_silence_phone,
clitic_marker=self.clitic_marker,
oov_word=self.oov_word,
use_g2p=True,
)
actual_phone_intervals = []
actual_word_intervals = []
Expand Down
3 changes: 2 additions & 1 deletion montreal_forced_aligner/alignment/pretrained.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,10 +296,11 @@ def align_one_utterance(self, utterance: Utterance, session: Session) -> None:
if not sox_string:
sox_string = utterance.file.sound_file.sound_file_path
text_int_path = self.working_directory.joinpath("text.int")
word_mapping = self.word_mapping(utterance.speaker.dictionary_id)
with mfa_open(text_int_path, "w") as f:
normalized_text_int = " ".join(
[
str(self.word_mapping(utterance.speaker.dictionary_id)[x])
str(word_mapping[x]) if x in word_mapping else str(word_mapping[self.oov_word])
for x in utterance.normalized_text.split()
]
)
Expand Down
8 changes: 1 addition & 7 deletions montreal_forced_aligner/command_line/mfa.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

import atexit
import multiprocessing as mp
import os
import sys
import time
import warnings
Expand Down Expand Up @@ -34,11 +33,7 @@
validate_corpus_cli,
validate_dictionary_cli,
)
from montreal_forced_aligner.config import (
GLOBAL_CONFIG,
MFA_PROFILE_VARIABLE,
update_command_history,
)
from montreal_forced_aligner.config import GLOBAL_CONFIG, update_command_history
from montreal_forced_aligner.utils import check_third_party

BEGIN = time.time()
Expand Down Expand Up @@ -118,7 +113,6 @@ def mfa_cli(ctx: click.Context) -> None:
auto_server = False
run_check = True
if ctx.invoked_subcommand == "anchor":
os.environ[MFA_PROFILE_VARIABLE] = "anchor"

GLOBAL_CONFIG.current_profile.clean = False
GLOBAL_CONFIG.save()
Expand Down
5 changes: 5 additions & 0 deletions montreal_forced_aligner/command_line/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import rich_click as click

from montreal_forced_aligner.command_line.utils import (
common_options,
delete_server,
initialize_server,
start_server,
Expand All @@ -28,6 +29,7 @@ def server_cli():
default=None,
)
@click.help_option("-h", "--help")
@common_options
@click.pass_context
def init_cli(context, **kwargs):
if kwargs.get("profile", None) is not None:
Expand All @@ -46,6 +48,7 @@ def init_cli(context, **kwargs):
default=None,
)
@click.help_option("-h", "--help")
@common_options
@click.pass_context
def start_cli(context, **kwargs):
if kwargs.get("profile", None) is not None:
Expand All @@ -71,6 +74,7 @@ def start_cli(context, **kwargs):
default="fast",
)
@click.help_option("-h", "--help")
@common_options
@click.pass_context
def stop_cli(context, **kwargs):
if kwargs.get("profile", None) is not None:
Expand All @@ -89,6 +93,7 @@ def stop_cli(context, **kwargs):
default=None,
)
@click.help_option("-h", "--help")
@common_options
@click.pass_context
def delete_cli(context, **kwargs):
if kwargs.get("profile", None) is not None:
Expand Down
Loading

0 comments on commit 2569caa

Please sign in to comment.