diff --git a/PyTorch/SpeechSynthesis/FastPitch/common/utils.py b/PyTorch/SpeechSynthesis/FastPitch/common/utils.py index 606770566..0482fbd5c 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/common/utils.py +++ b/PyTorch/SpeechSynthesis/FastPitch/common/utils.py @@ -16,13 +16,11 @@ import warnings from pathlib import Path from typing import Optional - import librosa import numpy as np - import torch from scipy.io.wavfile import read - +from csv import DictReader class BenchmarkStats: """ Tracks statistics used for benchmarking. """ @@ -69,21 +67,16 @@ def load_wav_to_torch(full_path, force_sampling_rate=None): def load_filepaths_and_text(fnames, dataset_path=None, has_speakers=False, split="|"): - def split_line(line, root=None): - parts = line.strip().split(split) - if has_speakers: - paths, non_paths = parts[:-2], parts[-2:] - else: - paths, non_paths = parts[:-1], parts[-1:] - if root: - return tuple(str(Path(root, p)) for p in paths) + tuple(non_paths) - else: - return tuple(str(Path(p)) for p in paths) + tuple(non_paths) + + #Reads in csv with headers mels|pitch|text|optional-speaker + #Returns list of dicts fpaths_and_text = [] for fname in fnames: with open(fname, encoding='utf-8') as f: - fpaths_and_text += [split_line(line, dataset_path) for line in f] + dict_reader = DictReader(f, delimiter='|') + fpaths_and_text = list(dict_reader) + return fpaths_and_text diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py index a007db86f..223fe6aa8 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py @@ -162,7 +162,7 @@ def __init__(self, self.dataset_path = dataset_path self.audiopaths_and_text = load_filepaths_and_text( audiopaths_and_text, dataset_path, - has_speakers=(n_speakers > 1)) + has_speakers=(n_speakers > 1)) #this now returns a list of dicts self.load_mel_from_disk = load_mel_from_disk if not load_mel_from_disk: self.max_wav_value = max_wav_value @@ -193,26 +193,25 @@ def __init__(self, assert not (load_pitch_from_disk and self.pitch_tmp_dir is not None) - if len(self.audiopaths_and_text[0]) < expected_columns: - raise ValueError(f'Expected {expected_columns} columns in audiopaths file. ' - 'The format is |[|][|]') - - if len(self.audiopaths_and_text[0]) > expected_columns: - print('WARNING: Audiopaths file has more columns than expected') to_tensor = lambda x: torch.Tensor([x]) if type(x) is float else x self.pitch_mean = to_tensor(pitch_mean) self.pitch_std = to_tensor(pitch_std) def __getitem__(self, index): - # Separate filename and text + + #Indexing items using dictionary entries if self.n_speakers > 1: - audiopath, *extra, text, speaker = self.audiopaths_and_text[index] + audiopath = self.audiopaths_and_text[index]['mels'] + text = self.audiopaths_and_text[index]['text'] + speaker = self.audiopaths_and_text[index]['speaker'] speaker = int(speaker) else: - audiopath, *extra, text = self.audiopaths_and_text[index] + audiopath = self.audiopaths_and_text[index]['mels'] + text = self.audiopaths_and_text[index]['text'] speaker = None + mel = self.get_mel(audiopath) text = self.get_text(text) pitch = self.get_pitch(index, mel.size(-1)) @@ -287,15 +286,15 @@ def get_prior(self, index, mel_len, text_len): return attn_prior def get_pitch(self, index, mel_len=None): - audiopath, *fields = self.audiopaths_and_text[index] + audiopath = self.audiopaths_and_text[index]['mels'] if self.n_speakers > 1: - spk = int(fields[-1]) + spk = int(self.audiopaths_and_text[index]['speaker']) else: spk = 0 if self.load_pitch_from_disk: - pitchpath = fields[0] + pitchpath = self.audiopaths_and_text[index]['pitch'] pitch = torch.load(pitchpath) if self.pitch_mean is not None: assert self.pitch_std is not None diff --git a/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_pitch_text_test.txt b/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_pitch_text_test.txt index f6500cad0..c5dd7c085 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_pitch_text_test.txt +++ b/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_pitch_text_test.txt @@ -1,3 +1,4 @@ +mels|pitch|text wavs/LJ045-0096.wav|pitch/LJ045-0096.pt|Mrs. De Mohrenschildt thought that Oswald, wavs/LJ049-0022.wav|pitch/LJ049-0022.pt|The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent. wavs/LJ033-0042.wav|pitch/LJ033-0042.pt|Between the hours of eight and nine p.m. they were occupied with the children in the bedrooms located at the extreme east end of the house. diff --git a/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_pitch_text_train_v3.txt b/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_pitch_text_train_v3.txt index e1030e2b0..5cabdd61d 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_pitch_text_train_v3.txt +++ b/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_pitch_text_train_v3.txt @@ -1,3 +1,4 @@ +mels|pitch|text wavs/LJ050-0234.wav|pitch/LJ050-0234.pt|It has used other Treasury law enforcement agents on special experiments in building and route surveys in places to which the President frequently travels. wavs/LJ019-0373.wav|pitch/LJ019-0373.pt|to avail himself of his powers, as it was difficult to bring home the derelictions of duties and evasion of the acts. Too much was left to the inspectors. wavs/LJ050-0207.wav|pitch/LJ050-0207.pt|Although Chief Rowley does not complain about the pay scale for Secret Service agents, diff --git a/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_pitch_text_val.txt b/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_pitch_text_val.txt index cf33843e4..2643f67a9 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_pitch_text_val.txt +++ b/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_pitch_text_val.txt @@ -1,3 +1,4 @@ +mels|pitch|text wavs/LJ016-0288.wav|pitch/LJ016-0288.pt|"Müller, Müller, He's the man," till a diversion was created by the appearance of the gallows, which was received with continuous yells. wavs/LJ028-0275.wav|pitch/LJ028-0275.pt|At last, in the twentieth month, wavs/LJ019-0273.wav|pitch/LJ019-0273.pt|which Sir Joshua Jebb told the committee he considered the proper elements of penal discipline. diff --git a/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text.txt b/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text.txt index 2658eedd6..a5fd21e6a 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text.txt +++ b/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text.txt @@ -1,3 +1,4 @@ +mels|text wavs/LJ050-0234.wav|It has used other Treasury law enforcement agents on special experiments in building and route surveys in places to which the President frequently travels. wavs/LJ019-0373.wav|to avail himself of his powers, as it was difficult to bring home the derelictions of duties and evasion of the acts. Too much was left to the inspectors. wavs/LJ050-0207.wav|Although Chief Rowley does not complain about the pay scale for Secret Service agents, diff --git a/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text_test.txt b/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text_test.txt index 93e232571..1b6e42d94 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text_test.txt +++ b/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text_test.txt @@ -1,3 +1,4 @@ +mels|text wavs/LJ045-0096.wav|Mrs. De Mohrenschildt thought that Oswald, wavs/LJ049-0022.wav|The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent. wavs/LJ033-0042.wav|Between the hours of eight and nine p.m. they were occupied with the children in the bedrooms located at the extreme east end of the house. diff --git a/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text_train_v3.txt b/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text_train_v3.txt index 657c4a90e..64b3e417f 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text_train_v3.txt +++ b/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text_train_v3.txt @@ -1,3 +1,4 @@ +mels|text wavs/LJ050-0234.wav|It has used other Treasury law enforcement agents on special experiments in building and route surveys in places to which the President frequently travels. wavs/LJ019-0373.wav|to avail himself of his powers, as it was difficult to bring home the derelictions of duties and evasion of the acts. Too much was left to the inspectors. wavs/LJ050-0207.wav|Although Chief Rowley does not complain about the pay scale for Secret Service agents, diff --git a/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text_val.txt b/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text_val.txt index d811997c0..83bd292c5 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text_val.txt +++ b/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text_val.txt @@ -1,3 +1,4 @@ +mels|text wavs/LJ016-0288.wav|"Müller, Müller, He's the man," till a diversion was created by the appearance of the gallows, which was received with continuous yells. wavs/LJ028-0275.wav|At last, in the twentieth month, wavs/LJ019-0273.wav|which Sir Joshua Jebb told the committee he considered the proper elements of penal discipline.