evdv · evdv · Apr 28, 2022 · evdv · May 2, 2022 · evdv
diff --git a/PyTorch/SpeechSynthesis/FastPitch/common/utils.py b/PyTorch/SpeechSynthesis/FastPitch/common/utils.py
@@ -16,13 +16,11 @@
 import warnings
 from pathlib import Path
 from typing import Optional
-
 import librosa
 import numpy as np
-
 import torch
 from scipy.io.wavfile import read
-
+from csv import DictReader
 
 class BenchmarkStats:
     """ Tracks statistics used for benchmarking. """
@@ -69,21 +67,33 @@ def load_wav_to_torch(full_path, force_sampling_rate=None):
 
 def load_filepaths_and_text(fnames, dataset_path=None, has_speakers=False,
                             split="|"):
-    def split_line(line, root=None):
-        parts = line.strip().split(split)
-        if has_speakers:
-            paths, non_paths = parts[:-2], parts[-2:]
-        else:
-            paths, non_paths = parts[:-1], parts[-1:]
-        if root:
-            return tuple(str(Path(root, p)) for p in paths) + tuple(non_paths)
-        else:
-            return tuple(str(Path(p)) for p in paths) + tuple(non_paths)
+    """Load header-row filelists into one list of per-utterance dicts.
+
+    Every file in `fnames` is `split`-delimited (one character) and its
+    first line names the columns, e.g. mels|pitch|text[|speaker]. Values
+    in the `mels` and `pitch` columns are paths and get `dataset_path`
+    prepended when it is given. Rows of all files are concatenated.
+
+    Raises ValueError when `has_speakers` is set but a file has no
+    `speaker` column.
+    """
+    # Local import: the module only imports DictReader from csv.
+    from csv import QUOTE_NONE
 
     fpaths_and_text = []
     for fname in fnames:
         with open(fname, encoding='utf-8') as f:
-            fpaths_and_text += [split_line(line, dataset_path) for line in f]
+            # QUOTE_NONE keeps '"' characters in the transcript verbatim;
+            # the default dialect would strip a quote that starts a field.
+            reader = DictReader(f, delimiter=split, quoting=QUOTE_NONE)
+            if has_speakers and 'speaker' not in (reader.fieldnames or ()):
+                raise ValueError(f'{fname}: expected a "speaker" column')
+            for row in reader:
+                for col in ('mels', 'pitch'):
+                    if row.get(col):
+                        row[col] = str(Path(dataset_path or '', row[col]))
+                # Append (not assign) so rows from earlier files are kept.
+                fpaths_and_text.append(row)
     return fpaths_and_text
 
 

diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
@@ -162,7 +162,7 @@ def __init__(self,
         self.dataset_path = dataset_path
         self.audiopaths_and_text = load_filepaths_and_text(
             audiopaths_and_text, dataset_path,
-            has_speakers=(n_speakers > 1))
+            has_speakers=(n_speakers > 1)) #this now returns a list of dicts
         self.load_mel_from_disk = load_mel_from_disk
         if not load_mel_from_disk:
             self.max_wav_value = max_wav_value
@@ -193,26 +193,25 @@ def __init__(self,
 
         assert not (load_pitch_from_disk and self.pitch_tmp_dir is not None)
 
-        if len(self.audiopaths_and_text[0]) < expected_columns:
-            raise ValueError(f'Expected {expected_columns} columns in audiopaths file. '
-                             'The format is <mel_or_wav>|[<pitch>|]<text>[|<speaker_id>]')
-
-        if len(self.audiopaths_and_text[0]) > expected_columns:
-            print('WARNING: Audiopaths file has more columns than expected')
 
         to_tensor = lambda x: torch.Tensor([x]) if type(x) is float else x
         self.pitch_mean = to_tensor(pitch_mean)
         self.pitch_std = to_tensor(pitch_std)
 
     def __getitem__(self, index):
-        # Separate filename and text
+
+        #Indexing items using dictionary entries
         if self.n_speakers > 1:
-            audiopath, *extra, text, speaker = self.audiopaths_and_text[index]
+            audiopath = self.audiopaths_and_text[index]['mels']
+            text = self.audiopaths_and_text[index]['text']
+            speaker = self.audiopaths_and_text[index]['speaker']
             speaker = int(speaker)
         else:
-            audiopath, *extra, text = self.audiopaths_and_text[index]
+            audiopath = self.audiopaths_and_text[index]['mels']
+            text = self.audiopaths_and_text[index]['text']
             speaker = None
 
+
         mel = self.get_mel(audiopath)
         text = self.get_text(text)
         pitch = self.get_pitch(index, mel.size(-1))
@@ -287,15 +286,15 @@ def get_prior(self, index, mel_len, text_len):
         return attn_prior
 
     def get_pitch(self, index, mel_len=None):
-        audiopath, *fields = self.audiopaths_and_text[index]
+        audiopath = self.audiopaths_and_text[index]['mels']
 
         if self.n_speakers > 1:
-            spk = int(fields[-1])
+            spk = int(self.audiopaths_and_text[index]['speaker'])
         else:
             spk = 0
 
         if self.load_pitch_from_disk:
-            pitchpath = fields[0]
+            pitchpath = self.audiopaths_and_text[index]['pitch']
             pitch = torch.load(pitchpath)
             if self.pitch_mean is not None:
                 assert self.pitch_std is not None

diff --git a/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_pitch_text_test.txt b/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_pitch_text_test.txt
@@ -1,3 +1,4 @@
+mels|pitch|text
 wavs/LJ045-0096.wav|pitch/LJ045-0096.pt|Mrs. De Mohrenschildt thought that Oswald,
 wavs/LJ049-0022.wav|pitch/LJ049-0022.pt|The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent.
 wavs/LJ033-0042.wav|pitch/LJ033-0042.pt|Between the hours of eight and nine p.m. they were occupied with the children in the bedrooms located at the extreme east end of the house.

diff --git a/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_pitch_text_train_v3.txt b/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_pitch_text_train_v3.txt
@@ -1,3 +1,4 @@
+mels|pitch|text
 wavs/LJ050-0234.wav|pitch/LJ050-0234.pt|It has used other Treasury law enforcement agents on special experiments in building and route surveys in places to which the President frequently travels.
 wavs/LJ019-0373.wav|pitch/LJ019-0373.pt|to avail himself of his powers, as it was difficult to bring home the derelictions of duties and evasion of the acts. Too much was left to the inspectors.
 wavs/LJ050-0207.wav|pitch/LJ050-0207.pt|Although Chief Rowley does not complain about the pay scale for Secret Service agents,

diff --git a/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_pitch_text_val.txt b/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_pitch_text_val.txt
@@ -1,3 +1,4 @@
+mels|pitch|text
 wavs/LJ016-0288.wav|pitch/LJ016-0288.pt|"Müller, Müller, He's the man," till a diversion was created by the appearance of the gallows, which was received with continuous yells.
 wavs/LJ028-0275.wav|pitch/LJ028-0275.pt|At last, in the twentieth month,
 wavs/LJ019-0273.wav|pitch/LJ019-0273.pt|which Sir Joshua Jebb told the committee he considered the proper elements of penal discipline.

diff --git a/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text.txt b/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text.txt
@@ -1,3 +1,4 @@
+mels|text
 wavs/LJ050-0234.wav|It has used other Treasury law enforcement agents on special experiments in building and route surveys in places to which the President frequently travels.
 wavs/LJ019-0373.wav|to avail himself of his powers, as it was difficult to bring home the derelictions of duties and evasion of the acts. Too much was left to the inspectors.
 wavs/LJ050-0207.wav|Although Chief Rowley does not complain about the pay scale for Secret Service agents,

diff --git a/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text_test.txt b/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text_test.txt
@@ -1,3 +1,4 @@
+mels|text
 wavs/LJ045-0096.wav|Mrs. De Mohrenschildt thought that Oswald,
 wavs/LJ049-0022.wav|The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent.
 wavs/LJ033-0042.wav|Between the hours of eight and nine p.m. they were occupied with the children in the bedrooms located at the extreme east end of the house.

diff --git a/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text_train_v3.txt b/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text_train_v3.txt
@@ -1,3 +1,4 @@
+mels|text
 wavs/LJ050-0234.wav|It has used other Treasury law enforcement agents on special experiments in building and route surveys in places to which the President frequently travels.
 wavs/LJ019-0373.wav|to avail himself of his powers, as it was difficult to bring home the derelictions of duties and evasion of the acts. Too much was left to the inspectors.
 wavs/LJ050-0207.wav|Although Chief Rowley does not complain about the pay scale for Secret Service agents,

diff --git a/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text_val.txt b/PyTorch/SpeechSynthesis/FastPitch/filelists/ljs_audio_text_val.txt
@@ -1,3 +1,4 @@
+mels|text
 wavs/LJ016-0288.wav|"Müller, Müller, He's the man," till a diversion was created by the appearance of the gallows, which was received with continuous yells.
 wavs/LJ028-0275.wav|At last, in the twentieth month,
 wavs/LJ019-0273.wav|which Sir Joshua Jebb told the committee he considered the proper elements of penal discipline.