diff --git a/LICENSE b/LICENSE
index 5a01c58..4dff0f5 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2021 padmalcom
+Copyright (c) 2021 Jonas Freiknecht
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index 37ec9bc..2e69508 100644
--- a/README.md
+++ b/README.md
@@ -1,34 +1,46 @@
 # jotts
-JoTTS is a German text-to-speech engine using tacotron and griffin-lim. The synthesizer model
-has been trained on my voice using Tacotron1. Due to real time usage I decided not to include a vocoder and use
-griffin-lim instead which results in a more robotic voice but is much faster.
+JoTTS is a German text-to-speech engine using tacotron with either griffin-lim or wavernn as vocoder. The synthesizer model
+has been trained on my voice using tacotron1. Using griffin-lim as vocoder makes the audio generation much faster,
+whereas using a trained vocoder returns better results in most cases.
 
 Buy Me A Coffee
 
 ## API
-- First create an instance of JoTTS. The initializer takes force_model_download as an optional parameter
-in case that the last download of the synthesizer failed and the model cannot be applied.
+- First create an instance of *JoTTS*.
 
-- Call speak with a *text* parameter that contains the text to speak out loud. The second parameter
-can be set to True, to wait until speaking is done.
+- (optional) List all models that are available using *list_models()*. You can also look them up in the browser: https://github.com/padmalcom/Real-Time-Voice-Cloning-German/releases
 
-- Use *textToWav* to create a wav file instead of speaking the text.
+- Load a model of your choice using *load_models()*, which takes *force_model_download* as an optional parameter
+in case the last download of the synthesizer failed and the model cannot be applied. The parameter
+*model_name* is validated against all available models on the release page.
+
+- Call *speak* with a *text* parameter that contains the text to speak out loud. The second parameter *wait_for_end*
+can be set to True to wait until speaking is done, e.g. to prevent your application from closing. If you want
+to use a trained vocoder, set *use_wavernn_vocoder* to True.
+
+- Use *textToWav* to create a wav file instead of speaking the text. *out_path* specifies where the wav file is
+written to. Use *use_wavernn_vocoder* to use a trained vocoder.
 
 ## Example usage
 ```python
 from jotts import JoTTS
-jotts = JoTTS()
-jotts.speak("Das Wetter heute ist fantastisch.", True)
-jotts.textToWav("Es war aber auch schon mal besser!")
+if __name__ == "__main__":
+    tts = JoTTS()
+    tts.list_models()
+    tts.load_models(force_model_download=False, model_name="jonas_v0.1")
+    tts.speak("Das ist ein Test mit meiner Stimme.", wait_for_end = True, use_wavernn_vocoder=True)
+    tts.speak("Das ist ein Test mit meiner Stimme.", wait_for_end = True, use_wavernn_vocoder=False)
+    tts.textToWav(text="Das ist ein Test mit meiner Stimme.", out_path="vocoder_out.wav", use_wavernn_vocoder=True)
+    tts.textToWav(text="Das ist ein Test mit meiner Stimme.", out_path="griffin_lim_out.wav", use_wavernn_vocoder=False)
 ```
 
 ## Todo
 - Add an option to change the default audio device to speak the text
-- Add a parameter to select other models but the default model
 - Add threading or multi processing to allow speaking without blocking
-- Add a vocoder instead of griffin-lim to improve audio output.
+- Add a parameter to avoid online communication when running JoTTS on edge devices.
+- Add a feature to quickly finetune a model with an arbitrary voice
 
 ## Training a model for your own voice
 Training a synthesizer model is easy - if you know how to do it. I created a course on udemy to show you how it is done.
diff --git a/example/example.py b/example/example.py
index 555002b..260a909 100644
--- a/example/example.py
+++ b/example/example.py
@@ -9,6 +9,6 @@
     tts.list_models()
     tts.load_models(force_model_download=False, model_name="jonas_v0.1")
     #tts.speak("Das ist ein Test mit meiner Stimme.", wait_for_end = True, use_wavernn_vocoder=True)
-    #tts.speak("Das ist ein Test mit meiner Stimme.", wait_for_end = True, use_wavernn_vocoder=True)
+    #tts.speak("Das ist ein Test mit meiner Stimme.", wait_for_end = True, use_wavernn_vocoder=False)
     tts.textToWav(text="Das ist ein Test mit meiner Stimme.", out_path="vocoder_out.wav", use_wavernn_vocoder=True)
     tts.textToWav(text="Das ist ein Test mit meiner Stimme.", out_path="griffin_lim_out.wav", use_wavernn_vocoder=False)
\ No newline at end of file
diff --git a/jotts/jotts.py b/jotts/jotts.py
index e85ee79..abda46d 100644
--- a/jotts/jotts.py
+++ b/jotts/jotts.py
@@ -71,9 +71,9 @@ def __prepare_model__(self, model_name, force_model_download):
         logger.debug("There is a newer model, downloading...")
 
         # Downloading the latest tts model release
-        logger.debug("Download synthesizer model: {}", syn_url)
-        with DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=syn_url.split('/')[-1]) as t:
-            urllib.request.urlretrieve(syn_url, filename=synthesizer_model_path, reporthook=t.update_to)
+        logger.debug("Download synthesizer model: {}", synthesizer_url)
+        with DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=synthesizer_url.split('/')[-1]) as t:
+            urllib.request.urlretrieve(synthesizer_url, filename=synthesizer_model_path, reporthook=t.update_to)
 
     if not os.path.exists(vocoder_model_path) or force_model_download:
         logger.debug("Download vocoder model: {}", self.VOCODER_DOWNLOAD_URL)
diff --git a/jotts/synthesizer/audio.py b/jotts/synthesizer/audio.py
index 83dc96c..2e03ae5 100644
--- a/jotts/synthesizer/audio.py
+++ b/jotts/synthesizer/audio.py
@@ -167,7 +167,7 @@ def _mel_to_linear(mel_spectrogram, hparams):
 
 def _build_mel_basis(hparams):
     assert hparams.fmax <= hparams.sample_rate // 2
-    return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels,
+    return librosa.filters.mel(sr=hparams.sample_rate, n_fft=hparams.n_fft, n_mels=hparams.num_mels,
                                fmin=hparams.fmin, fmax=hparams.fmax)
 
 def _amp_to_db(x, hparams):
diff --git a/jotts/vocoder/__init__.py b/jotts/vocoder/__init__.py
new file mode 100644
index 0000000..4287ca8
--- /dev/null
+++ b/jotts/vocoder/__init__.py
@@ -0,0 +1 @@
+#
\ No newline at end of file
diff --git a/jotts/vocoder/audio.py b/jotts/vocoder/audio.py
index f555dbf..cf710c8 100644
--- a/jotts/vocoder/audio.py
+++ b/jotts/vocoder/audio.py
@@ -50,7 +50,7 @@ def linear_to_mel(spectrogram):
 
 
 def build_mel_basis():
-    return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)
+    return librosa.filters.mel(sr=hp.sample_rate, n_fft=hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)
 
 
 def normalize(S):
diff --git a/jotts/vocoder/models/__init__.py b/jotts/vocoder/models/__init__.py
new file mode 100644
index 0000000..4287ca8
--- /dev/null
+++ b/jotts/vocoder/models/__init__.py
@@ -0,0 +1 @@
+#
\ No newline at end of file
diff --git a/jotts/vocoder/models/deepmind_version.py_bu b/jotts/vocoder/models/deepmind_version.py_bu
deleted file mode 100644
index 1d973d9..0000000
--- a/jotts/vocoder/models/deepmind_version.py_bu
+++ /dev/null
@@ -1,170 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from utils.display import *
-from utils.dsp import *
-
-
-class WaveRNN(nn.Module) :
-    def __init__(self, hidden_size=896, quantisation=256) :
-        super(WaveRNN, self).__init__()
-
-        self.hidden_size = hidden_size
-        self.split_size = hidden_size // 2
-
-        # The main matmul
-        self.R = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False)
-
-        # Output fc layers
-        self.O1 = nn.Linear(self.split_size, self.split_size)
-        self.O2 = nn.Linear(self.split_size, quantisation)
-        self.O3 = nn.Linear(self.split_size, self.split_size)
-        self.O4 = nn.Linear(self.split_size, quantisation)
-
-        # Input fc layers
-        self.I_coarse = nn.Linear(2, 3 * self.split_size, bias=False)
-        self.I_fine = nn.Linear(3, 3 * self.split_size, bias=False)
-
-        # biases for the gates
-        self.bias_u = nn.Parameter(torch.zeros(self.hidden_size))
-        self.bias_r = nn.Parameter(torch.zeros(self.hidden_size))
-        self.bias_e = nn.Parameter(torch.zeros(self.hidden_size))
-
-        # display num params
-        self.num_params()
-
-
-    def forward(self, prev_y, prev_hidden, current_coarse) :
-
-        # Main matmul - the projection is split 3 ways
-        R_hidden = self.R(prev_hidden)
-        R_u, R_r, R_e, = torch.split(R_hidden, self.hidden_size, dim=1)
-
-        # Project the prev input
-        coarse_input_proj = self.I_coarse(prev_y)
-        I_coarse_u, I_coarse_r, I_coarse_e = \
-            torch.split(coarse_input_proj, self.split_size, dim=1)
-
-        # Project the prev input and current coarse sample
-        fine_input = torch.cat([prev_y, current_coarse], dim=1)
-        fine_input_proj = self.I_fine(fine_input)
-        I_fine_u, I_fine_r, I_fine_e = \
-            torch.split(fine_input_proj, self.split_size, dim=1)
-
-        # concatenate for the gates
-        I_u = torch.cat([I_coarse_u, I_fine_u], dim=1)
-        I_r = torch.cat([I_coarse_r, I_fine_r], dim=1)
-        I_e = torch.cat([I_coarse_e, I_fine_e], dim=1)
-
-        # Compute all gates for coarse and fine
-        u = F.sigmoid(R_u + I_u + self.bias_u)
-        r = F.sigmoid(R_r + I_r + self.bias_r)
-        e = F.tanh(r * R_e + I_e + self.bias_e)
-        hidden = u * prev_hidden + (1. - u) * e
-
-        # Split the hidden state
-        hidden_coarse, hidden_fine = torch.split(hidden, self.split_size, dim=1)
-
-        # Compute outputs
-        out_coarse = self.O2(F.relu(self.O1(hidden_coarse)))
-        out_fine = self.O4(F.relu(self.O3(hidden_fine)))
-
-        return out_coarse, out_fine, hidden
-
-
-    def generate(self, seq_len):
-        with torch.no_grad():
-            # First split up the biases for the gates
-            b_coarse_u, b_fine_u = torch.split(self.bias_u, self.split_size)
-            b_coarse_r, b_fine_r = torch.split(self.bias_r, self.split_size)
-            b_coarse_e, b_fine_e = torch.split(self.bias_e, self.split_size)
-
-            # Lists for the two output seqs
-            c_outputs, f_outputs = [], []
-
-            # Some initial inputs
-            out_coarse = torch.LongTensor([0]).cuda()
-            out_fine = torch.LongTensor([0]).cuda()
-
-            # We'll meed a hidden state
-            hidden = self.init_hidden()
-
-            # Need a clock for display
-            start = time.time()
-
-            # Loop for generation
-            for i in range(seq_len) :
-
-                # Split into two hidden states
-                hidden_coarse, hidden_fine = \
-                    torch.split(hidden, self.split_size, dim=1)
-
-                # Scale and concat previous predictions
-                out_coarse = out_coarse.unsqueeze(0).float() / 127.5 - 1.
-                out_fine = out_fine.unsqueeze(0).float() / 127.5 - 1.
-                prev_outputs = torch.cat([out_coarse, out_fine], dim=1)
-
-                # Project input
-                coarse_input_proj = self.I_coarse(prev_outputs)
-                I_coarse_u, I_coarse_r, I_coarse_e = \
-                    torch.split(coarse_input_proj, self.split_size, dim=1)
-
-                # Project hidden state and split 6 ways
-                R_hidden = self.R(hidden)
-                R_coarse_u , R_fine_u, \
-                R_coarse_r, R_fine_r, \
-                R_coarse_e, R_fine_e = torch.split(R_hidden, self.split_size, dim=1)
-
-                # Compute the coarse gates
-                u = F.sigmoid(R_coarse_u + I_coarse_u + b_coarse_u)
-                r = F.sigmoid(R_coarse_r + I_coarse_r + b_coarse_r)
-                e = F.tanh(r * R_coarse_e + I_coarse_e + b_coarse_e)
-                hidden_coarse = u * hidden_coarse + (1. - u) * e
-
-                # Compute the coarse output
-                out_coarse = self.O2(F.relu(self.O1(hidden_coarse)))
-                posterior = F.softmax(out_coarse, dim=1)
-                distrib = torch.distributions.Categorical(posterior)
-                out_coarse = distrib.sample()
-                c_outputs.append(out_coarse)
-
-                # Project the [prev outputs and predicted coarse sample]
-                coarse_pred = out_coarse.float() / 127.5 - 1.
-                fine_input = torch.cat([prev_outputs, coarse_pred.unsqueeze(0)], dim=1)
-                fine_input_proj = self.I_fine(fine_input)
-                I_fine_u, I_fine_r, I_fine_e = \
-                    torch.split(fine_input_proj, self.split_size, dim=1)
-
-                # Compute the fine gates
-                u = F.sigmoid(R_fine_u + I_fine_u + b_fine_u)
-                r = F.sigmoid(R_fine_r + I_fine_r + b_fine_r)
-                e = F.tanh(r * R_fine_e + I_fine_e + b_fine_e)
-                hidden_fine = u * hidden_fine + (1. - u) * e
-
-                # Compute the fine output
-                out_fine = self.O4(F.relu(self.O3(hidden_fine)))
-                posterior = F.softmax(out_fine, dim=1)
-                distrib = torch.distributions.Categorical(posterior)
-                out_fine = distrib.sample()
-                f_outputs.append(out_fine)
-
-                # Put the hidden state back together
-                hidden = torch.cat([hidden_coarse, hidden_fine], dim=1)
-
-                # Display progress
-                speed = (i + 1) / (time.time() - start)
-                stream('Gen: %i/%i -- Speed: %i', (i + 1, seq_len, speed))
-
-            coarse = torch.stack(c_outputs).squeeze(1).cpu().data.numpy()
-            fine = torch.stack(f_outputs).squeeze(1).cpu().data.numpy()
-            output = combine_signal(coarse, fine)
-
-            return output, coarse, fine
-
-    def init_hidden(self, batch_size=1) :
-        return torch.zeros(batch_size, self.hidden_size).cuda()
-
-    def num_params(self) :
-        parameters = filter(lambda p: p.requires_grad, self.parameters())
-        parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
-        print('Trainable Parameters: %.3f million' % parameters)
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 800c972..090994b 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name="jotts",
-    version="1.0.0",
+    version="1.0.4",
     license="MIT",
     author="Jonas Freiknecht",
     author_email="j.freiknecht@googlemail.com",
@@ -17,42 +17,45 @@
     include_package_data=True,
     install_requires=[
         "appdirs==1.4.4",
-        "audioread==2.1.9",
+        "audioread==3.0.0",
         "certifi==2021.10.8",
         "cffi==1.15.0",
-        "charset-normalizer==2.0.7",
-        "colorama==0.4.4",
-        "decorator==5.1.0",
-        "Deprecated==1.2.13",
-        "idna==3.3",
-        "inflect==5.3.0",
-        "joblib==1.1.0",
-        "librosa==0.8.1",
-        "llvmlite==0.37.0",
-        "loguru==0.5.3",
-        "numba==0.54.1",
-        "numpy==1.20.3",
+        "charset-normalizer==2.1.1",
+        "colorama==0.4.6",
+        "contourpy==1.0.6",
+        "cycler==0.11.0",
+        "decorator==5.1.1",
+        "fonttools==4.38.0",
+        "idna==3.4",
+        "inflect==6.0.2",
+        "joblib==1.2.0",
+        "kiwisolver==1.4.4",
+        "librosa==0.9.2",
+        "llvmlite==0.39.1",
+        "loguru==0.6.0",
+        "matplotlib==3.6.2",
+        "numba==0.56.4",
+        "numpy==1.23.4",
         "packaging==21.3",
-        "pooch==1.5.2",
+        "Pillow==9.3.0",
+        "pooch==1.6.0",
"pycparser==2.21", - "PyGithub==1.55", - "PyJWT==2.3.0", - "PyNaCl==1.4.0", - "pyparsing==3.0.6", - "requests==2.26.0", - "resampy==0.2.2", - "scikit-learn==1.0.1", - "scipy==1.7.2", + "pydantic==1.10.2", + "pyparsing==3.0.9", + "python-dateutil==2.8.2", + "requests==2.28.1", + "resampy==0.4.2", + "scikit-learn==1.1.3", + "scipy==1.9.3", "six==1.16.0", - "sounddevice==0.4.3", - "SoundFile==0.10.3.post1", - "threadpoolctl==3.0.0", + "sounddevice==0.4.5", + "soundfile==0.11.0", + "threadpoolctl==3.1.0", "torch==1.10.0", - "tqdm==4.62.3", - "typing_extensions==4.0.0", - "Unidecode==1.3.2", - "urllib3==1.26.7", - "wrapt==1.13.3" + "tqdm==4.64.1", + "typing_extensions==4.4.0", + "Unidecode==1.3.6", + "urllib3==1.26.12" ], classifiers=[ "Development Status :: 4 - Beta",