Fixed librosa issue and cleanup

padmalcom · Nov 8, 2022 · ec15099 · ec15099
1 parent 64edc25
commit ec15099
Show file tree

Hide file tree

Showing 10 changed files with 68 additions and 221 deletions.
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2021 padmalcom
+Copyright (c) 2021 Jonas Freiknecht
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/README.md b/README.md
@@ -1,34 +1,46 @@
 # jotts
-JoTTS is a German text-to-speech engine using tacotron and griffin-lim. The synthesizer model
-has been trained on my voice using Tacotron1. Due to real time usage I decided not to include a vocoder and use
-griffin-lim instead which results in a more robotic voice but is much faster.
+JoTTS is a German text-to-speech engine using tacotron and griffin-lim or wavernn as vocoder. The synthesizer model
+has been trained on my voice using tacotron1. Using griffin-lim as vocoder makes the audio generation much faster
+whereas using a trained vocoder returns better results in most cases.
 
 <a href="https://www.buymeacoffee.com/padmalcom" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/default-orange.png" alt="Buy Me A Coffee" height="41" width="174"></a>
 
 
 ## API
-- First create an instance of JoTTS. The initializer takes force_model_download as an optional parameter
-in case that the last download of the synthesizer failed and the model cannot be applied.
+- First create an instance of *JoTTS*.
 
-- Call speak with a *text* parameter that contains the text to speak out loud. The second parameter
-can be set to True, to wait until speaking is done.
+- (optional) List all models that are available using *list_models()*. You can also look them up in the browser: https://github.com/padmalcom/Real-Time-Voice-Cloning-German/releases
 
-- Use *textToWav* to create a wav file instead of speaking the text. 
+- Load a model of your choice using *load_models()* which takes *force_model_download* as an optional parameter
+in case that the last download of the synthesizer failed and the model cannot be applied. The parameter
+*model_name* is validated against all available models on the release page.
+
+- Call speak with a *text* parameter that contains the text to speak out loud. The second parameter *wait_for_end*
+can be set to True to wait until speaking is done, e.g. to prevent your application from closing. If you want
+to use a trained vocoder, set *use_wavernn_vocoder* to True.
+
+- Use *textToWav* to create a wav file instead of speaking the text. *out_path* specifies where the wav file is
+written to. Use *use_wavernn_vocoder* to use a trained vocoder.
 
 ## Example usage
 
 ```python
 from jotts import JoTTS
-jotts = JoTTS()
-jotts.speak("Das Wetter heute ist fantastisch.", True)
-jotts.textToWav("Es war aber auch schon mal besser!")
+if __name__ == "__main__":
+	tts = JoTTS()
+	tts.list_models()
+	tts.load_models(force_model_download=False, model_name="jonas_v0.1")
+	tts.speak("Das ist ein Test mit meiner Stimme.", wait_for_end = True, use_wavernn_vocoder=True)
+	tts.speak("Das ist ein Test mit meiner Stimme.", wait_for_end = True, use_wavernn_vocoder=False)
+	tts.textToWav(text="Das ist ein Test mit meiner Stimme.", out_path="vocoder_out.wav", use_wavernn_vocoder=True)
+	tts.textToWav(text="Das ist ein Test mit meiner Stimme.", out_path="griffin_lim_out.wav", use_wavernn_vocoder=False)
 ```
 
 ## Todo
 - Add an option to change the default audio device to speak the text
-- Add a parameter to select other models but the default model
 - Add threading or multi processing to allow speaking without blocking
-- Add a vocoder instead of griffin-lim to improve audio output.
+- Add a parameter to avoid online communication in case of running JoTTS on edge.
+- Add a feature to quickly fine-tune a model with an arbitrary voice
 
 ## Training a model for your own voice
 Training a synthesizer model is easy - if you know how to do it. I created a course on udemy to show you how it is done.

diff --git a/example/example.py b/example/example.py
@@ -9,6 +9,6 @@
 	tts.list_models()
 	tts.load_models(force_model_download=False, model_name="jonas_v0.1")
 	#tts.speak("Das ist ein Test mit meiner Stimme.", wait_for_end = True, use_wavernn_vocoder=True)
-	#tts.speak("Das ist ein Test mit meiner Stimme.", wait_for_end = True, use_wavernn_vocoder=True)
+	#tts.speak("Das ist ein Test mit meiner Stimme.", wait_for_end = True, use_wavernn_vocoder=False)
 	tts.textToWav(text="Das ist ein Test mit meiner Stimme.", out_path="vocoder_out.wav", use_wavernn_vocoder=True)
 	tts.textToWav(text="Das ist ein Test mit meiner Stimme.", out_path="griffin_lim_out.wav", use_wavernn_vocoder=False)
diff --git a/jotts/jotts.py b/jotts/jotts.py
@@ -71,9 +71,9 @@ def __prepare_model__(self, model_name, force_model_download):
 					logger.debug("There is a newer model, downloading...")
 
 			# Downloading the latest tts model release
-			logger.debug("Download synthesizer model: {}", syn_url)
-			with DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=syn_url.split('/')[-1]) as t:
-				urllib.request.urlretrieve(syn_url, filename=synthesizer_model_path, reporthook=t.update_to)
+			logger.debug("Download synthesizer model: {}", synthesizer_url)
+			with DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=synthesizer_url.split('/')[-1]) as t:
+				urllib.request.urlretrieve(synthesizer_url, filename=synthesizer_model_path, reporthook=t.update_to)
 
 		if not os.path.exists(vocoder_model_path) or force_model_download:
 			logger.debug("Download vocoder model: {}", self.VOCODER_DOWNLOAD_URL)

diff --git a/jotts/synthesizer/audio.py b/jotts/synthesizer/audio.py
@@ -167,7 +167,7 @@ def _mel_to_linear(mel_spectrogram, hparams):
 
 def _build_mel_basis(hparams):
     assert hparams.fmax <= hparams.sample_rate // 2
-    return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels,
+    return librosa.filters.mel(sr=hparams.sample_rate, n_fft=hparams.n_fft, n_mels=hparams.num_mels,
                                fmin=hparams.fmin, fmax=hparams.fmax)
 
 def _amp_to_db(x, hparams):

diff --git a/jotts/vocoder/__init__.py b/jotts/vocoder/__init__.py
@@ -0,0 +1 @@
+#
diff --git a/jotts/vocoder/audio.py b/jotts/vocoder/audio.py
@@ -50,7 +50,7 @@ def linear_to_mel(spectrogram):
 
 
 def build_mel_basis():
-    return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)
+    return librosa.filters.mel(sr=hp.sample_rate, n_fft=hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)
 
 
 def normalize(S):

diff --git a/jotts/vocoder/models/__init__.py b/jotts/vocoder/models/__init__.py
@@ -0,0 +1 @@
+#
diff --git a/jotts/vocoder/models/deepmind_version.py_bu b/jotts/vocoder/models/deepmind_version.py_bu
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name="jotts",
-    version="1.0.0",
+    version="1.0.4",
 	license="MIT",
     author="Jonas Freiknecht",
     author_email="[email protected]",
@@ -17,42 +17,45 @@
 	include_package_data=True,
 	install_requires=[
 		"appdirs==1.4.4",
-		"audioread==2.1.9",
+		"audioread==3.0.0",
 		"certifi==2021.10.8",
 		"cffi==1.15.0",
-		"charset-normalizer==2.0.7",
-		"colorama==0.4.4",
-		"decorator==5.1.0",
-		"Deprecated==1.2.13",
-		"idna==3.3",
-		"inflect==5.3.0",
-		"joblib==1.1.0",
-		"librosa==0.8.1",
-		"llvmlite==0.37.0",
-		"loguru==0.5.3",
-		"numba==0.54.1",
-		"numpy==1.20.3",
+		"charset-normalizer==2.1.1",
+		"colorama==0.4.6",
+		"contourpy==1.0.6",
+		"cycler==0.11.0",
+		"decorator==5.1.1",
+		"fonttools==4.38.0",
+		"idna==3.4",
+		"inflect==6.0.2",
+		"joblib==1.2.0",
+		"kiwisolver==1.4.4",
+		"librosa==0.9.2",
+		"llvmlite==0.39.1",
+		"loguru==0.6.0",
+		"matplotlib==3.6.2",
+		"numba==0.56.4",
+		"numpy==1.23.4",
 		"packaging==21.3",
-		"pooch==1.5.2",
+		"Pillow==9.3.0",
+		"pooch==1.6.0",
 		"pycparser==2.21",
-		"PyGithub==1.55",
-		"PyJWT==2.3.0",
-		"PyNaCl==1.4.0",
-		"pyparsing==3.0.6",
-		"requests==2.26.0",
-		"resampy==0.2.2",
-		"scikit-learn==1.0.1",
-		"scipy==1.7.2",
+		"pydantic==1.10.2",
+		"pyparsing==3.0.9",
+		"python-dateutil==2.8.2",
+		"requests==2.28.1",
+		"resampy==0.4.2",
+		"scikit-learn==1.1.3",
+		"scipy==1.9.3",
 		"six==1.16.0",
-		"sounddevice==0.4.3",
-		"SoundFile==0.10.3.post1",
-		"threadpoolctl==3.0.0",
+		"sounddevice==0.4.5",
+		"soundfile==0.11.0",
+		"threadpoolctl==3.1.0",
 		"torch==1.10.0",
-		"tqdm==4.62.3",
-		"typing_extensions==4.0.0",
-		"Unidecode==1.3.2",
-		"urllib3==1.26.7",
-		"wrapt==1.13.3"
+		"tqdm==4.64.1",
+		"typing_extensions==4.4.0",
+		"Unidecode==1.3.6",
+		"urllib3==1.26.12"
 	],
     classifiers=[
         "Development Status :: 4 - Beta",