-
Notifications
You must be signed in to change notification settings - Fork 47
/
Copy pathapp2.py
157 lines (124 loc) · 4.95 KB
/
app2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
from TTS.api import TTS
import time
import json
import random
import time
from pathlib import Path
import streamlit as st
import time
import html
import gradio as gr
import uuid
import torch
import librosa
import streamlit as st
from audio_recorder_streamlit import audio_recorder
from scipy.io.wavfile import write
params = {
"activate": True,
"autoplay": True,
"show_text": False,
"remove_trailing_dots": False,
"voice": "Rogger.wav",
"language": "English",
"model_name": "tts_models/multilingual/multi-dataset/xtts_v2",
# "model_path":"./models/",
# "config_path":"./models/config.json"
}
SUPPORTED_FORMATS = ['wav']
SAMPLE_RATE = 16000
speakers = {p.stem: str(p) for p in list(Path('targets').iterdir())}
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")
# device = torch.device('cuda:0')
# device = torch.device('cuda:0')
import os
os.makedirs(os.path.join(".", "targets"), exist_ok=True)
os.makedirs(os.path.join(".", "outputs"), exist_ok=True)
@st.cache_resource
# @st.experimental_singleton
def load_model():
global tts
print("[XTTS] Loading XTTS...")
tts = TTS(model_name=params["model_name"]).to(device)
# model_path=params["model_path"],
# config_path=params["config_path"]).
return tts
tts=load_model()
def get_available_voices():
return sorted([voice.name for voice in Path(f"{this_dir}/targets").glob("*.wav")])
def random_sentence():
with open(Path("harvard_sentences.txt")) as f:
return random.choice(list(f))
st.title("TTS based Voice Cloning in 16 Languages.")
# st.image('logo.png', width=150)
st.header('Text to speech generation')
this_dir = str(Path(__file__).parent.resolve())
languages=None
with open(Path(f"{this_dir}/languages.json"), encoding='utf8') as f:
languages = json.load(f)
with st.sidebar:
voice_list=get_available_voices()
print (voice_list)
st.title("Text to Voice")
english = st.radio(
label="Choose your language", options=languages, index=0, horizontal=True)
default_speaker_name = "Rogger"
speaker_name = st.selectbox('Select target speaker:', options=[None] + list(speakers.keys()),
index=[key for key in speakers.keys()].index(default_speaker_name) + 1 if default_speaker_name in speakers else 0)
wav_tgt=None
if speaker_name is not None:
wav_tgt, _ = librosa.load(speakers[speaker_name],sr=22000)
wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
st.write('Selected Target:')
st.audio(wav_tgt, sample_rate=22000)
# Upload the WAV file
text = st.text_area('Enter text to convert to audio format',
value="Hello")
speed = st.slider('Speed', 0.1, 1.99, 0.8, 0.01)
st.caption ("Optional Microphone Recording. Download and rename your recording before using.")
audio_bytes = audio_recorder()
if audio_bytes:
st.audio(audio_bytes, format="audio/wav")
def gen_voice(string,spk):
string = html.unescape(string)
# Generate a short UUID
short_uuid = str(uuid.uuid4())[:8]
fl_name='outputs/' + spk + "-" + short_uuid +'.wav'
output_file = Path(fl_name)
tts.tts_to_file(
text=string,
speed=speed,
file_path=output_file,
speaker_wav=[f"{this_dir}/targets/" +spk + ".wav"],
language=languages[english]
)
return output_file
# Upload the WAV file
st.caption ("For the audio file, use the name of your Target, for instance ABIDA.wav")
new_tgt = st.file_uploader('Upload a new TARGET audio WAV file:', type=SUPPORTED_FORMATS, accept_multiple_files=False)
if new_tgt is not None:
# Get the original file name
file_name = new_tgt.name
# Save the file to the file system
file_path = os.path.join("./targets/", file_name)
st.info(f"Original file name: {file_name}")
# Extract the file name without the extension
file_name_without_extension = os.path.splitext(file_name)[0]
# Use librosa to load and process the WAV file
wav_tgt, _ = librosa.load(new_tgt, sr=22000)
wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
# Use scipy.io.wavfile.write to save the processed WAV file
write('./targets/' + file_name_without_extension + '.wav', 22000, wav_tgt)
st.success(f"New target saved successfully to {file_path}")
if st.button('Convert'):
# Run TTS
st.success('Converting ... please wait ...')
output_file=gen_voice(text, speaker_name)
# tts.tts_to_file(text=text, speed=speed, speaker=speaker, file_path="out.wav")
st.write(f'Target voice:'+speaker_name)
# st.success('Converted to audio successfully')
audio_file = open(output_file, 'rb')
audio_bytes = audio_file.read()
st.audio(audio_bytes, format='audio/wav')
# st.success("You can now play the audio by clicking on the play button.")