Skip to content

Commit

Permalink
use meloTTS and suno bark
Browse files Browse the repository at this point in the history
  • Loading branch information
gabrielchua committed Sep 30, 2024
1 parent 112bea7 commit 8fa13bc
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 52 deletions.
65 changes: 38 additions & 27 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,15 @@
"Turkish": "tr"
}

# Languages supported by MeloTTS, mapped to their MeloTTS language codes.
# Used to validate the user's language choice when advanced (Bark) audio
# generation is disabled.
MELO_TTS_LANGUAGE_MAPPING = {
    "English": "EN",
    "Spanish": "ES",
    "French": "FR",
    "Chinese": "ZH",  # fixed: was "ZJ" — MeloTTS uses "ZH" for Chinese
    "Japanese": "JP",
    "Korean": "KR",
}

class DialogueItem(BaseModel):
"""A single dialogue item."""

Expand Down Expand Up @@ -67,19 +76,14 @@ def generate_podcast(
tone: Optional[str],
length: Optional[str],
language: str,
use_advanced_audio: bool,
) -> Tuple[str, str]:
"""Generate the audio and transcript from the PDFs and/or URL."""
text = ""

# Change language to the appropriate code
language_mapping = {
"English": "EN",
"Spanish": "ES",
"French": "FR",
"Chinese": "ZH",
"Japanese": "JP",
"Korean": "KR",
}
# Check if the selected language is supported by MeloTTS when not using advanced audio
if not use_advanced_audio and language not in MELO_TTS_LANGUAGE_MAPPING:
raise gr.Error(f"The selected language '{language}' is not supported without advanced audio generation. Please enable advanced audio generation or choose a supported language.")

# Check if at least one input is provided
if not files and not url:
Expand Down Expand Up @@ -154,7 +158,7 @@ def generate_podcast(

# Get audio file path
audio_file_path = generate_podcast_audio(
line.text, line.speaker, LANGUAGE_MAPPING[language]
line.text, line.speaker, LANGUAGE_MAPPING[language], use_advanced_audio
)
# Read the audio file into an AudioSegment
audio_segment = AudioSegment.from_file(audio_file_path)
Expand Down Expand Up @@ -191,7 +195,7 @@ def generate_podcast(
<table style="border-collapse: collapse; border: none; padding: 20px;">
<tr style="border: none;">
<td style="border: none; vertical-align: top; padding-right: 30px; padding-left: 30px;">
<img src="https://raw.githubusercontent.com/gabrielchua/daily-ai-papers/main/_include/icon.png" alt="Open NotebookLM" width="120" style="margin-bottom: 10px;">
<img src="https://raw.githubusercontent.com/gabrielchua/daily-ai-papers/main/_includes/icon.png" alt="Open NotebookLM" width="120" style="margin-bottom: 10px;">
</td>
<td style="border: none; vertical-align: top; padding: 10px;">
<p style="margin-bottom: 15px;"><strong>Convert</strong> your PDFs into podcasts with open-source AI models (Llama 3.1 405B and MeloTTS).</p>
Expand Down Expand Up @@ -225,6 +229,10 @@ def generate_podcast(
value="English",
label="6. 🌐 Choose the language"
),
gr.Checkbox(
label="7. 🔄 Use advanced audio generation? (Experimental)",
value=False
)
],
outputs=[
gr.Audio(label="Podcast", format="mp3"),
Expand All @@ -242,23 +250,26 @@ def generate_podcast(
"Fun",
"Short (1-2 min)",
"English",
True
],
[
[],
"https://en.wikipedia.org/wiki/Hugging_Face",
"How did Hugging Face become so successful?",
"Fun",
"Short (1-2 min)",
"English",
False
],
[
[],
"https://simple.wikipedia.org/wiki/Taylor_Swift",
"Why is Taylor Swift so popular?",
"Fun",
"Short (1-2 min)",
"English",
False
],
# [
# [],
# "https://en.wikipedia.org/wiki/Hugging_Face",
# "How did Hugging Face become so successful?",
# "Fun",
# "Short (1-2 min)",
# "English",
# ],
# [
# [],
# "https://simple.wikipedia.org/wiki/Taylor_Swift",
# "Why is Taylor Swift so popular?",
# "Fun",
# "Short (1-2 min)",
# "English",
# ],
],
cache_examples=True,
)
Expand Down
51 changes: 26 additions & 25 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
api_key=os.getenv("FIREWORKS_API_KEY"),
)

# hf_client = Client("mrfakename/MeloTTS")
hf_client = Client("mrfakename/MeloTTS")

# download and load all models
preload_models()
Expand Down Expand Up @@ -78,34 +78,35 @@ def parse_url(url: str) -> str:
return response.text


def generate_podcast_audio(text: str, speaker: str, language: str, use_advanced_audio: bool) -> str:
    """Synthesize one dialogue line and return the path to the audio file.

    Args:
        text: The dialogue text to synthesize.
        speaker: Speaker label; "Host (Jane)" selects the host voice,
            anything else is treated as the guest.
        language: Language code (e.g. "EN", "ES") from LANGUAGE_MAPPING.
        use_advanced_audio: If True, generate locally with Bark (experimental);
            otherwise synthesize via the MeloTTS Hugging Face Space.

    Returns:
        Path to the generated audio file (local file for Bark, path returned
        by the HF Space client for MeloTTS).
    """
    if use_advanced_audio:
        # Bark: speaker preset 1 for the host, preset 3 for the guest.
        audio_array = generate_audio(
            text,
            history_prompt=f"v2/{language}_speaker_{'1' if speaker == 'Host (Jane)' else '3'}",
        )

        # Fixed: write_wav emits WAV-encoded data, so the file must carry a
        # .wav extension. It was previously saved as ".mp3", which mislabeled
        # the format for downstream readers that infer codec from extension.
        file_path = f"audio_{language}_{speaker}.wav"
        write_wav(file_path, SAMPLE_RATE, audio_array)
        return file_path

    # MeloTTS via HF Space: pick accent and speaking speed per role.
    if speaker == "Guest":
        accent = "EN-US" if language == "EN" else language
        speed = 0.9
    else:  # host
        accent = "EN-Default" if language == "EN" else language
        speed = 1
    # Non-English hosts speak slightly faster.
    if language != "EN" and speaker != "Guest":
        speed = 1.1

    result = hf_client.predict(
        text=text,
        language=language,
        speaker=accent,
        speed=speed,
        api_name="/synthesize",
    )
    return result

0 comments on commit 8fa13bc

Please sign in to comment.