add whisperx example and update flux volume path (#47)
mernit authored Sep 30, 2024
1 parent 2f4a361 commit 1185a2e
Showing 4 changed files with 141 additions and 1 deletion.
25 changes: 25 additions & 0 deletions audio_and_transcription/whisperx_stt/README.md
@@ -0,0 +1,25 @@
# WhisperX Inference

## Deploy the endpoint in `app.py`

```
beam deploy app.py:transcribe_audio
```
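
If you want to iterate before deploying, the Beam CLI also offers `beam serve app.py:transcribe_audio`, which runs a temporary endpoint and reloads your changes as you edit (assuming your CLI version supports `serve`).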

## Send a request to the API

Add the deployment URL, auth token, and a URL with your audio file to `request.py`:

```
AUTH_TOKEN = "BEAM_AUTH_TOKEN"
BEAM_URL = "id/8836f704-b521-4e1c-8979-bc74c97dc47b"
AUDIO_URL = ""
```

> [You can find various audio samples here.](https://audio-samples.github.io/samples/mp3/blizzard_unconditional/sample-0.mp3)

Send the request by running `python request.py`. You'll get back a response like this:

```
{"result":{"segments":[{"start":0.309,"end":3.133,"text":" My thought, I have nobody by a beauty and will as you t'ward.","words":[{"word":"My","start":0.309,"end":0.45,"score":0.93},{"word":"thought,","start":0.49,"end":0.85,"score":0.863},{"word":"I","start":0.91,"end":0.97,"score":0.999},{"word":"have","start":1.01,"end":1.191,"score":0.874},{"word":"nobody","start":1.251,"end":1.571,"score":0.91},{"word":"by","start":1.611,"end":1.751,"score":0.863},{"word":"a","start":1.791,"end":1.832,"score":0.975},{"word":"beauty","start":1.872,"end":2.152,"score":0.836},{"word":"and","start":2.192,"end":2.272,"score":0.82},{"word":"will","start":2.292,"end":2.472,"score":0.853},{"word":"as","start":2.512,"end":2.593,"score":0.838},{"word":"you","start":2.613,"end":2.753,"score":0.842},{"word":"t'ward.","start":2.793,"end":3.133,"score":0.217}]},{"start":3.874,"end":9.943,"text":"Mr. Rochester is sub, and that so don't find simpus, and devoted abode, to hath might in a","words":[{"word":"Mr.","start":3.874,"end":4.175,"score":0.563},{"word":"Rochester","start":4.235,"end":4.756,"score":0.94},{"word":"is","start":4.836,"end":4.916,"score":0.816},{"word":"sub,","start":4.936,"end":5.236,"score":0.877},{"word":"and","start":5.276,"end":5.356,"score":0.802},{"word":"that","start":5.397,"end":5.577,"score":0.948},{"word":"so","start":5.617,"end":5.777,"score":0.982},{"word":"don't","start":5.817,"end":6.017,"score":0.863},{"word":"find","start":6.057,"end":6.358,"score":0.873},{"word":"simpus,","start":6.398,"end":6.839,"score":0.865},{"word":"and","start":7.399,"end":7.499,"score":0.884},{"word":"devoted","start":7.54,"end":7.92,"score":0.969},{"word":"abode,","start":8.0,"end":8.461,"score":0.635},{"word":"to","start":9.102,"end":9.222,"score":0.839},{"word":"hath","start":9.262,"end":9.402,"score":0.65},{"word":"might","start":9.442,"end":9.703,"score":0.855},{"word":"in","start":9.783,"end":9.883,"score":0.8},{"word":"a","start":9.923,"end":9.943,"score":0.97}]}],"word_segments":[{"word":"My","start":0.309,"end":0.45,"score":0.93},{"word":"thought,","start":0.49,"end":0.85,"score":0.863},{"word":"I","start":0.91,"end":0.97,"score":0.999},{"word":"have","start":1.01,"end":1.191,"score":0.874},{"word":"nobody","start":1.251,"end":1.571,"score":0.91},{"word":"by","start":1.611,"end":1.751,"score":0.863},{"word":"a","start":1.791,"end":1.832,"score":0.975},{"word":"beauty","start":1.872,"end":2.152,"score":0.836},{"word":"and","start":2.192,"end":2.272,"score":0.82},{"word":"will","start":2.292,"end":2.472,"score":0.853},{"word":"as","start":2.512,"end":2.593,"score":0.838},{"word":"you","start":2.613,"end":2.753,"score":0.842},{"word":"t'ward.","start":2.793,"end":3.133,"score":0.217},{"word":"Mr.","start":3.874,"end":4.175,"score":0.563},{"word":"Rochester","start":4.235,"end":4.756,"score":0.94},{"word":"is","start":4.836,"end":4.916,"score":0.816},{"word":"sub,","start":4.936,"end":5.236,"score":0.877},{"word":"and","start":5.276,"end":5.356,"score":0.802},{"word":"that","start":5.397,"end":5.577,"score":0.948},{"word":"so","start":5.617,"end":5.777,"score":0.982},{"word":"don't","start":5.817,"end":6.017,"score":0.863},{"word":"find","start":6.057,"end":6.358,"score":0.873},{"word":"simpus,","start":6.398,"end":6.839,"score":0.865},{"word":"and","start":7.399,"end":7.499,"score":0.884},{"word":"devoted","start":7.54,"end":7.92,"score":0.969},{"word":"abode,","start":8.0,"end":8.461,"score":0.635},{"word":"to","start":9.102,"end":9.222,"score":0.839},{"word":"hath","start":9.262,"end":9.402,"score":0
.65},{"word":"might","start":9.442,"end":9.703,"score":0.855},{"word":"in","start":9.783,"end":9.883,"score":0.8},{"word":"a","start":9.923,"end":9.943,"score":0.97}]}}
```
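
If you only need a plain transcript rather than the full alignment payload, the response can be post-processed in a few lines. This is a minimal sketch assuming the JSON shape shown above; the helper name `print_transcript` is illustrative, and it could be called in `request.py` on the parsed `result`:

```
def print_transcript(result: dict) -> str:
    """Collapse aligned segments into one string and print per-word timings."""
    payload = result["result"]
    transcript = " ".join(seg["text"].strip() for seg in payload["segments"])
    print(transcript)
    for w in payload["word_segments"]:
        print(f'{w["word"]:>12}  {w["start"]:6.2f}s -> {w["end"]:6.2f}s')
    return transcript
```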
89 changes: 89 additions & 0 deletions audio_and_transcription/whisperx_stt/app.py
@@ -0,0 +1,89 @@
from beam import endpoint, Image, Volume, env

if env.is_remote():
    import torch
    import whisperx
    import gc


# Define the custom image
image = (
    Image()
    .add_commands(["apt-get update -y", "apt-get install ffmpeg -y"])
    .add_python_packages(
        ["faster-whisper==1.0.1", "whisperx==3.1.5", "torchaudio==2.0.2"]
    )
)


volume_path = "./cached_models"
device = "cuda"
compute_type = "float16"
language_code = "en"


def on_start():
    model_name = "large-v2"

    # Load the main WhisperX model
    model = whisperx.load_model(
        model_name, device, download_root=volume_path, language=language_code
    )

    # Load the alignment model for word-level timestamps
    alignment_model, metadata = whisperx.load_align_model(
        language_code=language_code, device=device
    )

    return model, alignment_model, metadata


@endpoint(
name="whisperx-deployment",
image=image,
cpu=4,
memory="32Gi",
gpu="A10G",
volumes=[
Volume(
name="cached_models",
mount_path=volume_path,
)
],
on_start=on_start,
)
def transcribe_audio(context, **inputs):
    # Retrieve values from on_start
    model, alignment_model, metadata = context.on_start_value

    url = inputs.get(
        "url",
        "https://audio-samples.github.io/samples/mp3/blizzard_unconditional/sample-0.mp3",
    )

    print(f"🚧 Loading audio from {url}...")
    audio = whisperx.load_audio(url)
    print("✅ Audio loaded")

    print("Transcribing...")
    result = model.transcribe(audio, batch_size=16)
    print("🎉 Transcription done:")
    print(result["segments"])

    # Delete model if low on GPU resources
    gc.collect()
    torch.cuda.empty_cache()
    del model

    print("Aligning...")
    result = whisperx.align(
        result["segments"],
        alignment_model,
        metadata,
        audio,
        device,
        return_char_alignments=False,
    )
    print("🎉 Alignment done")

    return {"result": result}
26 changes: 26 additions & 0 deletions audio_and_transcription/whisperx_stt/request.py
@@ -0,0 +1,26 @@
import requests

AUTH_TOKEN = "BEAM_AUTH_TOKEN"
BEAM_URL = (
"BEAM_URL" # Will look something like: id/618b1458-0a84-4be5-ae8f-7d70e76374d9
)
AUDIO_URL = (
"https://audio-samples.github.io/samples/mp3/blizzard_unconditional/sample-4.mp3"
)

payload = {"url": AUDIO_URL}

url = f"https://app.beam.cloud/endpoint/{BEAM_URL}"
headers = {
"Authorization": f"Bearer {AUTH_TOKEN}",
"Connection": "keep-alive",
"Content-Type": "application/json",
}

try:
    response = requests.post(url, headers=headers, json=payload)
    response.raise_for_status()
    result = response.json()
    print("Response:", result)
except requests.exceptions.RequestException as e:
    print("An error occurred:", e)
2 changes: 1 addition & 1 deletion image_generation/flux/app.py
@@ -32,7 +32,7 @@ def load_models():

    # Load model
    pipe = FluxPipeline.from_pretrained(
-        "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
+        "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16, cache_dir=CACHE_PATH
    )
    pipe.enable_model_cpu_offload()  # save some VRAM by offloading the model to CPU. Remove this if you have enough GPU power

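The new `cache_dir=CACHE_PATH` argument points the Hugging Face download at a persistent path, so the Flux weights are cached between cold starts instead of being re-downloaded. The rest of the file is not shown in this hunk; presumably `CACHE_PATH` is the mount path of a Beam Volume, mirroring the `cached_models` pattern in the WhisperX app above. A minimal sketch under that assumption (names are illustrative, not the actual file contents):

```
from beam import Volume

# Illustrative only: reuse a persistent Volume's mount path as the
# Hugging Face cache directory passed to from_pretrained(..., cache_dir=...).
CACHE_PATH = "./model_cache"
model_cache_volume = Volume(name="model_cache", mount_path=CACHE_PATH)
```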