Merge pull request #12 from Sharrnah/audio-backend-api
Audio backend api
Sharrnah authored May 1, 2023
2 parents 4e58c05 + c01a633 commit 6a6c0b7
Showing 5 changed files with 194 additions and 28 deletions.
32 changes: 32 additions & 0 deletions Models/STT/faster_whisper.py
@@ -187,6 +187,25 @@
    },
}

TOKENIZER_LINKS = {
    "normal": {
        "urls": [
            "https://usc1.contabostorage.com/8fcf133c506f4e688c7ab9ad537b5c18:ai-models/Whisper-CT2/tokenizer.zip",
            "https://eu2.contabostorage.com/bf1a89517e2643359087e5d8219c0c67:ai-models/Whisper-CT2/tokenizer.zip",
            "https://s3.libs.space:9000/ai-models/Whisper-CT2/tokenizer.zip",
        ],
        "checksum": "f6233d181a04abce6e2ba20189d5872b58ce2e14917af525a99feb5619777d7d"
    },
    "en": {
        "urls": [
            "https://usc1.contabostorage.com/8fcf133c506f4e688c7ab9ad537b5c18:ai-models/Whisper-CT2/tokenizer.en.zip",
            "https://eu2.contabostorage.com/bf1a89517e2643359087e5d8219c0c67:ai-models/Whisper-CT2/tokenizer.en.zip",
            "https://s3.libs.space:9000/ai-models/Whisper-CT2/tokenizer.en.zip",
        ],
        "checksum": "fb364e7cae84eedfd742ad116a397daa75e4eebba38f27e3f391ae4fee19afa9"
    }
}


def download_model(model: str, compute_type: str = "float32"):
    model_cache_path = Path(".cache/whisper")
@@ -204,6 +223,19 @@ def download_model(model: str, compute_type: str = "float32"):
                                       MODEL_LINKS[model][compute_type]["checksum"]):
        print("Model download failed")

    tokenizer_file = Path(model_path / "tokenizer.json")
    if not tokenizer_file.is_file() and Path(model_path).exists():
        tokenizer_type = "normal"
        if ".en" in model:
            tokenizer_type = "en"
        print("downloading tokenizer...")
        if not downloader.download_extract(TOKENIZER_LINKS[tokenizer_type]["urls"],
                                           str(model_path.resolve()),
                                           TOKENIZER_LINKS[tokenizer_type]["checksum"]):
            print("Tokenizer download failed")
    elif not Path(model_path).exists():
        print("no model downloaded for tokenizer.")


class FasterWhisper:
    model = None
178 changes: 154 additions & 24 deletions audioWhisper.py
@@ -48,8 +48,19 @@ def handle_exception(exc_type, exc_value, exc_traceback):
import numpy as np
import torch
import torchaudio
import resampy

import wave


def save_to_wav(data, filename, sample_rate, channels=1):
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(2)  # assuming 16-bit audio
        wf.setframerate(sample_rate)
        wf.writeframes(data)
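A typical debug call, assuming 16-bit PCM chunks collected in the frames list of the recording loop below (the 16 kHz rate here is illustrative):

save_to_wav(b"".join(frames), "debug_capture.wav", 16000)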


import Plugins

torchaudio.set_audio_backend("soundfile")
@@ -126,6 +137,31 @@ def audio_bytes_to_wav(audio_bytes):
    return return_data


# resample_audio function to resample audio data to a different sample rate and convert it to mono.
# set channels to '-1' to average the left and right channels to create mono audio (default)
# set channels to '0' to extract the first channel (left channel) data
# set channels to '1' to extract the second channel (right channel) data
# set channels to '2' to keep both stereo channels
def resample_audio(audio_chunk, recorded_sample_rate, target_sample_rate, channels=-1):
    audio_data = np.frombuffer(audio_chunk, dtype=np.int16)
    # Reshape the array to separate the channels (input is assumed to be interleaved 16-bit stereo)
    audio_data = audio_data.reshape(-1, 2)

    if channels == -1:
        # Average the left and right channels to create mono audio
        audio_data = audio_data.mean(axis=1)
    elif channels == 0 or channels == 1:
        # Extract the selected channel (0 = left, 1 = right)
        audio_data = audio_data[:, channels]

    # Resample the audio data to the desired sample rate
    audio_data = resampy.resample(audio_data, recorded_sample_rate, target_sample_rate)
    # Convert the resampled data back to int16 dtype
    int16_resampled_data = np.asarray(audio_data, dtype=np.int16)
    # Convert the int16 numpy array to bytes
    return int16_resampled_data.tobytes()
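For example, a chunk captured from a stereo device at 48 kHz can be brought down to Whisper's 16 kHz mono (rates are illustrative; the function assumes interleaved 16-bit stereo input):

mono_chunk = resample_audio(audio_chunk, 48000, 16000, channels=-1)  # downmix to mono
left_chunk = resample_audio(audio_chunk, 48000, 16000, channels=0)   # left channel only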


def typing_indicator_function(osc_ip, osc_port, send_websocket=True):
if osc_ip != "0" and settings.GetOption("osc_auto_processing_enabled") and settings.GetOption(
"osc_typing_indicator"):
@@ -151,11 +187,45 @@ def should_stop_recording(new_confidence, confidence_threshold, peak_amplitude,
            time.time() - pause_time) > pause


def get_host_audio_api_names():
    audio = pyaudio.PyAudio()
    host_api_count = audio.get_host_api_count()
    host_api_names = {}
    for i in range(host_api_count):
        host_api_info = audio.get_host_api_info_by_index(i)
        host_api_names[i] = host_api_info["name"]
    return host_api_names


def get_audio_device_index_by_name_and_api(name, api, is_input=True, default=None):
    audio = pyaudio.PyAudio()
    device_count = audio.get_device_count()
    for i in range(device_count):
        device_info = audio.get_device_info_by_index(i)
        if device_info["hostApi"] == api and device_info[
                "maxInputChannels" if is_input else "maxOutputChannels"] > 0 and name in device_info["name"]:
            return i
    return default


def get_audio_api_index_by_name(name):
    audio = pyaudio.PyAudio()
    host_api_count = audio.get_host_api_count()
    for i in range(host_api_count):
        host_api_info = audio.get_host_api_info_by_index(i)
        if name in host_api_info["name"]:
            print("using Audio API: " + host_api_info["name"])
            return i
    return 0
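Together, these helpers let a device be picked by host-API name plus a device-name substring instead of a bare index; a quick illustration (device names are examples):

api_index = get_audio_api_index_by_name("WASAPI")
mic_index = get_audio_device_index_by_name_and_api("Microphone", api_index, is_input=True)
print(get_host_audio_api_names())  # e.g. {0: 'MME', 1: 'Windows DirectSound', 2: 'Windows WASAPI'}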


@click.command()
@click.option('--devices', default='False', help='print all available devices id', type=str)
@click.option('--device_index', default=-1, help='the id of the input device (-1 = default active Mic)', type=int)
@click.option('--device_out_index', default=-1, help='the id of the output device (-1 = default active Speaker)',
              type=int)
@click.option('--audio_api', default='MME', help='the name of the audio API. ("MME", "DirectSound", "WASAPI")',
              type=str)
@click.option('--sample_rate', default=whisper_audio.SAMPLE_RATE, help='sample rate of recording', type=int)
@click.option("--task", default="transcribe",
              help="task for the model whether to only transcribe the audio or translate the audio to english",
@@ -205,55 +275,92 @@ def should_stop_recording(new_confidence, confidence_threshold, peak_amplitude,
              type=str)
@click.option("--verbose", default=False, help="Whether to print verbose output", is_flag=True, type=bool)
@click.pass_context
def main(ctx, devices, device_index, sample_rate, dynamic_energy, open_browser, config, verbose, **kwargs):
    # Load settings from file
    if config is not None:
        settings.SETTINGS_PATH = Path(Path.cwd() / config)
        settings.LoadYaml(settings.SETTINGS_PATH)

    # set process id
    settings.SetOption("process_id", os.getpid())

    for plugin_inst in Plugins.plugins:
        plugin_inst.init()

def main(ctx, devices, sample_rate, dynamic_energy, open_browser, config, verbose, **kwargs):
    if str2bool(devices):
        host_audio_api_names = get_host_audio_api_names()
        audio = pyaudio.PyAudio()
        # print all available host apis
        print("-------------------------------------------------------------------")
        print("                             Host APIs                             ")
        print("-------------------------------------------------------------------")
        for i in range(audio.get_host_api_count()):
            print(f"Host API {i}: {audio.get_host_api_info_by_index(i)['name']}")
        print("")
        print("-------------------------------------------------------------------")
        print("                           Input Devices                           ")
        print(" In form of: DEVICE_NAME [Sample Rate=?] [Loopback?] (Index=INDEX) ")
        print("-------------------------------------------------------------------")
        for device in audio.get_device_info_generator():
            device_list_index = device["index"]
            # device_list_api = device["hostApi"]
            device_list_api = host_audio_api_names[device["hostApi"]]
            device_list_name = device["name"]
            device_list_sample_rate = int(device["defaultSampleRate"])
            device_list_max_channels = audio.get_device_info_by_index(device_list_index)['maxInputChannels']
            if device_list_max_channels >= 1:
                print(f"{device_list_name} [Sample Rate={device_list_sample_rate}] (Index={device_list_index})")
                print(
                    f"{device_list_name} [Sample Rate={device_list_sample_rate}, API={device_list_api}] (Index={device_list_index})")
        print("")
        print("-------------------------------------------------------------------")
        print("                           Output Devices                          ")
        print("-------------------------------------------------------------------")
        for device in audio.get_device_info_generator():
            device_list_index = device["index"]
            device_list_api = host_audio_api_names[device["hostApi"]]
            device_list_name = device["name"]
            device_list_sample_rate = int(device["defaultSampleRate"])
            device_list_max_channels = audio.get_device_info_by_index(device_list_index)['maxOutputChannels']
            if device_list_max_channels >= 1:
                print(f"{device_list_name} [Sample Rate={device_list_sample_rate}] (Index={device_list_index})")
                print(
                    f"{device_list_name} [Sample Rate={device_list_sample_rate}, API={device_list_api}] (Index={device_list_index})")
        return

    # Load settings from file
    if config is not None:
        settings.SETTINGS_PATH = Path(Path.cwd() / config)
        settings.LoadYaml(settings.SETTINGS_PATH)

    # set process id
    settings.SetOption("process_id", os.getpid())

    for plugin_inst in Plugins.plugins:
        plugin_inst.init()

    print("###################################")
    print("# Whispering Tiger is starting... #")
    print("###################################")

    # set initial settings
    settings.SetOption("whisper_task", settings.GetArgumentSettingFallback(ctx, "task", "whisper_task"))

    # set audio settings
    device_index = settings.GetArgumentSettingFallback(ctx, "device_index", "device_index")
    settings.SetOption("device_index",
                       (device_index if device_index is None or device_index > -1 else None))
    device_out_index = settings.GetArgumentSettingFallback(ctx, "device_out_index", "device_out_index")
    settings.SetOption("device_out_index",
                       (device_out_index if device_out_index is None or device_out_index > -1 else None))

    audio_api = settings.SetOption("audio_api", settings.GetArgumentSettingFallback(ctx, "audio_api", "audio_api"))
    audio_api_index = get_audio_api_index_by_name(audio_api)

    audio_input_device = settings.GetOption("audio_input_device")
    if audio_input_device is not None and audio_input_device != "":
        if audio_input_device == "Default":
            device_index = None
        else:
            device_index = get_audio_device_index_by_name_and_api(audio_input_device, audio_api_index, True,
                                                                  device_index)
        settings.SetOption("device_index", device_index)

    audio_output_device = settings.GetOption("audio_output_device")
    if audio_output_device is not None and audio_output_device != "":
        if audio_output_device == "Default":
            device_out_index = None
        else:
            device_out_index = get_audio_device_index_by_name_and_api(audio_output_device, audio_api_index, False,
                                                                      device_out_index)
        settings.SetOption("device_out_index", device_out_index)

settings.SetOption("condition_on_previous_text",
settings.GetArgumentSettingFallback(ctx, "condition_on_previous_text",
"condition_on_previous_text"))
@@ -371,12 +478,28 @@ def main(ctx, devices, device_index, sample_rate, dynamic_energy, open_browser,
"vad_num_samples")))

frames = []
stream = py_audio.open(format=FORMAT,
channels=CHANNELS,
rate=SAMPLE_RATE,
input=True,
input_device_index=(device_index if device_index > -1 else None),
frames_per_buffer=CHUNK)

default_sample_rate = SAMPLE_RATE
recorded_sample_rate = SAMPLE_RATE
needs_sample_rate_conversion = False
try:
stream = py_audio.open(format=FORMAT,
channels=CHANNELS,
rate=default_sample_rate,
input=True,
input_device_index=(device_index if device_index > -1 else None),
frames_per_buffer=CHUNK)
except Exception as e:
print("opening stream failed, falling back to default sample rate")
dev_info = py_audio.get_device_info_by_index(device_index)
recorded_sample_rate = int(dev_info['defaultSampleRate'])
stream = py_audio.open(format=FORMAT,
channels=2,
rate=int(dev_info['defaultSampleRate']),
input=True,
input_device_index=(device_index if device_index > -1 else None),
frames_per_buffer=CHUNK)
needs_sample_rate_conversion = True
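The same probe could be done without the try/except around open() by asking PortAudio up front; a sketch of that alternative using PyAudio's is_format_supported, which raises ValueError for unsupported combinations:

def device_supports_rate(pa, device_index, rate, channels=1, fmt=pyaudio.paInt16):
    try:
        return pa.is_format_supported(rate,
                                      input_device=device_index,
                                      input_channels=channels,
                                      input_format=fmt)
    except ValueError:
        return False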

    audioprocessor.start_whisper_thread()

@@ -397,7 +520,7 @@ def main(ctx, devices, device_index, sample_rate, dynamic_energy, open_browser,
        clip_duration = phrase_time_limit
        fps = 0
        if clip_duration is not None:
            fps = int(SAMPLE_RATE / CHUNK * clip_duration)
            fps = int(default_sample_rate / CHUNK * clip_duration)

        end_time = time.time()
        elapsed_time = end_time - start_time
@@ -406,7 +529,11 @@ def main(ctx, devices, device_index, sample_rate, dynamic_energy, open_browser,

        audio_chunk = stream.read(num_samples)

        new_confidence, peak_amplitude = process_audio_chunk(audio_chunk, vad_model, SAMPLE_RATE)
        # special case which seems to be needed for WASAPI
        if needs_sample_rate_conversion:
            audio_chunk = resample_audio(audio_chunk, recorded_sample_rate, default_sample_rate, -1)

        new_confidence, peak_amplitude = process_audio_chunk(audio_chunk, vad_model, default_sample_rate)

        # put frames with recognized speech into a list and send to whisper
        # if (clip_duration is not None and len(frames) > fps) or (elapsed_time > 3 and len(frames) > 0):
@@ -425,10 +552,13 @@ def main(ctx, devices, device_index, sample_rate, dynamic_energy, open_browser,
            if vad_clip_test:
                audio_full_int16 = np.frombuffer(wavefiledata, np.int16)
                audio_full_float32 = int2float(audio_full_int16)
                full_audio_confidence = vad_model(torch.from_numpy(audio_full_float32), SAMPLE_RATE).item()
                full_audio_confidence = vad_model(torch.from_numpy(audio_full_float32), default_sample_rate).item()
                print(full_audio_confidence)

            if (not vad_clip_test) or (vad_clip_test and full_audio_confidence >= confidence_threshold):
                # debug save of audio clip
                # save_to_wav(wavefiledata, "resampled_audio_chunk.wav", default_sample_rate)

                audioprocessor.q.put(
                    {'time': time.time_ns(), 'data': audio_bytes_to_wav(wavefiledata), 'final': True})
                # vad_iterator.reset_states()  # reset model states after each audio
1 change: 1 addition & 0 deletions ignorelist.txt
@@ -19,6 +19,7 @@ Untertitel der Amara.org-Community
Untertitel von Stephanie Geiges
you
Go to Beadaholique.com for all of your beading supply needs!
and hopefully you enjoyed the video.
MBC 뉴스 이재경입니다.
This is the end of this video. Thank you for watching.
Thanks for watching and don't forget to like and subscribe!
3 changes: 2 additions & 1 deletion requirements.txt
@@ -7,6 +7,7 @@ ffmpeg-python==0.2.0
click>=8.1.3
PyAudio==0.2.13
PyAudioWPatch==0.2.12.5
resampy==0.4.2
SpeechRecognition==3.10.0
pydub>=0.25.1
git+https://github.com/openai/whisper.git
@@ -30,7 +31,7 @@ easyocr==1.6.2
mss==7.0.1
scipy==1.10.1
num2words==0.5.12
onnxruntime
onnxruntime==1.14.1
requests
# downgrade of scikit-image to v0.19.3 to prevent https://github.com/scikit-image/scikit-image/issues/6784
scikit-image==v0.19.3
8 changes: 5 additions & 3 deletions settings.py
@@ -26,8 +26,11 @@
"ocr_window_name": "VRChat", # window name for OCR image to text recognition.

# audio settings
"audio_input_device": "", # used by whispering tiger UI to select audio input device by name
"audio_output_device": "", # used by whispering tiger UI to select audio output device by name
"audio_api": "MME", # The name of the audio API. (MME, DirectSound, WASAPI)
"audio_input_device": "", # audio input device name - used by whispering tiger UI to select audio input device by name
"audio_output_device": "", # audio output device name - used by whispering tiger UI to select audio output device by name
"device_index": None, # input device index for STT
"device_out_index": None, # output device index for TTS

# whisper settings
"ai_device": None, # can be None (auto), "cuda" or "cpu".
@@ -77,7 +80,6 @@
"tts_enabled": True, # enable TTS
"tts_ai_device": "cpu", # can be "auto", "cuda" or "cpu".
"tts_answer": True, # send whisper results to TTS engine
"device_out_index": None, # output device index for TTS
"tts_model": ["en", "v3_en"], # TTS language and model to use
"tts_voice": "en_0", # TTS voice (one of silero tts voices, or "last" to use last used voice)
"tts_prosody_rate": "", # TTS voice speed. Can be "x-slow", "slow", "medium", "fast", "x-fast" or "" for default.
