diff --git a/Models/Multi/mms.py b/Models/Multi/mms.py
index bbb2450..278728d 100644
--- a/Models/Multi/mms.py
+++ b/Models/Multi/mms.py
@@ -1135,6 +1135,7 @@ class Mms(metaclass=SingletonMeta):
     device = None
     compute_type = "float32"
     compute_device = "cpu"
+    compute_device_str = "cpu"

     precision = None
     load_in_8bit = False
@@ -1146,9 +1147,6 @@ class Mms(metaclass=SingletonMeta):
     language_identification = None

     def __init__(self, model='mms-1b-fl102', compute_type="float32", device="cpu"):
-        self.compute_type = compute_type
-        self.compute_device = device
-
         self.load_model(model_size=model, compute_type=compute_type, device=device)

     @staticmethod
@@ -1167,13 +1165,21 @@ def _str_to_dtype_dict(self, dtype_str):
         else:
             return {'dtype': torch.float32, '4bit': False, '8bit': False}

-    def set_device(self, device: str):
-        if device == "cuda" or device == "auto":
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-        elif device == "direct-ml":
+    def set_device(self, device: str | None):
+        self.compute_device_str = device
+        if device is None or device == "cuda" or device == "auto" or device == "":
+            self.compute_device_str = "cuda" if torch.cuda.is_available() else "cpu"
+            device = torch.device(self.compute_device_str)
+        elif device == "cpu":
+            device = torch.device("cpu")
+        elif device.startswith("direct-ml"):
+            device_id = 0
+            device_id_split = device.split(":")
+            if len(device_id_split) > 1:
+                device_id = int(device_id_split[1])
             import torch_directml
-            device = torch_directml.device()
-        self.device = device
+            device = torch_directml.device(device_id)
+        self.compute_device = device

     def load_model(self, model_size='mms-1b-fl102', compute_type="float32", device="cpu"):
         model_path = Path(model_cache_path / model_size)
@@ -1183,7 +1189,7 @@ def load_model(self, model_size='mms-1b-fl102', compute_type="float32", device="
         compute_4bit = self._str_to_dtype_dict(self.compute_type).get('4bit', False)
         compute_8bit = self._str_to_dtype_dict(self.compute_type).get('8bit', False)
         self.compute_type = compute_type
-        self.compute_device = device
+        self.set_device(device)

         if self.model is None or model_size != self.previous_model:
             if self.model is not None:
diff --git a/Models/Multi/seamless_m4t.py b/Models/Multi/seamless_m4t.py
index 43eaeaf..4074413 100644
--- a/Models/Multi/seamless_m4t.py
+++ b/Models/Multi/seamless_m4t.py
@@ -178,14 +178,29 @@ def __init__(self, model='medium', compute_type="float32", device="cpu"):
         if self.device is None:
             self.device = device
-            if device == "cuda":
+            if device == "cuda" or device == "auto":
                 self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            elif device.startswith("direct-ml"):
+                device_id = 0
+                device_id_split = device.split(":")
+                if len(device_id_split) > 1:
+                    device_id = int(device_id_split[1])
+                import torch_directml
+                self.device = torch_directml.device(device_id)

         if self.model is None or self.processor is None:
             self.load_model(model_size=model)

     def set_device(self, device: str):
+        self.device_str = device
         if device == "cuda" or device == "auto":
             device = "cuda" if torch.cuda.is_available() else "cpu"
+        elif device.startswith("direct-ml"):
+            device_id = 0
+            device_id_split = device.split(":")
+            if len(device_id_split) > 1:
+                device_id = int(device_id_split[1])
+            import torch_directml
+            device = torch_directml.device(device_id)
         self.device = device

     @staticmethod
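Reviewer note: the same `direct-ml[:N]` parsing block recurs in every file this patch touches. A minimal sketch of a shared helper that reproduces the branch logic above (hypothetical refactoring, not part of the patch):

```python
import torch

def resolve_device(device: str | None):
    """Map an ai_device setting string to a torch / DirectML device object."""
    if device is None or device in ("", "auto", "cuda"):
        # prefer CUDA when available, otherwise fall back to CPU
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if device.startswith("direct-ml"):
        device_id = 0
        parts = device.split(":")
        if len(parts) > 1:
            device_id = int(parts[1])  # "direct-ml:1" selects adapter 1
        import torch_directml  # deferred import; only needed on DirectML setups
        return torch_directml.device(device_id)
    return torch.device(device)  # e.g. "cpu"
```

`resolve_device("direct-ml:1")` returns the second DirectML adapter, matching what each copied `set_device` branch does.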
diff --git a/Models/STT/speecht5.py b/Models/STT/speecht5.py
index 5df1a0b..569d76b 100644
--- a/Models/STT/speecht5.py
+++ b/Models/STT/speecht5.py
@@ -15,6 +15,13 @@ def __init__(self, device="cpu"):
         self.device = device
         if device == "cuda":
             self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        elif device.startswith("direct-ml"):
+            device_id = 0
+            device_id_split = device.split(":")
+            if len(device_id_split) > 1:
+                device_id = int(device_id_split[1])
+            import torch_directml
+            self.device = torch_directml.device(device_id)

         if self.model is None:
             self.load_model()
diff --git a/Models/STT/tansformer_whisper.py b/Models/STT/tansformer_whisper.py
index 56c5c13..dee9e78 100644
--- a/Models/STT/tansformer_whisper.py
+++ b/Models/STT/tansformer_whisper.py
@@ -17,6 +17,7 @@ class TransformerWhisper(metaclass=SingletonMeta):
     processor = None
     compute_type = "float32"
     compute_device = "cpu"
+    compute_device_str = "cpu"

     text_correction_model = None

@@ -33,7 +34,7 @@ class TransformerWhisper(metaclass=SingletonMeta):
     def __init__(self, compute_type="float32", device="cpu"):
         os.makedirs(self.model_cache_path, exist_ok=True)
         self.compute_type = compute_type
-        self.compute_device = device
+        self.set_compute_device(device)
         self.load_model_list()

         #if self._debug_skip_dl:
@@ -56,6 +57,19 @@ def set_compute_type(self, compute_type):
         self.compute_type = compute_type

     def set_compute_device(self, device):
+        self.compute_device_str = device
+        if device is None or device == "cuda" or device == "auto" or device == "":
+            self.compute_device_str = "cuda" if torch.cuda.is_available() else "cpu"
+            device = torch.device(self.compute_device_str)
+        elif device == "cpu":
+            device = torch.device("cpu")
+        elif device.startswith("direct-ml"):
+            device_id = 0
+            device_id_split = device.split(":")
+            if len(device_id_split) > 1:
+                device_id = int(device_id_split[1])
+            import torch_directml
+            device = torch_directml.device(device_id)
         self.compute_device = device

     def load_model_list(self):
@@ -108,7 +122,7 @@ def load_model(self, model='small', compute_type="float32", device="cpu"):
         compute_8bit = self._str_to_dtype_dict(self.compute_type).get('8bit', False)

         self.compute_type = compute_type
-        self.compute_device = device
+        self.set_compute_device(device)

         if not self._debug_skip_dl:
             self.download_model(model)
@@ -120,7 +134,7 @@
             self.previous_model = model
             self.release_model()
             print(f"Loading Whisper-Transformer model: {model} on {device} with {compute_type} precision...")
-            self.model = WhisperForConditionalGeneration.from_pretrained(str(Path(self.model_cache_path / model).resolve()), torch_dtype=compute_dtype, load_in_8bit=compute_8bit, load_in_4bit=compute_4bit)
+            self.model = WhisperForConditionalGeneration.from_pretrained(str(Path(self.model_cache_path / model).resolve()), torch_dtype=compute_dtype, load_in_8bit=compute_8bit, load_in_4bit=compute_4bit, device_map=self.compute_device)
             if not compute_8bit and not compute_4bit:
                 self.model = self.model.to(self.compute_device)
             self.processor = WhisperProcessor.from_pretrained(str(Path(self.model_cache_path / model).resolve()))
@@ -129,7 +143,7 @@ def transcribe(self, audio_sample, model, task, language,
                    return_timestamps=False, beam_size=4) -> dict:
-        self.load_model(model, self.compute_type, self.compute_device)
+        self.load_model(model, self.compute_type, self.compute_device_str)

         compute_dtype = self._str_to_dtype_dict(self.compute_type).get('dtype', torch.float32)
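Reviewer note: these hunks split the device into a string and an object: `compute_device` now holds a resolved device object (possibly a DirectML handle), while `compute_device_str` keeps the original selection string so `transcribe()` can re-enter `load_model()` with a value `set_compute_device()` can parse again. A sketch of the intended flow, assuming the class and module path from the diff (the printed values are illustrative assumptions):

```python
# Assumed behavior inferred from the hunks above; not an excerpt from the repo.
from Models.STT.tansformer_whisper import TransformerWhisper

w = TransformerWhisper(compute_type="float32", device="direct-ml:0")
print(w.compute_device_str)  # "direct-ml:0" -> a string, safe to re-parse later
print(w.compute_device)      # opaque device object from torch_directml.device(0)

# transcribe() now passes the string, not the device object, back into load_model():
#     self.load_model(model, self.compute_type, self.compute_device_str)
```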
diff --git a/Models/TTS/silero.py b/Models/TTS/silero.py
index d3141d2..a79469f 100644
--- a/Models/TTS/silero.py
+++ b/Models/TTS/silero.py
@@ -194,7 +194,7 @@ class Silero:
     sample_rate = 48000
     speaker = 'random'
    models = []
-    device = "cpu"  # cpu or cuda
+    device = "cpu"  # cpu, cuda or direct-ml

     rate = ""
     pitch = ""
@@ -339,7 +339,15 @@ def load(self):
         self.set_language(settings.GetOption('tts_model')[0])
         self.set_model(settings.GetOption('tts_model')[1])

-        device = torch.device(self.device)
+        if self.device.startswith("direct-ml"):
+            device_id = 0
+            device_id_split = self.device.split(":")
+            if len(device_id_split) > 1:
+                device_id = int(device_id_split[1])
+            import torch_directml
+            device = torch_directml.device(device_id)
+        else:
+            device = torch.device(self.device)

         # set cache path
         torch.hub.set_dir(str(Path(cache_path).resolve()))
diff --git a/audioWhisper.py b/audioWhisper.py
index 9e87a2d..ee497c4 100644
--- a/audioWhisper.py
+++ b/audioWhisper.py
@@ -547,6 +547,7 @@ def main(ctx, detect_energy, detect_energy_time, ui_download, devices, sample_ra
             print(e)

     vad_thread_num = int(1)
+    vad_model = None
     if vad_enabled:
         vad_model = VAD.VAD(vad_thread_num)

@@ -567,7 +568,7 @@ def main(ctx, detect_energy, detect_energy_time, ui_download, devices, sample_ra
     # prepare the plugin timer calls
     call_plugin_timer(Plugins)

-    if vad_enabled:
+    if vad_enabled and vad_model is not None:
         # num_samples = 1536
         vad_frames_per_buffer = int(settings.SETTINGS.SetOption("vad_frames_per_buffer",
                                                                 settings.SETTINGS.get_argument_setting_fallback(ctx, "vad_frames_per_buffer",
diff --git a/audioWhisper.spec b/audioWhisper.spec
index b8999b9..8c7cc4b 100644
--- a/audioWhisper.spec
+++ b/audioWhisper.spec
@@ -11,7 +11,7 @@ binaries = []
 # Collect dynamic libraries from onnxruntime
 binaries= collect_dynamic_libs('onnxruntime', destdir='onnxruntime/capi')

-hiddenimports = ['torch', 'pytorch', 'torchaudio.lib.libtorchaudio', 'scipy.signal', 'transformers.models.nllb', 'sentencepiece', 'df.deepfilternet3', 'bitsandbytes', 'faiss', 'faiss-cpu', 'praat-parselmouth', 'parselmouth', 'pyworld', 'torchcrepe', 'grpcio', 'grpc', 'annotated_types', 'Cython', 'nemo_toolkit', 'nemo', 'speechbrain', 'pyannote', 'pyannote.audio', 'pyannote.pipeline', 'noisereduce', 'frozendict']
+hiddenimports = ['torch', 'pytorch', 'torchaudio.lib.libtorchaudio', 'scipy.signal', 'transformers.models.nllb', 'sentencepiece', 'df.deepfilternet3', 'bitsandbytes', 'faiss', 'faiss-cpu', 'praat-parselmouth', 'parselmouth', 'pyworld', 'torchcrepe', 'grpcio', 'grpc', 'annotated_types', 'Cython', 'nemo_toolkit', 'nemo', 'speechbrain', 'pyannote', 'pyannote.audio', 'pyannote.pipeline', 'noisereduce', 'frozendict', 'torch_directml']
 datas += collect_data_files('torch')
 datas += collect_data_files('whisper')
 datas += collect_data_files('pykakasi')
@@ -110,6 +110,8 @@ tmp_ret = collect_all('noisereduce')
 datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
 tmp_ret = collect_all('frozendict')
 datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
+tmp_ret = collect_all('torch_directml')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]

 workdir = os.environ.get('WORKDIR_WIN', r'\drone\src')
 workdir = "C:" + workdir # Now workdir = "C:\drone\src"
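Reviewer note: every `direct-ml` branch imports `torch_directml` unconditionally, and the patch moves the dependency from `requirements.amd.txt` into the shared `requirements.txt` (see below), so an environment installed from an older requirements set could still lack it. A guarded variant as a suggestion, not what the patch does:

```python
import torch

def directml_device_or_cpu(device_id: int = 0):
    """Return a DirectML device, falling back to CPU if the package is missing."""
    try:
        import torch_directml
        return torch_directml.device(device_id)
    except ImportError:
        # torch-directml is Windows-only; fall back instead of crashing at load time
        return torch.device("cpu")
```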
diff --git a/audioprocessor.py b/audioprocessor.py
index fe5522f..e275567 100644
--- a/audioprocessor.py
+++ b/audioprocessor.py
@@ -428,9 +428,14 @@ def load_whisper(model, ai_device):
     if stt_type == "original_whisper":
         try:
             set_ai_device = ai_device
-            if ai_device == "direct-ml":
+
+            if ai_device.startswith("direct-ml"):
+                device_id = 0
+                device_id_split = ai_device.split(":")
+                if len(device_id_split) > 1:
+                    device_id = int(device_id_split[1])
                 import torch_directml
-                set_ai_device = torch_directml.device()
+                set_ai_device = torch_directml.device(device_id)
             return whisper.load_model(model, download_root=".cache/whisper", device=set_ai_device)
         except Exception as e:
             print("Failed to load whisper model. Application exits. " + str(e))
diff --git a/requirements.amd.txt b/requirements.amd.txt
index 32db530..81160f2 100644
--- a/requirements.amd.txt
+++ b/requirements.amd.txt
@@ -7,4 +7,3 @@
 torch==2.2.0.dev20231211+cpu
 torchaudio==2.2.0.dev20231211+cpu
 torchvision==0.17.0.dev20231211+cpu
-torch-directml
diff --git a/requirements.nvidia.txt b/requirements.nvidia.txt
index 9f27e1c..6e9738a 100644
--- a/requirements.nvidia.txt
+++ b/requirements.nvidia.txt
@@ -1,4 +1,4 @@
 --extra-index-url https://download.pytorch.org/whl/cu121
-torch==2.2.1
-torchvision==0.17.1
-torchaudio==2.2.1
+torch==2.3.1
+torchvision==0.18.1
+torchaudio==2.3.1
diff --git a/requirements.txt b/requirements.txt
index 2472317..1b61143 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,18 +1,19 @@
 # numpy v1.23.4 required for whisper
 numpy==1.24.2
-tqdm
+tqdm==4.66.4
 rich==12.6.0
-more-itertools
-librosa==0.10.1
+more-itertools==10.3.0
+librosa==0.10.2.post1
 #transformers==4.33.2
 #transformers @ https://github.com/Sharrnah/transformers/archive/refs/heads/add_seamless-m4t.zip
 #transformers @ https://github.com/huggingface/transformers/archive/84724efd101af52ed3d6af878e41ff8fd651a9cc.zip
 #transformers==4.35.0
 #transformers @ https://github.com/huggingface/transformers/archive/235e5d4991e8a0984aa78db91087b49622c7740e.zip
-transformers==4.42.3
+transformers==4.42.4
+torch-directml
 tensorboardX==2.6.2.2
-accelerate==0.30.1
+accelerate==0.32.1
 #optimum
 #flash-attn
 #bitsandbytes==0.41.1

@@ -25,7 +26,7 @@ PyAudio==0.2.14
 PyAudioWPatch==0.2.12.6
 resampy==0.4.3
 sounddevice==0.4.7
-SpeechRecognition==3.10.1
+SpeechRecognition==3.10.4
 pydub>=0.25.1
 git+https://github.com/openai/whisper.git
 #triton @ https://github.com/PrashantSaikia/Triton-for-Windows/raw/84739dfcb724845b301fbde6a738e15c3ed25905/triton-2.0.0-cp310-cp310-win_amd64.whl
diff --git a/settings.py b/settings.py
index 388ca7c..de2e10b 100644
--- a/settings.py
+++ b/settings.py
@@ -267,7 +267,7 @@ def get_available_models(self):

     def get_available_setting_values(self):
         possible_settings = {
-            "ai_device": ["None", "cuda", "cpu"],
+            "ai_device": ["None", "cuda", "cpu", "direct-ml:0", "direct-ml:1"],
             "model": self.get_available_models(),
             "whisper_task": ["transcribe", "translate"],
             "stt_type": ["faster_whisper", "original_whisper", "transformer_whisper", "seamless_m4t", "mms", "speech_t5", "wav2vec_bert", "nemo_canary", ""],
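Reviewer note: settings.py hard-codes two entries, `direct-ml:0` and `direct-ml:1`, rather than enumerating the adapters actually present. If the list were built dynamically, something like the following could generate it; `torch_directml.device_count()` and `torch_directml.device_name()` are real torch-directml APIs, but the surrounding wiring is a hypothetical sketch:

```python
import torch_directml

def available_directml_settings() -> list[str]:
    """Build 'direct-ml:N' entries for every DirectML adapter on this machine."""
    return [f"direct-ml:{i}" for i in range(torch_directml.device_count())]

# Quick inspection of what each setting value would map to:
for i in range(torch_directml.device_count()):
    print(f"direct-ml:{i} -> {torch_directml.device_name(i)}")
```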