From de7682a2f0a9b74e0f8541452d254931ce422ef9 Mon Sep 17 00:00:00 2001
From: Guillaume Klein
Date: Fri, 24 Mar 2023 10:55:55 +0100
Subject: [PATCH] Automatically download converted models from the Hugging Face Hub (#70)

* Automatically download converted models from the Hugging Face Hub

* Remove unused import

* Remove unneeded requirements in dev mode

* Remove extra index URL when pip install in CI

* Allow downloading to a specific directory

* Update docstring

* Add argument to disable the progress bars

* Fix typo in docstring
---
 .github/workflows/ci.yml              |  6 ++--
 README.md                             | 49 +++++++++++++--------------
 faster_whisper/__init__.py            |  3 +-
 faster_whisper/transcribe.py          | 12 +++++--
 faster_whisper/utils.py               | 45 ++++++++++++++++++++++++
 requirements.txt                      |  1 +
 setup.py                              |  3 +-
 tests/conftest.py                     | 18 ----------
 tests/{test.py => test_transcribe.py} |  4 +--
 tests/test_utils.py                   | 17 ++++++++++
 10 files changed, 105 insertions(+), 53 deletions(-)
 rename tests/{test.py => test_transcribe.py} (88%)
 create mode 100644 tests/test_utils.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0508273a..407661fd 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -25,7 +25,7 @@ jobs:
       - name: Install module
         run: |
           pip install wheel
-          pip install .[dev] --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install -e .[dev]
 
       - name: Check code format with Black
         run: |
@@ -55,11 +55,11 @@ jobs:
       - name: Install module
         run: |
           pip install wheel
-          pip install .[dev] --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install -e .[dev]
 
       - name: Run pytest
         run: |
-          pytest -v tests/test.py
+          pytest -v tests/
 
   build-and-push-package:
diff --git a/README.md b/README.md
index be927538..3fa1ede9 100644
--- a/README.md
+++ b/README.md
@@ -44,12 +44,6 @@ The module can be installed from [PyPI](https://pypi.org/project/faster-whisper/
 pip install faster-whisper
 ```
 
-The model conversion script requires the modules `transformers` and `torch` which can be installed with the `[conversion]` extra requirement:
-
-```bash
-pip install faster-whisper[conversion]
-```
-
 **Other installation methods:**
 
 ```bash
@@ -70,35 +64,20 @@ GPU execution requires the NVIDIA libraries cuBLAS 11.x and cuDNN 8.x to be inst
 
 ## Usage
 
-### Model conversion
-
-A Whisper model should be first converted into the CTranslate2 format. We provide a script to download and convert models from the [Hugging Face model repository](https://huggingface.co/models?sort=downloads&search=whisper).
-
-For example the command below converts the "large-v2" Whisper model and saves the weights in FP16:
-
-```bash
-ct2-transformers-converter --model openai/whisper-large-v2 --output_dir whisper-large-v2-ct2 \
-    --copy_files tokenizer.json --quantization float16
-```
-
-If the option `--copy_files tokenizer.json` is not used, the tokenizer configuration is automatically downloaded when the model is loaded later.
-
-Models can also be converted from the code. See the [conversion API](https://opennmt.net/CTranslate2/python/ctranslate2.converters.TransformersConverter.html).
-
 ### Transcription
 
 ```python
 from faster_whisper import WhisperModel
 
-model_path = "whisper-large-v2-ct2/"
+model_size = "large-v2"
 
 # Run on GPU with FP16
-model = WhisperModel(model_path, device="cuda", compute_type="float16")
+model = WhisperModel(model_size, device="cuda", compute_type="float16")
 
 # or run on GPU with INT8
-# model = WhisperModel(model_path, device="cuda", compute_type="int8_float16")
+# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
 
 # or run on CPU with INT8
-# model = WhisperModel(model_path, device="cpu", compute_type="int8")
+# model = WhisperModel(model_size, device="cpu", compute_type="int8")
 
 segments, info = model.transcribe("audio.mp3", beam_size=5)
 
@@ -120,6 +99,26 @@ for segment in segments:
 
 See more model and transcription options in the [`WhisperModel`](https://github.com/guillaumekln/faster-whisper/blob/master/faster_whisper/transcribe.py) class implementation.
 
+## Model conversion
+
+When loading a model from its size such as `WhisperModel("large-v2")`, the corresponding CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/guillaumekln).
+
+We also provide a script to convert any Whisper model compatible with the Transformers library, whether it is an original OpenAI model or a user fine-tuned model.
+
+For example, the command below converts the [original "large-v2" Whisper model](https://huggingface.co/openai/whisper-large-v2) and saves the weights in FP16:
+
+```bash
+pip install transformers[torch]>=4.23
+
+ct2-transformers-converter --model openai/whisper-large-v2 --output_dir whisper-large-v2-ct2 \
+    --copy_files tokenizer.json --quantization float16
+```
+
+* The option `--model` accepts a model name on the Hub or a path to a model directory.
+* If the option `--copy_files tokenizer.json` is not used, the tokenizer configuration is automatically downloaded when the model is loaded later.
+
+Models can also be converted from the code. See the [conversion API](https://opennmt.net/CTranslate2/python/ctranslate2.converters.TransformersConverter.html).
+
 ## Comparing performance against other implementations
 
 If you are comparing the performance against other Whisper implementations, you should make sure to run the comparison with similar settings. In particular:
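
The "conversion API" mentioned in the README addition above can also be driven from Python. A minimal sketch, mirroring the `TransformersConverter` calls that the removed test fixture further down in this patch relied on; the output directory name is a hypothetical choice:

```python
import ctranslate2

# Convert the original "large-v2" model from the Hugging Face Hub,
# bundling the tokenizer file and loading the weights in FP16.
converter = ctranslate2.converters.TransformersConverter(
    "openai/whisper-large-v2",       # Hub model name or local model directory
    copy_files=["tokenizer.json"],   # ship the tokenizer with the converted model
    load_as_float16=True,            # load weights in FP16 before quantization
)
converter.convert("whisper-large-v2-ct2", quantization="float16")
```
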
diff --git a/faster_whisper/__init__.py b/faster_whisper/__init__.py
index eea2e649..add677e2 100644
--- a/faster_whisper/__init__.py
+++ b/faster_whisper/__init__.py
@@ -1,9 +1,10 @@
 from faster_whisper.audio import decode_audio
 from faster_whisper.transcribe import WhisperModel
-from faster_whisper.utils import format_timestamp
+from faster_whisper.utils import download_model, format_timestamp
 
 __all__ = [
     "decode_audio",
     "WhisperModel",
+    "download_model",
     "format_timestamp",
 ]
diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
index f65f3d2e..3f8b3082 100644
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -11,6 +11,7 @@
 from faster_whisper.audio import decode_audio
 from faster_whisper.feature_extractor import FeatureExtractor
 from faster_whisper.tokenizer import Tokenizer
+from faster_whisper.utils import download_model
 
 
 class Word(NamedTuple):
@@ -57,7 +58,7 @@ class TranscriptionOptions(NamedTuple):
 class WhisperModel:
     def __init__(
         self,
-        model_path: str,
+        model_size_or_path: str,
         device: str = "auto",
         device_index: Union[int, List[int]] = 0,
         compute_type: str = "default",
@@ -67,7 +68,9 @@ def __init__(
         """Initializes the Whisper model.
 
         Args:
-          model_path: Path to the converted model.
+          model_size_or_path: Size of the model to use (e.g. "large-v2", "small", "tiny.en", etc.)
+            or a path to a converted model directory. When a size is configured, the converted
+            model is downloaded from the Hugging Face Hub.
           device: Device to use for computation ("cpu", "cuda", "auto").
           device_index: Device ID to use.
             The model can also be loaded on multiple GPUs by passing a list of IDs
@@ -82,6 +85,11 @@ def __init__(
             (concurrent calls to self.model.generate() will run in parallel).
             This can improve the global throughput at the cost of increased memory usage.
         """
+        if os.path.isdir(model_size_or_path):
+            model_path = model_size_or_path
+        else:
+            model_path = download_model(model_size_or_path)
+
         self.model = ctranslate2.models.Whisper(
             model_path,
             device=device,
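
After this change, `WhisperModel` resolves its first argument in two ways: an existing directory is loaded as-is, and anything else is treated as a model size to download. A minimal usage sketch; the local directory path is a hypothetical example of a previously converted model:

```python
from faster_whisper import WhisperModel

# A model size: the converted model is downloaded from the Hugging Face Hub
# (and served from the local cache on subsequent runs).
model = WhisperModel("tiny")

# A directory path: the converted model is loaded directly, nothing is downloaded.
model = WhisperModel("whisper-large-v2-ct2/")
```
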
diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py
index 140ea9eb..ee459aa9 100644
--- a/faster_whisper/utils.py
+++ b/faster_whisper/utils.py
@@ -1,3 +1,42 @@
+from typing import Optional
+
+import huggingface_hub
+
+from tqdm.auto import tqdm
+
+
+def download_model(
+    size: str,
+    output_dir: Optional[str] = None,
+    show_progress_bars: bool = True,
+):
+    """Downloads a CTranslate2 Whisper model from the Hugging Face Hub.
+
+    The model is downloaded from https://huggingface.co/guillaumekln.
+
+    Args:
+      size: Size of the model to download (tiny, tiny.en, base, base.en, small, small.en,
+        medium, medium.en, or large-v2).
+      output_dir: Directory where the model should be saved. If not set, the model is saved in
+        the standard Hugging Face cache directory.
+      show_progress_bars: Show the tqdm progress bars during the download.
+
+    Returns:
+      The path to the downloaded model.
+    """
+    repo_id = "guillaumekln/faster-whisper-%s" % size
+    kwargs = {}
+
+    if output_dir is not None:
+        kwargs["local_dir"] = output_dir
+        kwargs["local_dir_use_symlinks"] = False
+
+    if not show_progress_bars:
+        kwargs["tqdm_class"] = disabled_tqdm
+
+    return huggingface_hub.snapshot_download(repo_id, **kwargs)
+
+
 def format_timestamp(
     seconds: float,
     always_include_hours: bool = False,
@@ -19,3 +58,9 @@ def format_timestamp(
     return (
         f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
     )
+
+
+class disabled_tqdm(tqdm):
+    def __init__(self, *args, **kwargs):
+        kwargs["disable"] = True
+        super().__init__(*args, **kwargs)
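
The new `download_model` function can also be called directly, for example to pre-fetch a model into a known location. A minimal sketch based on the signature above; the target directory name is hypothetical:

```python
from faster_whisper import download_model

# Download the "tiny" model into an explicit directory without progress bars.
# With output_dir set, real files are copied rather than symlinked,
# which is what tests/test_utils.py below verifies.
model_dir = download_model(
    "tiny",
    output_dir="models/faster-whisper-tiny",
    show_progress_bars=False,
)

# The returned path can then be passed to WhisperModel as a local directory.
print(model_dir)  # models/faster-whisper-tiny
```
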
diff --git a/requirements.txt b/requirements.txt
index fdecf4d7..a8eb983b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 av==10.*
 ctranslate2>=3.10,<4
+huggingface_hub>=0.13
 tokenizers==0.13.*
diff --git a/setup.py b/setup.py
index 57a11caf..1edb8845 100644
--- a/setup.py
+++ b/setup.py
@@ -48,8 +48,7 @@ def get_requirements(path):
     install_requires=install_requires,
     extras_require={
         "conversion": conversion_requires,
-        "dev": conversion_requires
-        + [
+        "dev": [
             "black==23.*",
             "flake8==6.*",
             "isort==5.*",
diff --git a/tests/conftest.py b/tests/conftest.py
index be0d44f8..1a1ee1d1 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,6 +1,5 @@
 import os
 
-import ctranslate2
 import pytest
 
 
@@ -12,20 +11,3 @@ def data_dir():
 @pytest.fixture
 def jfk_path(data_dir):
     return os.path.join(data_dir, "jfk.flac")
-
-
-@pytest.fixture(scope="session")
-def tiny_model_dir(tmp_path_factory):
-    model_path = str(tmp_path_factory.mktemp("data") / "model")
-    convert_model("tiny", model_path)
-    return model_path
-
-
-def convert_model(size, output_dir):
-    name = "openai/whisper-%s" % size
-
-    ctranslate2.converters.TransformersConverter(
-        name,
-        copy_files=["tokenizer.json"],
-        load_as_float16=True,
-    ).convert(output_dir, quantization="float16")
diff --git a/tests/test.py b/tests/test_transcribe.py
similarity index 88%
rename from tests/test.py
rename to tests/test_transcribe.py
index de53577a..575bbd4e 100644
--- a/tests/test.py
+++ b/tests/test_transcribe.py
@@ -1,8 +1,8 @@
 from faster_whisper import WhisperModel
 
 
-def test_transcribe(tiny_model_dir, jfk_path):
-    model = WhisperModel(tiny_model_dir)
+def test_transcribe(jfk_path):
+    model = WhisperModel("tiny")
     segments, info = model.transcribe(jfk_path, word_timestamps=True)
 
     assert info.language == "en"
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 00000000..3e981f63
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,17 @@
+import os
+
+from faster_whisper import download_model
+
+
+def test_download_model(tmpdir):
+    output_dir = str(tmpdir.join("model"))
+
+    model_dir = download_model("tiny", output_dir=output_dir)
+
+    assert model_dir == output_dir
+    assert os.path.isdir(model_dir)
+    assert not os.path.islink(model_dir)
+
+    for filename in os.listdir(model_dir):
+        path = os.path.join(model_dir, filename)
+        assert not os.path.islink(path)