Merge branch 'main' into support_negative_timestamps

openai · Jan 12, 2025 · e8ea46f · e8ea46f
2 parents 17de1f2 + 517a43e
commit e8ea46f
Show file tree

Hide file tree

Showing 18 changed files with 206 additions and 102 deletions.
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
@@ -33,5 +33,5 @@ jobs:
         TWINE_USERNAME: __token__
         TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
       run: |
-        python setup.py sdist
+        python -m build --sdist
         twine upload dist/*
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -11,19 +11,19 @@ jobs:
   pre-commit:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Fetch base branch
         run: git fetch origin ${{ github.base_ref }}
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
         with:
-          python-version: "3.8"
+          python-version: "3.9"
           architecture: x64
       - name: Get pip cache dir
         id: pip-cache
         run: |
           echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT
       - name: pip/pre-commit cache
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: |
             ${{ steps.pip-cache.outputs.dir }}
@@ -33,24 +33,43 @@ jobs:
             ${{ runner.os }}-pip-pre-commit
       - name: pre-commit
         run: |
-          pip install -U pre-commit
+          pip install --upgrade pre-commit
           pre-commit install --install-hooks
           pre-commit run --all-files
   whisper-test:
     needs: pre-commit
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.8', '3.9', '3.10', '3.11']
-        pytorch-version: [1.13.1, 2.0.0]
-        exclude:
-          - python-version: '3.11'
+        include:
+          - python-version: '3.8'
+            pytorch-version: 1.10.1
+            numpy-requirement: "'numpy<2'"
+          - python-version: '3.8'
             pytorch-version: 1.13.1
+            numpy-requirement: "'numpy<2'"
+          - python-version: '3.8'
+            pytorch-version: 2.0.1
+            numpy-requirement: "'numpy<2'"
+          - python-version: '3.9'
+            pytorch-version: 2.1.2
+            numpy-requirement: "'numpy<2'"
+          - python-version: '3.10'
+            pytorch-version: 2.2.2
+            numpy-requirement: "'numpy<2'"
+          - python-version: '3.11'
+            pytorch-version: 2.3.1
+            numpy-requirement: "'numpy'"
+          - python-version: '3.12'
+            pytorch-version: 2.4.1
+            numpy-requirement: "'numpy'"
+          - python-version: '3.12'
+            pytorch-version: 2.5.0
+            numpy-requirement: "'numpy'"
     steps:
-      - uses: conda-incubator/setup-miniconda@v2
+      - uses: conda-incubator/setup-miniconda@v3
       - run: conda install -n test ffmpeg python=${{ matrix.python-version }}
-      - run: pip3 install torch==${{ matrix.pytorch-version }}+cpu --index-url https://download.pytorch.org/whl/cpu
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - run: echo "$CONDA/envs/test/bin" >> $GITHUB_PATH
-      - run: pip install .["dev"]
+      - run: pip3 install .["dev"] ${{ matrix.numpy-requirement }} torch==${{ matrix.pytorch-version }}+cpu --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pypi.org/simple
       - run: pytest --durations=0 -vv -k 'not test_transcribe or test_transcribe[tiny] or test_transcribe[tiny.en]' -m 'not requires_cuda'
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.0.1
+    rev: v5.0.0
     hooks:
       - id: check-json
       - id: end-of-file-fixer
@@ -11,17 +11,17 @@ repos:
       - id: check-added-large-files
         args: [--maxkb=4096]
   - repo: https://github.com/psf/black
-    rev: 23.7.0
+    rev: 24.10.0
     hooks:
       - id: black
   - repo: https://github.com/pycqa/isort
-    rev: 5.12.0
+    rev: 5.13.2
     hooks:
       - id: isort
         name: isort (python)
         args: ["--profile", "black", "-l", "88", "--trailing-comma", "--multi-line", "3"]
   - repo: https://github.com/pycqa/flake8.git
-    rev: 6.0.0
+    rev: 7.1.1
     hooks:
       - id: flake8
         types: [python]

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,19 @@
 # CHANGELOG
 
+## [v20240930](https://github.com/openai/whisper/releases/tag/v20240930)
+
+* allowing numpy 2 in tests ([#2362](https://github.com/openai/whisper/pull/2362))
+* large-v3-turbo model ([#2361](https://github.com/openai/whisper/pull/2361))
+* test on python/pytorch versions up to 3.12 and 2.4.1 ([#2360](https://github.com/openai/whisper/pull/2360))
+* using sdpa if available ([#2359](https://github.com/openai/whisper/pull/2359))
+
+## [v20240927](https://github.com/openai/whisper/releases/tag/v20240927)
+
+* pinning numpy<2 in tests ([#2332](https://github.com/openai/whisper/pull/2332))
+* Relax triton requirements for compatibility with pytorch 2.4 and newer ([#2307](https://github.com/openai/whisper/pull/2307))
+* Skip silence around hallucinations ([#1838](https://github.com/openai/whisper/pull/1838))
+* Fix triton env marker ([#1887](https://github.com/openai/whisper/pull/1887))
+
 ## [v20231117](https://github.com/openai/whisper/releases/tag/v20231117)
 
 * Relax triton requirements for compatibility with pytorch 2.1 and newer ([#1802](https://github.com/openai/whisper/pull/1802))

diff --git a/README.md b/README.md
@@ -57,17 +57,21 @@ pip install setuptools-rust
 
 ## Available models and languages
 
-There are five model sizes, four with English-only versions, offering speed and accuracy tradeoffs. Below are the names of the available models and their approximate memory requirements and inference speed relative to the large model; actual speed may vary depending on many factors including the available hardware.
+There are six model sizes, four with English-only versions, offering speed and accuracy tradeoffs.
+Below are the names of the available models and their approximate memory requirements and inference speed relative to the large model.
+The relative speeds below are measured by transcribing English speech on an A100, and the real-world speed may vary significantly depending on many factors including the language, the speaking speed, and the available hardware.
 
 |  Size  | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed |
 |:------:|:----------:|:------------------:|:------------------:|:-------------:|:--------------:|
-|  tiny  |    39 M    |     `tiny.en`      |       `tiny`       |     ~1 GB     |      ~32x      |
-|  base  |    74 M    |     `base.en`      |       `base`       |     ~1 GB     |      ~16x      |
-| small  |   244 M    |     `small.en`     |      `small`       |     ~2 GB     |      ~6x       |
+|  tiny  |    39 M    |     `tiny.en`      |       `tiny`       |     ~1 GB     |      ~10x      |
+|  base  |    74 M    |     `base.en`      |       `base`       |     ~1 GB     |      ~7x       |
+| small  |   244 M    |     `small.en`     |      `small`       |     ~2 GB     |      ~4x       |
 | medium |   769 M    |    `medium.en`     |      `medium`      |     ~5 GB     |      ~2x       |
 | large  |   1550 M   |        N/A         |      `large`       |    ~10 GB     |       1x       |
+| turbo  |   809 M    |        N/A         |      `turbo`       |     ~6 GB     |      ~8x       |
 
 The `.en` models for English-only applications tend to perform better, especially for the `tiny.en` and `base.en` models. We observed that the difference becomes less significant for the `small.en` and `medium.en` models.
+Additionally, the `turbo` model is an optimized version of `large-v3` that offers faster transcription speed with a minimal degradation in accuracy.
 
 Whisper's performance varies widely depending on the language. The figure below shows a performance breakdown of `large-v3` and `large-v2` models by language, using WERs (word error rates) or CER (character error rates, shown in *Italic*) evaluated on the Common Voice 15 and Fleurs datasets. Additional WER/CER metrics corresponding to the other models and datasets can be found in Appendix D.1, D.2, and D.4 of [the paper](https://arxiv.org/abs/2212.04356), as well as the BLEU (Bilingual Evaluation Understudy) scores for translation in Appendix D.3.
 
@@ -77,11 +81,11 @@ Whisper's performance varies widely depending on the language. The figure below
 
 ## Command-line usage
 
-The following command will transcribe speech in audio files, using the `medium` model:
+The following command will transcribe speech in audio files, using the `turbo` model:
 
-    whisper audio.flac audio.mp3 audio.wav --model medium
+    whisper audio.flac audio.mp3 audio.wav --model turbo
 
-The default setting (which selects the `small` model) works well for transcribing English. To transcribe an audio file containing non-English speech, you can specify the language using the `--language` option:
+The default setting (which selects the `turbo` model) works well for transcribing English. To transcribe an audio file containing non-English speech, you can specify the language using the `--language` option:
 
     whisper japanese.wav --language Japanese
 
@@ -103,7 +107,7 @@ Transcription can also be performed within Python:
 ```python
 import whisper
 
-model = whisper.load_model("base")
+model = whisper.load_model("turbo")
 result = model.transcribe("audio.mp3")
 print(result["text"])
 ```
@@ -115,14 +119,14 @@ Below is an example usage of `whisper.detect_language()` and `whisper.decode()`
 ```python
 import whisper
 
-model = whisper.load_model("base")
+model = whisper.load_model("turbo")
 
 # load audio and pad/trim it to fit 30 seconds
 audio = whisper.load_audio("audio.mp3")
 audio = whisper.pad_or_trim(audio)
 
 # make log-Mel spectrogram and move to the same device as the model
-mel = whisper.log_mel_spectrogram(audio).to(model.device)
+mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)
 
 # detect the spoken language
 _, probs = model.detect_language(mel)

diff --git a/data/README.md b/data/README.md
@@ -45,7 +45,7 @@ We downloaded the [CHiME-5 dataset](https://spandh.dcs.shef.ac.uk//chime_challen
 
 ### AMI-IHM, AMI-SDM1
 
-We preprocessed the [AMI Corpus](https://groups.inf.ed.ac.uk/ami/corpus/overview.shtml) by following the stage 0 ad 2 of the [s5b recipe](https://github.com/kaldi-asr/kaldi/tree/master/egs/ami/s5b).
+We preprocessed the [AMI Corpus](https://groups.inf.ed.ac.uk/ami/corpus/overview.shtml) by following stages 0 and 2 of the [s5b recipe](https://github.com/kaldi-asr/kaldi/tree/master/egs/ami/s5b).
 
 
 ## Long-form English-only datasets

diff --git a/model-card.md b/model-card.md
@@ -16,13 +16,15 @@ The Whisper models are trained for speech recognition and translation tasks, cap
 | small  |   244 M    |         ✓          |         ✓          |
 | medium |   769 M    |         ✓          |         ✓          |
 | large  |   1550 M   |                    |         ✓          |
+| turbo  |   798 M    |                    |         ✓          |
 
 In December 2022, we [released an improved large model named `large-v2`](https://github.com/openai/whisper/discussions/661), and `large-v3` in November 2023.
+Additionally, we've added a `turbo` model in September 2024 which is optimized for inference speed.
 
 
 ### Release date
 
-September 2022 (original series), December 2022 (`large-v2`), and November 2023 (`large-v3`)
+September 2022 (original series), December 2022 (`large-v2`), November 2023 (`large-v3`), and September 2024 (`large-v3-turbo`)
 
 ### Model type
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,8 +1,54 @@
+[build-system]
+build-backend = "setuptools.build_meta"
+
+requires = [ "setuptools>=61.2" ]
+
+[project]
+name = "openai-whisper"
+description = "Robust Speech Recognition via Large-Scale Weak Supervision"
+readme.content-type = "text/markdown"
+readme.file = "README.md"
+license = { text = "MIT" }
+authors = [ { name = "OpenAI" } ]
+requires-python = ">=3.8"
+classifiers = [
+  "Programming Language :: Python :: 3 :: Only",
+  "Programming Language :: Python :: 3.8",
+  "Programming Language :: Python :: 3.9",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
+]
+dynamic = [ "version" ]
+dependencies = [
+  "more-itertools",
+  "numba",
+  "numpy",
+  "tiktoken",
+  "torch",
+  "tqdm",
+  "triton>=2; (platform_machine=='x86_64' and sys_platform=='linux') or sys_platform=='linux2'",
+]
+optional-dependencies.dev = [ "black", "flake8", "isort", "pytest", "scipy" ]
+urls = { Homepage = "https://github.com/openai/whisper" }
+scripts.whisper = "whisper.transcribe:cli"
+
+[tool.setuptools]
+py-modules = [ "whisper" ]
+include-package-data = true
+
+[tool.setuptools.dynamic]
+version = { attr = "whisper.version.__version__" }
+
+[tool.setuptools.packages.find]
+exclude = [ "tests*" ]
+namespaces = false
+
 [tool.black]
 
 [tool.isort]
 profile = "black"
 include_trailing_comma = true
 line_length = 88
 multi_line_output = 3
-
diff --git a/requirements.txt b/requirements.txt
@@ -4,4 +4,4 @@ torch
 tqdm
 more-itertools
 tiktoken
-triton>=2.0.0,<3;platform_machine=="x86_64" and sys_platform=="linux" or sys_platform=="linux2"
+triton>=2.0.0;platform_machine=="x86_64" and sys_platform=="linux" or sys_platform=="linux2"
diff --git a/setup.py b/setup.py
diff --git a/whisper/__init__.py b/whisper/__init__.py
@@ -27,6 +27,8 @@
     "large-v2": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
     "large-v3": "https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt",
     "large": "https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt",
+    "large-v3-turbo": "https://openaipublic.azureedge.net/main/whisper/models/aff26ae408abcba5fbf8813c21e62b0941638c5f6eebfb145be0c9839262a19a/large-v3-turbo.pt",
+    "turbo": "https://openaipublic.azureedge.net/main/whisper/models/aff26ae408abcba5fbf8813c21e62b0941638c5f6eebfb145be0c9839262a19a/large-v3-turbo.pt",
 }
 
 # base85-encoded (n_layers, n_heads) boolean arrays indicating the cross-attention heads that are
@@ -44,6 +46,8 @@
     "large-v2": b"ABzY8zd+h!0{>%R7=D0pU<_bnWW*tkYAhobTNnu$jnkEkXqp)j;w1Tzk)UH3X%SZd&fFZ2fC2yj",
     "large-v3": b"ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00",
     "large": b"ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00",
+    "large-v3-turbo": b"ABzY8j^C+e0{>%RARaKHP%t(lGR*)0g!tONPyhe`",
+    "turbo": b"ABzY8j^C+e0{>%RARaKHP%t(lGR*)0g!tONPyhe`",
 }
 
 

diff --git a/whisper/audio.py b/whisper/audio.py
@@ -122,7 +122,7 @@ def log_mel_spectrogram(
         The path to an audio file, or a NumPy array or Tensor containing the audio waveform sampled at 16 kHz
 
     n_mels: int
-        The number of Mel-frequency filters, only 80 is supported
+        The number of Mel-frequency filters, only 80 and 128 are supported
 
     padding: int
         Number of zero samples to pad to the right
@@ -132,7 +132,7 @@ def log_mel_spectrogram(
 
     Returns
     -------
-    torch.Tensor, shape = (80, n_frames)
+    torch.Tensor, shape = (n_mels, n_frames)
         A Tensor that contains the Mel spectrogram
     """
     if not torch.is_tensor(audio):