Add TTS for the aishell3 Chinese dataset #1583

Open · wants to merge 8 commits into master
118 changes: 118 additions & 0 deletions .github/scripts/aishell3/TTS/run.sh
@@ -0,0 +1,118 @@
#!/usr/bin/env bash

set -ex

python3 -m pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
python3 -m pip install numba
python3 -m pip install pypinyin
python3 -m pip install cython

apt-get update
apt-get install -y jq

log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

cd egs/aishell3/TTS

sed -i.bak s/1000/10/g ./prepare.sh


function download_data() {
mkdir download
pushd download
curl -SL -O https://huggingface.co/csukuangfj/aishell3-ci-data/resolve/main/aishell3.tar.bz2
tar xf aishell3.tar.bz2
rm aishell3.tar.bz2
ls -lh
popd
}

function prepare_data() {
./prepare.sh

echo "----------tokens.txt----------"
cat data/tokens.txt
echo "------------------------------"
wc -l data/tokens.txt
echo "------------------------------"

echo "----------lexicon.txt----------"
head data/lexicon.txt
echo "----"
tail data/lexicon.txt
echo "----"
wc -l data/lexicon.txt
}

function train() {
pushd ./vits
sed -i.bak s/200/50/g ./train.py
git diff .
popd

# for t in low medium high; do
for t in low; do
./vits/train.py \
--exp-dir vits/exp-$t \
--model-type $t \
--num-epochs 1 \
--save-every-n 1 \
--num-buckets 2 \
--tokens data/tokens.txt \
--max-duration 20

ls -lh vits/exp-$t
done
}

function export_onnx() {
# for t in low medium high; do
for t in low; do
./vits/export-onnx.py \
--model-type $t \
--epoch 1 \
--exp-dir ./vits/exp-$t \
--tokens data/tokens.txt \
--speakers ./data/speakers.txt

ls -lh vits/exp-$t/
done
}

function test_low() {
git clone https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06
repo=icefall-tts-aishell3-vits-low-2024-04-06

./vits/export-onnx.py \
--model-type low \
--epoch 1000 \
--exp-dir $repo/exp \
--tokens $repo/data/tokens.txt \
--speakers $repo/data/speakers.txt

ls -lh $repo/exp/vits-epoch-1000.onnx

python3 -m pip install sherpa-onnx

sherpa-onnx-offline-tts \
--vits-model=$repo/exp/vits-epoch-1000.onnx \
--vits-tokens=$repo/data/tokens.txt \
--vits-lexicon=$repo/data/lexicon.txt \
--num-threads=1 \
--vits-length-scale=1.0 \
--sid=33 \
--output-filename=/icefall/low.wav \
--debug=1 \
"这是一个语音合成测试"
}


download_data
prepare_data
train
export_onnx
test_low
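Note: the `sherpa-onnx-offline-tts` invocation in `test_low` can also be reproduced from Python. The snippet below is only an illustrative sketch and is not part of this PR; it assumes the sherpa-onnx Python API (`OfflineTts` and its config classes) plus the `soundfile` package, and reuses the model, lexicon, and token paths from the script above.

```python
# Illustrative sketch only (not part of this PR): run the exported VITS model
# with the sherpa-onnx Python API instead of the sherpa-onnx-offline-tts CLI.
# Assumes `pip install sherpa-onnx soundfile`.
import sherpa_onnx
import soundfile as sf

repo = "icefall-tts-aishell3-vits-low-2024-04-06"

config = sherpa_onnx.OfflineTtsConfig(
    model=sherpa_onnx.OfflineTtsModelConfig(
        vits=sherpa_onnx.OfflineTtsVitsModelConfig(
            model=f"{repo}/exp/vits-epoch-1000.onnx",
            lexicon=f"{repo}/data/lexicon.txt",
            tokens=f"{repo}/data/tokens.txt",
        ),
        num_threads=1,
    ),
)
tts = sherpa_onnx.OfflineTts(config)

# sid=33 matches the speaker id used by the CLI example above.
audio = tts.generate("这是一个语音合成测试", sid=33, speed=1.0)
sf.write("low.wav", audio.samples, samplerate=audio.sample_rate)
```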
84 changes: 84 additions & 0 deletions .github/workflows/aishell3.yml
@@ -0,0 +1,84 @@
name: aishell3

on:
push:
branches:
- master
- tts-aishell3

pull_request:
branches:
- master

workflow_dispatch:

concurrency:
group: aishell3-${{ github.ref }}
cancel-in-progress: true

jobs:
generate_build_matrix:
if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && (github.event.label.name == 'ready' || github.event_name == 'push' || github.event_name == 'workflow_dispatch')

# see https://github.com/pytorch/pytorch/pull/50633
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Generating build matrix
id: set-matrix
run: |
# outputting for debugging purposes
python ./.github/scripts/docker/generate_build_matrix.py
MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
echo "::set-output name=matrix::${MATRIX}"
aishell3:
needs: generate_build_matrix
name: py${{ matrix.python-version }} torch${{ matrix.torch-version }} v${{ matrix.version }}
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}

steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Free space
shell: bash
run: |
df -h
rm -rf /opt/hostedtoolcache
df -h
echo "pwd: $PWD"
echo "github.workspace ${{ github.workspace }}"

- name: Run aishell3 tests
uses: addnab/docker-run-action@v3
with:
image: ghcr.io/${{ github.repository_owner }}/icefall:cpu-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}-v${{ matrix.version }}
options: |
--volume ${{ github.workspace }}/:/icefall
shell: bash
run: |
export PYTHONPATH=/icefall:$PYTHONPATH
cd /icefall
git config --global --add safe.directory /icefall

.github/scripts/aishell3/TTS/run.sh

- name: display files
shell: bash
run: |
ls -lh

- uses: actions/upload-artifact@v4
if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0'
with:
name: generated-test-files-${{ matrix.python-version }}-${{ matrix.torch-version }}
path: ./*.wav
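For context (an assumption on my part, not part of this diff): the `matrix` output that `fromJson()` expands in the `aishell3` job is a single-line JSON object printed by `generate_build_matrix.py`, and the job references `matrix.python-version`, `matrix.torch-version`, and `matrix.version`, so each entry presumably carries all three keys. A hypothetical illustration of that shape:

```python
# Hypothetical illustration (not the actual generate_build_matrix.py output):
# the workflow's strategy.matrix consumes keys named python-version,
# torch-version, and version, so each entry is assumed to carry all three.
import json

matrix = {
    "include": [
        {"python-version": "3.9", "torch-version": "2.2.0", "version": "1.0"},
        # ... further combinations would be appended here ...
    ]
}

# Printed on a single line so the workflow step can capture it as an output.
print(json.dumps(matrix))
```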
4 changes: 4 additions & 0 deletions .gitignore
@@ -36,3 +36,7 @@ node_modules
.DS_Store
*.fst
*.arpa
core.c
*.so
build
*.wav
2 changes: 1 addition & 1 deletion docs/source/recipes/TTS/ljspeech/vits.rst
@@ -19,7 +19,7 @@ Install extra dependencies
.. code-block:: bash

pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
pip install numba espnet_tts_frontend
pip install numba espnet_tts_frontend cython

Data preparation
----------------
110 changes: 110 additions & 0 deletions egs/aishell3/TTS/local/compute_spectrogram_aishell3.py
@@ -0,0 +1,110 @@
#!/usr/bin/env python3
# Copyright 2021-2023 Xiaomi Corp. (authors: Fangjun Kuang,
# Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""
This file computes spectrogram features of the aishell3 dataset.
It looks for manifests in the directory data/manifests.

The generated spectrogram features are saved in data/spectrogram.
"""

import logging
import os
from pathlib import Path

import torch
from lhotse import (
CutSet,
LilcomChunkyWriter,
Spectrogram,
SpectrogramConfig,
load_manifest,
)
from lhotse.audio import RecordingSet
from lhotse.supervision import SupervisionSet

from icefall.utils import get_executor

# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slows things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)


def compute_spectrogram_aishell3():
src_dir = Path("data/manifests")
output_dir = Path("data/spectrogram")
num_jobs = min(4, os.cpu_count())

sampling_rate = 8000
frame_length = 1024 / sampling_rate # (in second)
frame_shift = 256 / sampling_rate # (in second)
use_fft_mag = True

prefix = "aishell3"
suffix = "jsonl.gz"
partitions = ("test", "train")

config = SpectrogramConfig(
sampling_rate=sampling_rate,
frame_length=frame_length,
frame_shift=frame_shift,
use_fft_mag=use_fft_mag,
)
extractor = Spectrogram(config)

for partition in partitions:
recordings = load_manifest(
src_dir / f"{prefix}_recordings_{partition}.{suffix}", RecordingSet
)
supervisions = load_manifest(
src_dir / f"{prefix}_supervisions_{partition}.{suffix}", SupervisionSet
)

# resample from 44100 to 8000
recordings = recordings.resample(sampling_rate)

with get_executor() as ex: # Initialize the executor only once.
cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
if (output_dir / cuts_filename).is_file():
logging.info(f"{cuts_filename} already exists - skipping.")
continue  # skip this partition but keep processing the remaining ones
logging.info(f"Processing {partition}")
cut_set = CutSet.from_manifests(
recordings=recordings, supervisions=supervisions
)

cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 80,
executor=ex,
storage_type=LilcomChunkyWriter,
)
cut_set.to_file(output_dir / cuts_filename)


if __name__ == "__main__":
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"

logging.basicConfig(format=formatter, level=logging.INFO)
compute_spectrogram_aishell3()
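A quick way to sanity-check the output of this script (a minimal sketch, assuming lhotse is installed and the train partition has already been processed):

```python
# Minimal sanity-check sketch: load the cuts written by
# compute_spectrogram_aishell3() and inspect one example's features.
from lhotse import CutSet

cuts = CutSet.from_file("data/spectrogram/aishell3_cuts_train.jsonl.gz")
cut = next(iter(cuts))

feats = cut.load_features()  # numpy array of shape (num_frames, num_features)
print(cut.id, cut.duration, feats.shape)
```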