From d56964371cbd2d53eca619bb215276b272f0e462 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 1 Mar 2024 19:48:38 +0800 Subject: [PATCH] Support VITS models from icefall. (#625) --- sherpa-onnx/csrc/offline-tts-vits-impl.h | 6 ++++-- sherpa-onnx/csrc/offline-tts-vits-model-metadata.h | 1 + sherpa-onnx/csrc/offline-tts-vits-model.cc | 4 ++++ sherpa-onnx/csrc/piper-phonemize-lexicon.cc | 2 +- sherpa-onnx/csrc/session.cc | 4 +++- 5 files changed, 13 insertions(+), 4 deletions(-) diff --git a/sherpa-onnx/csrc/offline-tts-vits-impl.h b/sherpa-onnx/csrc/offline-tts-vits-impl.h index 978f557c3..2fc0f3098 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-impl.h +++ b/sherpa-onnx/csrc/offline-tts-vits-impl.h @@ -205,7 +205,8 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { if (meta_data.frontend == "characters") { frontend_ = std::make_unique( mgr, config_.model.vits.tokens, meta_data); - } else if ((meta_data.is_piper || meta_data.is_coqui) && + } else if ((meta_data.is_piper || meta_data.is_coqui || + meta_data.is_icefall) && !config_.model.vits.data_dir.empty()) { frontend_ = std::make_unique( mgr, config_.model.vits.tokens, config_.model.vits.data_dir, @@ -231,7 +232,8 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { if (meta_data.frontend == "characters") { frontend_ = std::make_unique( config_.model.vits.tokens, meta_data); - } else if ((meta_data.is_piper || meta_data.is_coqui) && + } else if ((meta_data.is_piper || meta_data.is_coqui || + meta_data.is_icefall) && !config_.model.vits.data_dir.empty()) { frontend_ = std::make_unique( config_.model.vits.tokens, config_.model.vits.data_dir, diff --git a/sherpa-onnx/csrc/offline-tts-vits-model-metadata.h b/sherpa-onnx/csrc/offline-tts-vits-model-metadata.h index 60e375540..e4e9d8864 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-model-metadata.h +++ b/sherpa-onnx/csrc/offline-tts-vits-model-metadata.h @@ -20,6 +20,7 @@ struct OfflineTtsVitsModelMetaData { bool is_piper = false; bool is_coqui = false; + bool is_icefall = false; // the following options are for models from coqui-ai/TTS int32_t blank_id = 0; diff --git a/sherpa-onnx/csrc/offline-tts-vits-model.cc b/sherpa-onnx/csrc/offline-tts-vits-model.cc index d3672ed6b..c41a193d6 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-model.cc +++ b/sherpa-onnx/csrc/offline-tts-vits-model.cc @@ -110,6 +110,10 @@ class OfflineTtsVitsModel::Impl { if (comment.find("coqui") != std::string::npos) { meta_data_.is_coqui = true; } + + if (comment.find("icefall") != std::string::npos) { + meta_data_.is_icefall = true; + } } Ort::Value RunVitsPiperOrCoqui(Ort::Value x, int64_t sid, float speed) { diff --git a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc index e63352043..ff5e9abe1 100644 --- a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc +++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc @@ -236,7 +236,7 @@ std::vector> PiperPhonemizeLexicon::ConvertTextToTokenIds( std::vector phoneme_ids; - if (meta_data_.is_piper) { + if (meta_data_.is_piper || meta_data_.is_icefall) { for (const auto &p : phonemes) { phoneme_ids = PiperPhonemesToIds(token2id_, p); ans.push_back(std::move(phoneme_ids)); diff --git a/sherpa-onnx/csrc/session.cc b/sherpa-onnx/csrc/session.cc index f4cb760ef..94987ebc9 100644 --- a/sherpa-onnx/csrc/session.cc +++ b/sherpa-onnx/csrc/session.cc @@ -105,11 +105,13 @@ static Ort::SessionOptions GetSessionOptionsImpl(int32_t num_threads, } else { SHERPA_ONNX_LOGE("Use nnapi"); } -#else +#elif defined(__ANDROID_API__) SHERPA_ONNX_LOGE( "Android NNAPI requires API level >= 27. Current API level %d " "Fallback to cpu!", (int32_t)__ANDROID_API__); +#else + SHERPA_ONNX_LOGE("NNAPI is for Android only. Fallback to cpu"); #endif break; }