diff --git a/.github/workflows/c-api.yaml b/.github/workflows/c-api.yaml
index 049240d776..81e81ce384 100644
--- a/.github/workflows/c-api.yaml
+++ b/.github/workflows/c-api.yaml
@@ -81,6 +81,45 @@ jobs:
             otool -L ./install/lib/libsherpa-onnx-c-api.dylib
           fi
 
+      - name: Test Matcha TTS (zh)
+        shell: bash
+        run: |
+          gcc -o matcha-tts-zh-c-api ./c-api-examples/matcha-tts-zh-c-api.c \
+            -I ./build/install/include \
+            -L ./build/install/lib/ \
+            -l sherpa-onnx-c-api \
+            -l onnxruntime
+
+          wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
+          tar xvf matcha-icefall-zh-baker.tar.bz2
+          rm matcha-icefall-zh-baker.tar.bz2
+
+          wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+          ./matcha-tts-zh-c-api
+
+      - name: Test Matcha TTS (en)
+        shell: bash
+        run: |
+          gcc -o matcha-tts-en-c-api ./c-api-examples/matcha-tts-en-c-api.c \
+            -I ./build/install/include \
+            -L ./build/install/lib/ \
+            -l sherpa-onnx-c-api \
+            -l onnxruntime
+
+          wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+          tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
+          rm matcha-icefall-en_US-ljspeech.tar.bz2
+
+          wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+          ./matcha-tts-en-c-api
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: matcha-tts-${{ matrix.os }}
+          path: ./generated-matcha-*.wav
+
       - name: Test vad + Whisper tiny.en
         shell: bash
         run: |
diff --git a/c-api-examples/CMakeLists.txt b/c-api-examples/CMakeLists.txt
index c7db2bc275..9c84932f23 100644
--- a/c-api-examples/CMakeLists.txt
+++ b/c-api-examples/CMakeLists.txt
@@ -7,6 +7,12 @@ target_link_libraries(decode-file-c-api sherpa-onnx-c-api cargs)
 if(SHERPA_ONNX_ENABLE_TTS)
   add_executable(offline-tts-c-api offline-tts-c-api.c)
   target_link_libraries(offline-tts-c-api sherpa-onnx-c-api cargs)
+
+  add_executable(matcha-tts-zh-c-api matcha-tts-zh-c-api.c)
+  target_link_libraries(matcha-tts-zh-c-api sherpa-onnx-c-api cargs)
+
+  add_executable(matcha-tts-en-c-api matcha-tts-en-c-api.c)
+  target_link_libraries(matcha-tts-en-c-api sherpa-onnx-c-api cargs)
 endif()
 
 if(SHERPA_ONNX_ENABLE_SPEAKER_DIARIZATION)
diff --git a/c-api-examples/matcha-tts-en-c-api.c b/c-api-examples/matcha-tts-en-c-api.c
new file mode 100644
index 0000000000..103ecd5237
--- /dev/null
+++ b/c-api-examples/matcha-tts-en-c-api.c
@@ -0,0 +1,107 @@
+// c-api-examples/matcha-tts-en-c-api.c
+//
+// Copyright (c) 2025 Xiaomi Corporation
+
+// This file shows how to use sherpa-onnx C API
+// for English TTS with MatchaTTS.
+//
+// clang-format off
+/*
+Usage
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
+rm matcha-icefall-en_US-ljspeech.tar.bz2
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+./matcha-tts-en-c-api
+
+ */
+// clang-format on
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "sherpa-onnx/c-api/c-api.h"
+
+static int32_t ProgressCallback(const float *samples, int32_t num_samples,
+                                float progress) {
+  fprintf(stderr, "Progress: %.3f%%\n", progress * 100);
+  // return 1 to continue generating
+  // return 0 to stop generating
+  return 1;
+}
+
+int32_t main(int32_t argc, char *argv[]) {
+  SherpaOnnxOfflineTtsConfig config;
+  memset(&config, 0, sizeof(config));
+  config.model.matcha.acoustic_model =
+      "./matcha-icefall-en_US-ljspeech/model-steps-3.onnx";
+
+  config.model.matcha.vocoder = "./hifigan_v2.onnx";
+
+  config.model.matcha.tokens = "./matcha-icefall-en_US-ljspeech/tokens.txt";
+
+  config.model.matcha.data_dir =
+      "./matcha-icefall-en_US-ljspeech/espeak-ng-data";
+
+  config.model.num_threads = 1;
+
+  // If you don't want to see debug messages, please set it to 0
+  config.model.debug = 1;
+
+  const char *filename = "./generated-matcha-en.wav";
+  const char *text =
+      "Today as always, men fall into two groups: slaves and free men. Whoever "
+      "does not have two-thirds of his day for himself, is a slave, whatever "
+      "he may be: a statesman, a businessman, an official, or a scholar. "
+      "Friends fell out often because life was changing so fast. The easiest "
+      "thing in the world was to lose touch with someone.";
+
+  SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config);
+  if (!tts) {
+    // Never hand a NULL handle to the generator below.
+    fprintf(stderr, "Failed to create the offline TTS engine\n");
+    return -1;
+  }
+
+  int32_t sid = 0;
+  float speed = 1.0;  // larger -> faster in speech speed
+
+#if 0
+  // If you don't want to use a callback, then please enable this branch
+  const SherpaOnnxGeneratedAudio *audio =
+      SherpaOnnxOfflineTtsGenerate(tts, text, sid, speed);
+#else
+  const SherpaOnnxGeneratedAudio *audio =
+      SherpaOnnxOfflineTtsGenerateWithProgressCallback(tts, text, sid, speed,
+                                                       ProgressCallback);
+#endif
+
+  if (!audio) {
+    fprintf(stderr, "Failed to generate audio\n");
+    SherpaOnnxDestroyOfflineTts(tts);
+    return -1;
+  }
+
+  // SherpaOnnxWriteWave() is documented to return 1 on success, 0 on failure
+  int32_t ok = SherpaOnnxWriteWave(audio->samples, audio->n, audio->sample_rate,
+                                   filename);
+
+  SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio);
+  SherpaOnnxDestroyOfflineTts(tts);
+
+  fprintf(stderr, "Input text is: %s\n", text);
+  fprintf(stderr, "Speaker ID is: %d\n", sid);
+
+  if (!ok) {
+    fprintf(stderr, "Failed to write: %s\n", filename);
+    return -1;
+  }
+
+  fprintf(stderr, "Saved to: %s\n", filename);
+
+  return 0;
+}
diff --git a/c-api-examples/matcha-tts-zh-c-api.c b/c-api-examples/matcha-tts-zh-c-api.c
new file mode 100644
index 0000000000..c7667f0cb3
--- /dev/null
+++ b/c-api-examples/matcha-tts-zh-c-api.c
@@ -0,0 +1,107 @@
+// c-api-examples/matcha-tts-zh-c-api.c
+//
+// Copyright (c) 2025 Xiaomi Corporation
+
+// This file shows how to use sherpa-onnx C API
+// for Chinese TTS with MatchaTTS.
+//
+// clang-format off
+/*
+Usage
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
+tar xvf matcha-icefall-zh-baker.tar.bz2
+rm matcha-icefall-zh-baker.tar.bz2
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+./matcha-tts-zh-c-api
+
+ */
+// clang-format on
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "sherpa-onnx/c-api/c-api.h"
+
+static int32_t ProgressCallback(const float *samples, int32_t num_samples,
+                                float progress) {
+  fprintf(stderr, "Progress: %.3f%%\n", progress * 100);
+  // return 1 to continue generating
+  // return 0 to stop generating
+  return 1;
+}
+
+int32_t main(int32_t argc, char *argv[]) {
+  SherpaOnnxOfflineTtsConfig config;
+  memset(&config, 0, sizeof(config));
+  config.model.matcha.acoustic_model =
+      "./matcha-icefall-zh-baker/model-steps-3.onnx";
+  config.model.matcha.vocoder = "./hifigan_v2.onnx";
+  config.model.matcha.lexicon = "./matcha-icefall-zh-baker/lexicon.txt";
+  config.model.matcha.tokens = "./matcha-icefall-zh-baker/tokens.txt";
+  config.model.matcha.dict_dir = "./matcha-icefall-zh-baker/dict";
+  config.model.num_threads = 1;
+
+  // If you don't want to see debug messages, please set it to 0
+  config.model.debug = 1;
+
+  // clang-format off
+  config.rule_fsts = "./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst";
+  // clang-format on
+
+  const char *filename = "./generated-matcha-zh.wav";
+  const char *text =
+      "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如"
+      "涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感"
+      "受着生命的奇迹与温柔."
+      "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; "
+      "经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。";
+
+  SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config);
+  if (!tts) {
+    // Never hand a NULL handle to the generator below.
+    fprintf(stderr, "Failed to create the offline TTS engine\n");
+    return -1;
+  }
+
+  int32_t sid = 0;
+  float speed = 1.0;  // larger -> faster in speech speed
+
+#if 0
+  // If you don't want to use a callback, then please enable this branch
+  const SherpaOnnxGeneratedAudio *audio =
+      SherpaOnnxOfflineTtsGenerate(tts, text, sid, speed);
+#else
+  const SherpaOnnxGeneratedAudio *audio =
+      SherpaOnnxOfflineTtsGenerateWithProgressCallback(tts, text, sid, speed,
+                                                       ProgressCallback);
+#endif
+
+  if (!audio) {
+    fprintf(stderr, "Failed to generate audio\n");
+    SherpaOnnxDestroyOfflineTts(tts);
+    return -1;
+  }
+
+  // SherpaOnnxWriteWave() is documented to return 1 on success, 0 on failure
+  int32_t ok = SherpaOnnxWriteWave(audio->samples, audio->n, audio->sample_rate,
+                                   filename);
+
+  SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio);
+  SherpaOnnxDestroyOfflineTts(tts);
+
+  fprintf(stderr, "Input text is: %s\n", text);
+  fprintf(stderr, "Speaker ID is: %d\n", sid);
+
+  if (!ok) {
+    fprintf(stderr, "Failed to write: %s\n", filename);
+    return -1;
+  }
+
+  fprintf(stderr, "Saved to: %s\n", filename);
+
+  return 0;
+}
diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc
index 7033807309..6afc1bf627 100644
--- a/sherpa-onnx/c-api/c-api.cc
+++ b/sherpa-onnx/c-api/c-api.cc
@@ -1058,6 +1058,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig(
     const SherpaOnnxOfflineTtsConfig *config) {
   sherpa_onnx::OfflineTtsConfig tts_config;
 
+  // vits
   tts_config.model.vits.model = SHERPA_ONNX_OR(config->model.vits.model, "");
   tts_config.model.vits.lexicon =
       SHERPA_ONNX_OR(config->model.vits.lexicon, "");
@@ -1073,6 +1074,24 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig(
   tts_config.model.vits.dict_dir =
       SHERPA_ONNX_OR(config->model.vits.dict_dir, "");
 
+  // matcha
+  tts_config.model.matcha.acoustic_model =
+      SHERPA_ONNX_OR(config->model.matcha.acoustic_model, "");
+  tts_config.model.matcha.vocoder =
+      SHERPA_ONNX_OR(config->model.matcha.vocoder, "");
+  tts_config.model.matcha.lexicon =
+      SHERPA_ONNX_OR(config->model.matcha.lexicon, "");
+  tts_config.model.matcha.tokens =
+      SHERPA_ONNX_OR(config->model.matcha.tokens, "");
+  tts_config.model.matcha.data_dir =
+      SHERPA_ONNX_OR(config->model.matcha.data_dir, "");
+  tts_config.model.matcha.noise_scale =
+      SHERPA_ONNX_OR(config->model.matcha.noise_scale, 0.667);
+  tts_config.model.matcha.length_scale =
+      SHERPA_ONNX_OR(config->model.matcha.length_scale, 1.0);
+  tts_config.model.matcha.dict_dir =
+      SHERPA_ONNX_OR(config->model.matcha.dict_dir, "");
+
   tts_config.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1);
   tts_config.model.debug = config->model.debug;
   tts_config.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu");
@@ -1082,7 +1101,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig(
 
   tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, "");
   tts_config.rule_fars = SHERPA_ONNX_OR(config->rule_fars, "");
-  tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 2);
+  tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 1);
 
   if (tts_config.model.debug) {
 #if __OHOS__
diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h
index 167051cd92..e79d951e22 100644
--- a/sherpa-onnx/c-api/c-api.h
+++ b/sherpa-onnx/c-api/c-api.h
@@ -894,15 +894,28 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsVitsModelConfig {
 
   float noise_scale;
   float noise_scale_w;
-  float length_scale;  // < 1, faster in speed; > 1, slower in speed
+  float length_scale;  // < 1, faster in speech speed; > 1, slower in speed
   const char *dict_dir;
 } SherpaOnnxOfflineTtsVitsModelConfig;
 
+SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsMatchaModelConfig {
+  const char *acoustic_model;
+  const char *vocoder;
+  const char *lexicon;
+  const char *tokens;
+  const char *data_dir;
+
+  float noise_scale;
+  float length_scale;  // < 1, faster in speech speed; > 1, slower in speed
+  const char *dict_dir;
+} SherpaOnnxOfflineTtsMatchaModelConfig;
+
 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig {
   SherpaOnnxOfflineTtsVitsModelConfig vits;
   int32_t num_threads;
   int32_t debug;
   const char *provider;
+  SherpaOnnxOfflineTtsMatchaModelConfig matcha;
 } SherpaOnnxOfflineTtsModelConfig;
 
 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig {
diff --git a/sherpa-onnx/csrc/offline-tts.h b/sherpa-onnx/csrc/offline-tts.h
index 8399443c00..884173e7b6 100644
--- a/sherpa-onnx/csrc/offline-tts.h
+++ b/sherpa-onnx/csrc/offline-tts.h
@@ -30,7 +30,7 @@ struct OfflineTtsConfig {
   // Maximum number of sentences that we process at a time.
   // This is to avoid OOM for very long input text.
   // If you set it to -1, then we process all sentences in a single batch.
-  int32_t max_num_sentences = 2;
+  int32_t max_num_sentences = 1;
 
   OfflineTtsConfig() = default;
   OfflineTtsConfig(const OfflineTtsModelConfig &model,