Support specifying the espeak-ng voice for Kokoro TTS models (read from the model's `voice` metadata, defaulting to `en-us`).

k2-fsa · Feb 10, 2025 · ceefa1f
1 parent d5da943
commit ceefa1f
Show file tree

Hide file tree

Showing 4 changed files with 10 additions and 5 deletions.
diff --git a/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc b/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc
@@ -104,7 +104,8 @@ class KokoroMultiLangLexicon::Impl {
     // https://en.cppreference.com/w/cpp/regex
     // https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
     std::string expr =
-        "([;:,.?!'\"…\\(\\)“”])|([\\u4e00-\\u9fff]+)|([\\u0000-\\u007f]+)";
+        "([;:,.?!'\"…\\(\\)“”])|([\\u4e00-\\u9fff]+)|([äöüßÄÖÜ\\u0000-\\u007f]+"
+        ")";
 
     auto ws = ToWideString(text);
     std::wstring wexpr = ToWideString(expr);
@@ -127,7 +128,7 @@ class KokoroMultiLangLexicon::Impl {
         if (debug_) {
           SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
         }
-        ids_vec = ConvertEnglishToTokenIDs(ms);
+        ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
       } else {
         if (debug_) {
           SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str());
@@ -257,7 +258,7 @@ class KokoroMultiLangLexicon::Impl {
   }
 
   std::vector<std::vector<int32_t>> ConvertEnglishToTokenIDs(
-      const std::string &text) const {
+      const std::string &text, const std::string &voice) const {
     std::vector<std::string> words = SplitUtf8(text);
     if (debug_) {
       std::ostringstream os;
@@ -315,7 +316,7 @@ class KokoroMultiLangLexicon::Impl {
 
         piper::eSpeakPhonemeConfig config;
 
-        config.voice = "en-us";
+        config.voice = voice;
 
         std::vector<std::vector<piper::Phoneme>> phonemes;
 

diff --git a/sherpa-onnx/csrc/offline-tts-kokoro-impl.h b/sherpa-onnx/csrc/offline-tts-kokoro-impl.h
@@ -221,7 +221,7 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
     }
 
     std::vector<TokenIDs> token_ids =
-        frontend_->ConvertTextToTokenIds(text, "en-us");
+        frontend_->ConvertTextToTokenIds(text, meta_data.voice);
 
     if (token_ids.empty() ||
         (token_ids.size() == 1 && token_ids[0].tokens.empty())) {

diff --git a/sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h b/sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h
@@ -18,6 +18,8 @@ struct OfflineTtsKokoroModelMetaData {
   int32_t version = 1;
   int32_t has_espeak = 1;
   int32_t max_token_len = 0;
+
+  std::string voice;
 };
 
 }  // namespace sherpa_onnx

diff --git a/sherpa-onnx/csrc/offline-tts-kokoro-model.cc b/sherpa-onnx/csrc/offline-tts-kokoro-model.cc
@@ -138,6 +138,8 @@ class OfflineTtsKokoroModel::Impl {
     SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.version, "version", 1);
     SHERPA_ONNX_READ_META_DATA(meta_data_.num_speakers, "n_speakers");
     SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak");
+    SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(meta_data_.voice, "voice",
+                                                "en-us");
 
     if (config_.debug) {
       std::vector<std::string> speaker_names;