diff --git a/sherpa-onnx/c-api/cxx-api.cc b/sherpa-onnx/c-api/cxx-api.cc index df9fca850f..d3c522137b 100644 --- a/sherpa-onnx/c-api/cxx-api.cc +++ b/sherpa-onnx/c-api/cxx-api.cc @@ -270,7 +270,8 @@ OfflineStream OfflineRecognizer::CreateStream() const { return OfflineStream{s}; } -OfflineStream OfflineRecognizer::CreateStream(const std::string &hotwords) const { +OfflineStream OfflineRecognizer::CreateStream( + const std::string &hotwords) const { auto s = SherpaOnnxCreateOfflineStreamWithHotwords(p_, hotwords.c_str()); return OfflineStream{s}; } diff --git a/sherpa-onnx/csrc/CMakeLists.txt b/sherpa-onnx/csrc/CMakeLists.txt index 4976f58fb1..5ee7a50507 100644 --- a/sherpa-onnx/csrc/CMakeLists.txt +++ b/sherpa-onnx/csrc/CMakeLists.txt @@ -549,6 +549,7 @@ if(SHERPA_ONNX_ENABLE_TESTS) context-graph-test.cc packed-sequence-test.cc pad-sequence-test.cc + regex-lang-test.cc slice-test.cc stack-test.cc text-utils-test.cc diff --git a/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc b/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc index 8d69eb8c8e..a26df51a3e 100644 --- a/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc +++ b/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc @@ -4,9 +4,7 @@ #include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h" -#include #include -#include #include // NOLINT #include #include @@ -22,6 +20,8 @@ #include "rawfile/raw_file_manager.h" #endif +#include + #include "cppjieba/Jieba.hpp" #include "espeak-ng/speak_lib.h" #include "phoneme_ids.hpp" @@ -37,20 +37,6 @@ void CallPhonemizeEspeak(const std::string &text, piper::eSpeakPhonemeConfig &config, // NOLINT std::vector> *phonemes); -static std::wstring ToWideString(const std::string &s) { - // see - // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t - std::wstring_convert> converter; - return converter.from_bytes(s); -} - -static std::string ToString(const std::wstring &s) { - // see - // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t - std::wstring_convert> converter; - return converter.to_bytes(s); -} - class KokoroMultiLangLexicon::Impl { public: Impl(const std::string &tokens, const std::string &lexicon, @@ -103,15 +89,19 @@ class KokoroMultiLangLexicon::Impl { // https://en.cppreference.com/w/cpp/regex // https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex - std::string expr = - "([;:,.?!'\"…\\(\\)“”])|([\\u4e00-\\u9fff]+)|([äöüßÄÖÜ\\u0000-\\u007f]+" - ")"; + std::string expr_chinese = "([\\u4e00-\\u9fff]+)"; + std::string expr_not_chinese = "([^\\u4e00-\\u9fff]+)"; + + std::string expr_both = expr_chinese + "|" + expr_not_chinese; auto ws = ToWideString(text); - std::wstring wexpr = ToWideString(expr); - std::wregex we(wexpr); + std::wstring wexpr_both = ToWideString(expr_both); + std::wregex we_both(wexpr_both); + + std::wstring wexpr_zh = ToWideString(expr_chinese); + std::wregex we_zh(wexpr_zh); - auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we); + auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we_both); auto end = std::wsregex_iterator(); std::vector ans; @@ -119,21 +109,22 @@ class KokoroMultiLangLexicon::Impl { for (std::wsregex_iterator i = begin; i != end; ++i) { std::wsmatch match = *i; std::wstring match_str = match.str(); + auto ms = ToString(match_str); uint8_t c = reinterpret_cast(ms.data())[0]; std::vector> ids_vec; - - if (c < 0x80) { + if (std::regex_match(match_str, we_zh)) { if (debug_) { - SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str()); + SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str()); } - ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice); + ids_vec = ConvertChineseToTokenIDs(ms); } else { if (debug_) { - SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str()); + SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str()); } - ids_vec = ConvertChineseToTokenIDs(ms); + + ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice); } for (const auto &ids : ids_vec) { @@ -315,9 +306,10 @@ class KokoroMultiLangLexicon::Impl { this_sentence.push_back(space_id); } else { if (debug_) { - SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'", word.c_str()); + SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'", + word.c_str()); } - + piper::eSpeakPhonemeConfig config; config.voice = voice; diff --git a/sherpa-onnx/csrc/regex-lang-test.cc b/sherpa-onnx/csrc/regex-lang-test.cc new file mode 100644 index 0000000000..11df85cc18 --- /dev/null +++ b/sherpa-onnx/csrc/regex-lang-test.cc @@ -0,0 +1,86 @@ +// sherpa-onnx/csrc/regex-lang-test.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include // NOLINT + +#include "gtest/gtest.h" +#include "sherpa-onnx/csrc/text-utils.cc" + +namespace sherpa_onnx { + +static void TestLang(const std::string &expr, const std::string &text, + const std::vector &expected) { + auto ws = ToWideString(text); + std::wstring wexpr = ToWideString(expr); + std::wregex we(wexpr); + + auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we); + auto end = std::wsregex_iterator(); + int32_t k = 0; + for (std::wsregex_iterator i = begin; i != end; ++i) { + std::wsmatch match = *i; + std::wstring match_str = match.str(); + auto ms = ToString(match_str); + std::cout << ms << "\n"; + EXPECT_EQ(ms, expected[k]); + k++; + } + EXPECT_EQ(k, expected.size()); +} + +TEST(German, Case1) { + std::cout << "----------Test German----------"; + // see https://character-table.netlify.app/german/ + std::string expr = + "([\\u0020-\\u005f\\u0061-" + "\\u007d\\u00a0\\u00a7\\u00a9\\u00ab\\u00bb\\u00c4\\u00d6\\u00dc\\u00df\\" + "u00e4\\u00f6\\u00fc\\u2010-\\u2011\\u2013-" + "\\u2014\\u2018\\u201a\\u201c\\u201e\\u2026\\u2030\\u20ac]+)"; + + std::string text = + "开始Übeltäter übergibt Ärzten 中间öfters äußerst ätzende Öle结束3€"; + + std::vector expected = {"Übeltäter übergibt Ärzten ", + "öfters äußerst ätzende Öle", "3€"}; + + TestLang(expr, text, expected); +} + +TEST(French, Case1) { + std::string expr = + "([\\u0020-\\u005f\\u0061-" + "\\u007a\\u007c\\u00a0\\u00a7\\u00a9\\u00ab\\u00b2-" + "\\u00b3\\u00bb\\u00c0\\u00c2\\u00c6-\\u00cb\\u00ce-" + "\\u00cf\\u00d4\\u00d9\\u00db-\\u00dc\\u00e0\\u00e2\\u00e6-" + "\\u00eb\\u00ee-\\u00ef\\u00f4\\u00f9\\u00fb-\\u00fc\\u00ff\\u0152-" + "\\u0153\\u0178\\u02b3\\u02e2\\u1d48-\\u1d49\\u2010-\\u2011\\u2013-" + "\\u2014\\u2019\\u201c-\\u201d\\u2020-\\u2021\\u2026\\u202f-" + "\\u2030\\u20ac\\u2212]+)"; + std::string text = + "L'été, 一avec son ciel bleuâtre, 二est un moment où, 三Noël, maçon"; + std::vector expected = { + "L'été, ", + "avec son ciel bleuâtre, ", + "est un moment où, ", + "Noël, maçon", + }; + TestLang(expr, text, expected); +} + +TEST(English, Case1) { + // https://character-table.netlify.app/english/ + std::string expr = + "([\\u0020-\\u005f\\u0061-\\u007a\\u007c\\u00a0\\u00a7\\u00a9\\u2010-" + "\\u2011\\u2013-\\u2014\\u2018-\\u2019\\u201c-\\u201d\\u2020-" + "\\u2021\\u2026\\u2030\\u2032-\\u2033\\u20ac]+)"; + std::string text = "一how are you doing? 二Thank you!"; + + std::vector expected = { + "how are you doing? ", + "Thank you!", + }; + TestLang(expr, text, expected); +} + +} // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/text-utils-test.cc b/sherpa-onnx/csrc/text-utils-test.cc index 15558f166c..e9cee573f3 100644 --- a/sherpa-onnx/csrc/text-utils-test.cc +++ b/sherpa-onnx/csrc/text-utils-test.cc @@ -8,6 +8,14 @@ namespace sherpa_onnx { +TEST(ToLowerCase, WideString) { + std::string text = + "Hallo! Übeltäter übergibt Ärzten öfters äußerst ätzende Öle 3€"; + auto t = ToLowerCase(text); + std::cout << text << "\n"; + std::cout << t << "\n"; +} + TEST(RemoveInvalidUtf8Sequences, Case1) { std::vector v = { 0xe4, 0xbb, 0x8a, // 今 diff --git a/sherpa-onnx/csrc/text-utils.cc b/sherpa-onnx/csrc/text-utils.cc index d0a64a8ce6..e14c344cd6 100644 --- a/sherpa-onnx/csrc/text-utils.cc +++ b/sherpa-onnx/csrc/text-utils.cc @@ -8,8 +8,11 @@ #include #include #include +#include #include +#include #include +#include #include #include #include @@ -389,10 +392,7 @@ std::vector SplitUtf8(const std::string &text) { } std::string ToLowerCase(const std::string &s) { - std::string ans(s.size(), 0); - std::transform(s.begin(), s.end(), ans.begin(), - [](unsigned char c) { return std::tolower(c); }); - return ans; + return ToString(ToLowerCase(ToWideString(s))); } void ToLowerCase(std::string *in_out) { @@ -400,6 +400,66 @@ void ToLowerCase(std::string *in_out) { [](unsigned char c) { return std::tolower(c); }); } +std::wstring ToLowerCase(const std::wstring &s) { + std::wstring ans(s.size(), 0); + std::transform(s.begin(), s.end(), ans.begin(), [](wchar_t c) -> wchar_t { + switch (c) { + // French + case L'À': + return L'à'; + case L'Â': + return L'â'; + case L'Æ': + return L'æ'; + case L'Ç': + return L'ç'; + case L'È': + return L'è'; + case L'É': + return L'é'; + case L'Ë': + return L'ë'; + case L'Î': + return L'î'; + case L'Ï': + return L'ï'; + case L'Ô': + return L'ô'; + case L'Ù': + return L'ù'; + case L'Û': + return L'û'; + case L'Ü': + return L'ü'; + + // others + case L'Á': + return L'á'; + case L'Í': + return L'í'; + case L'Ó': + return L'ó'; + case L'Ú': + return L'ú'; + case L'Ñ': + return L'ñ'; + case L'Ì': + return L'ì'; + case L'Ò': + return L'ò'; + case L'Ä': + return L'ä'; + case L'Ö': + return L'ö'; + // TODO(fangjun): Add more + + default: + return std::towlower(c); + } + }); + return ans; +} + static inline bool InRange(uint8_t x, uint8_t low, uint8_t high) { return low <= x && x <= high; } @@ -625,4 +685,18 @@ std::string Gb2312ToUtf8(const std::string &text) { } #endif +std::wstring ToWideString(const std::string &s) { + // see + // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t + std::wstring_convert> converter; + return converter.from_bytes(s); +} + +std::string ToString(const std::wstring &s) { + // see + // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t + std::wstring_convert> converter; + return converter.to_bytes(s); +} + } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/text-utils.h b/sherpa-onnx/csrc/text-utils.h index f0ecdb8e6c..3daf22bd76 100644 --- a/sherpa-onnx/csrc/text-utils.h +++ b/sherpa-onnx/csrc/text-utils.h @@ -124,6 +124,8 @@ std::vector SplitUtf8(const std::string &text); std::string ToLowerCase(const std::string &s); void ToLowerCase(std::string *in_out); +std::wstring ToLowerCase(const std::wstring &s); + std::string RemoveInvalidUtf8Sequences(const std::string &text, bool show_debug_msg = false); @@ -139,6 +141,10 @@ bool IsGB2312(const std::string &text); std::string Gb2312ToUtf8(const std::string &text); #endif +std::wstring ToWideString(const std::string &s); + +std::string ToString(const std::wstring &s); + } // namespace sherpa_onnx #endif // SHERPA_ONNX_CSRC_TEXT_UTILS_H_