Skip to content

Commit

Permalink
Separate only Chinese and non-Chinese words for now
Browse files: browse the repository at this point in the history
  • Loading branch information
csukuangfj committed Feb 13, 2025
1 parent 24736c3 commit 57b5ffa
Showing 1 changed file with 20 additions and 33 deletions.
53 changes: 20 additions & 33 deletions sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc
Original file line number Diff line number Diff line change
Expand Up @@ -89,56 +89,43 @@ class KokoroMultiLangLexicon::Impl {

// https://en.cppreference.com/w/cpp/regex
// https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
std::string expr_french =
"([\\u0020-\\u005f\\u0061-"
"\\u007a\\u007c\\u00a0\\u00a7\\u00a9\\u00ab\\u00b2-"
"\\u00b3\\u00bb\\u00c0\\u00c2\\u00c6-\\u00cb\\u00ce-"
"\\u00cf\\u00d4\\u00d9\\u00db-\\u00dc\\u00e0\\u00e2\\u00e6-"
"\\u00eb\\u00ee-\\u00ef\\u00f4\\u00f9\\u00fb-\\u00fc\\u00ff\\u0152-"
"\\u0153\\u0178\\u02b3\\u02e2\\u1d48-\\u1d49\\u2010-\\u2011\\u2013-"
"\\u2014\\u2019\\u201c-\\u201d\\u2020-\\u2021\\u2026\\u202f-"
"\\u2030\\u20ac\\u2212]+)";

std::string expr_german =
"([\\u0020-\\u005f\\u0061-"
"\\u007d\\u00a0\\u00a7\\u00a9\\u00ab\\u00bb\\u00c4\\u00d6\\u00dc\\u00df"
"\\"
"u00e4\\u00f6\\u00fc\\u2010-\\u2011\\u2013-"
"\\u2014\\u2018\\u201a\\u201c\\u201e\\u2026\\u2030\\u20ac]+)";

std::string expr_english =
"([\\u0020-\\u005f\\u0061-\\u007a\\u007c\\u00a0\\u00a7\\u00a9\\u2010-"
"\\u2011\\u2013-\\u2014\\u2018-\\u2019\\u201c-\\u201d\\u2020-"
"\\u2021\\u2026\\u2030\\u2032-\\u2033\\u20ac]+)";

std::string expr_chinese = "([\\u4e00-\\u9fff]+)";
std::string expr_not_chinese = "([^\\u4e00-\\u9fff]+)";

// std::string expr = expr_english + "|" + expr_german + "|" + expr_french;
std::string expr = expr_french;

// std::string expr =
// "([;:,.?!'\"…\\(\\)“”])|([\\u4e00-\\u9fff]+)|([äöüßÄÖÜ\\u0000-\\u007f]+"
// ")";
std::string expr_both = expr_chinese + "|" + expr_not_chinese;

auto ws = ToWideString(text);
std::wstring wexpr = ToWideString(expr);
std::wregex we(wexpr);
std::wstring wexpr_both = ToWideString(expr_both);
std::wregex we_both(wexpr_both);

std::wstring wexpr_zh = ToWideString(expr_chinese);
std::wregex we_zh(wexpr_zh);

auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we);
auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we_both);
auto end = std::wsregex_iterator();

std::vector<TokenIDs> ans;

for (std::wsregex_iterator i = begin; i != end; ++i) {
std::wsmatch match = *i;
std::wstring match_str = match.str();

auto ms = ToString(match_str);
uint8_t c = reinterpret_cast<const uint8_t *>(ms.data())[0];

std::vector<std::vector<int32_t>> ids_vec;
if (std::regex_match(match_str, we_zh)) {
if (debug_) {
SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str());
}
ids_vec = ConvertChineseToTokenIDs(ms);
} else {
if (debug_) {
SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
}

SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
}

for (const auto &ids : ids_vec) {
if (ids.size() > 4) {
Expand Down

0 comments on commit 57b5ffa

Please sign in to comment.