From 55702897350232ba69031c7773f3c56196f1b32a Mon Sep 17 00:00:00 2001
From: Ovler
Date: Tue, 13 Jun 2023 02:18:32 +0800
Subject: [PATCH] buffer added to avoid split characters

inspired by https://github.com/ggerganov/whisper.cpp/issues/399#issuecomment-1508222875

Review notes on this revision of the patch:
 * bytes that can never complete a character no longer block all later output
 * the duplicated token-printing code lives in one helper per file
 * strlen() is hoisted out of the loop conditions
---
 Examples/WhisperDesktop/Utils/logger.cpp | 69 +++++++++++++++++++++++++--
 Examples/main/main.cpp                   | 65 ++++++++++++++++++++++++-
 2 files changed, 129 insertions(+), 5 deletions(-)

diff --git a/Examples/WhisperDesktop/Utils/logger.cpp b/Examples/WhisperDesktop/Utils/logger.cpp
index 712835b..7122113 100644
--- a/Examples/WhisperDesktop/Utils/logger.cpp
+++ b/Examples/WhisperDesktop/Utils/logger.cpp
@@ -1,6 +1,7 @@
 #include "stdafx.h"
 #include "logger.h"
 #include "miscUtils.h"
+#include <string>
 
 namespace
 {
@@ -37,6 +38,69 @@ void printTime( CStringA& rdi, Whisper::sTimeSpan time, bool comma )
 		fields.ticks / 10'000 );
 }
 
+// Returns true when the first len bytes of str are well-formed UTF-8: every lead byte is
+// followed by the number of 10bbbbbb continuation bytes it announces, and no sequence encodes
+// a UTF-16 surrogate (U+D800 to U+DFFF). Overlong forms and lead bytes 0xF5 to 0xF7 are accepted.
+// Based on https://gist.github.com/ichramm/3ffeaf7ba4f24853e9ecaf176da84566
+static bool utf8_check_is_valid( const char* str, int len )
+{
+	for( int i = 0; i < len; ++i )
+	{
+		const unsigned char c = static_cast<unsigned char>( str[ i ] );
+		int n; // count of continuation bytes that must follow c
+		if( c <= 0x7f )
+			n = 0; // 0bbbbbbb
+		else if( ( c & 0xE0 ) == 0xC0 )
+			n = 1; // 110bbbbb
+		else if( c == 0xED && i < ( len - 1 ) && ( static_cast<unsigned char>( str[ i + 1 ] ) & 0xA0 ) == 0xA0 )
+			return false; // U+D800 to U+DFFF
+		else if( ( c & 0xF0 ) == 0xE0 )
+			n = 2; // 1110bbbb
+		else if( ( c & 0xF8 ) == 0xF0 )
+			n = 3; // 11110bbb
+		else
+			return false; // stray continuation byte, or a byte of 0xF8 and above
+
+		for( int j = 0; j < n; ++j )
+		{
+			// Every continuation byte has to be present and match 10bbbbbb
+			if( ++i == len || ( static_cast<unsigned char>( str[ i ] ) & 0xC0 ) != 0x80 )
+				return false;
+		}
+	}
+	return true;
+}
+
+// Appends the text of one token to str, wrapped in the escape codes of its colour.
+// A token can end in the middle of a multi-byte UTF-8 character. Such bytes wait in pending
+// until the character is complete, so that an escape code never lands inside a character.
+static void appendColoredToken( CStringA& str, std::string& pending, const sToken& tok )
+{
+	const size_t length = strlen( tok.text );
+	// Whole-token fast path. Skipped while bytes are pending, to keep the output in order.
+	if( pending.empty() && utf8_check_is_valid( tok.text, static_cast<int>( length ) ) )
+	{
+		str += k_colors[ colorIndex( tok ) ];
+		str += tok.text;
+		str += "\033[0m";
+		return;
+	}
+
+	for( size_t k = 0; k < length; k++ )
+	{
+		pending.push_back( tok.text[ k ] );
+		// No UTF-8 character is longer than 4 bytes. When 4 pending bytes do not validate they
+		// never will, so they are emitted as they are instead of blocking all the text after them.
+		const bool valid = utf8_check_is_valid( pending.c_str(), static_cast<int>( pending.size() ) );
+		if( !valid && pending.size() < 4 )
+			continue;
+		str += k_colors[ colorIndex( tok ) ];
+		str += pending.c_str();
+		str += "\033[0m";
+		pending.clear();
+	}
+}
+
 HRESULT logNewSegments( const iTranscribeResult* results, size_t newSegments, bool printSpecial )
 {
 	sTranscribeLength length;
@@ -49,6 +113,7 @@ HRESULT logNewSegments( const iTranscribeResult* results, size_t newSegments, bo
 	const sToken* const tokens = results->getTokens();
 
 	CStringA str;
+	std::string buffer;
 	for( ; i < len; i++ )
 	{
 		const sSegment& seg = segments[ i ];
@@ -62,10 +127,8 @@ HRESULT logNewSegments( const iTranscribeResult* results, size_t newSegments, bo
 		{
 			const sToken& tok = tokens[ seg.firstToken + j ];
 			if( !printSpecial && ( tok.flags & eTokenFlags::Special ) )
 				continue;
-			str += k_colors[ colorIndex( tok ) ];
-			str += tok.text;
-			str += "\033[0m";
+			appendColoredToken( str, buffer, tok );
 		}
 		logInfo( u8"%s", cstr( str ) );
 	}
diff --git a/Examples/main/main.cpp b/Examples/main/main.cpp
index 0919131..6d378d8 100644
--- a/Examples/main/main.cpp
+++ b/Examples/main/main.cpp
@@ -52,6 +52,65 @@ namespace
 		return col;
 	}
 
+	// Returns true when the first len bytes of str are well-formed UTF-8: every lead byte is
+	// followed by the number of 10bbbbbb continuation bytes it announces, and no sequence encodes
+	// a UTF-16 surrogate (U+D800 to U+DFFF). Overlong forms and lead bytes 0xF5 to 0xF7 are accepted.
+	// Based on https://gist.github.com/ichramm/3ffeaf7ba4f24853e9ecaf176da84566
+	static bool utf8_check_is_valid( const char* str, int len )
+	{
+		for( int i = 0; i < len; ++i )
+		{
+			const unsigned char c = static_cast<unsigned char>( str[ i ] );
+			int n; // count of continuation bytes that must follow c
+			if( c <= 0x7f )
+				n = 0; // 0bbbbbbb
+			else if( ( c & 0xE0 ) == 0xC0 )
+				n = 1; // 110bbbbb
+			else if( c == 0xED && i < ( len - 1 ) && ( static_cast<unsigned char>( str[ i + 1 ] ) & 0xA0 ) == 0xA0 )
+				return false; // U+D800 to U+DFFF
+			else if( ( c & 0xF0 ) == 0xE0 )
+				n = 2; // 1110bbbb
+			else if( ( c & 0xF8 ) == 0xF0 )
+				n = 3; // 11110bbb
+			else
+				return false; // stray continuation byte, or a byte of 0xF8 and above
+
+			for( int j = 0; j < n; ++j )
+			{
+				// Every continuation byte has to be present and match 10bbbbbb
+				if( ++i == len || ( static_cast<unsigned char>( str[ i ] ) & 0xC0 ) != 0x80 )
+					return false;
+			}
+		}
+		return true;
+	}
+
+	// Prints the text of one token to stdout, wrapped in the escape codes of its colour.
+	// A token can end in the middle of a multi-byte UTF-8 character. Such bytes wait in pending
+	// until the character is complete, so that a character is converted and printed in one piece.
+	static void printColoredToken( std::string& pending, const sToken& tok )
+	{
+		const size_t length = strlen( tok.text );
+		// Whole-token fast path. Skipped while bytes are pending, to keep the output in order.
+		if( pending.empty() && utf8_check_is_valid( tok.text, static_cast<int>( length ) ) )
+		{
+			wprintf( L"%S%s%S", k_colors[ colorIndex( tok ) ], utf16( tok.text ).c_str(), "\033[0m" );
+			return;
+		}
+
+		for( size_t k = 0; k < length; k++ )
+		{
+			pending.push_back( tok.text[ k ] );
+			// No UTF-8 character is longer than 4 bytes. When 4 pending bytes do not validate they
+			// never will, so they are printed as they are instead of blocking all the text after them.
+			const bool valid = utf8_check_is_valid( pending.c_str(), static_cast<int>( pending.size() ) );
+			if( !valid && pending.size() < 4 )
+				continue;
+			wprintf( L"%S%s%S", k_colors[ colorIndex( tok ) ], utf16( pending ).c_str(), "\033[0m" );
+			pending.clear();
+		}
+	}
+
 	HRESULT __cdecl newSegmentCallback( iContext* context, uint32_t n_new, void* user_data ) noexcept
 	{
 		ComLight::CComPtr results;
@@ -78,12 +137,13 @@ namespace
 			{
 				if( params.print_colors )
 				{
+					std::string buffer;
 					for( uint32_t j = 0; j < seg.countTokens; j++ )
 					{
 						const sToken& tok = tokens[ seg.firstToken + j ];
 						if( !params.print_special && ( tok.flags & eTokenFlags::Special ) )
 							continue;
-						wprintf( L"%S%s%S", k_colors[ colorIndex( tok ) ], utf16( tok.text ).c_str(), "\033[0m" );
+						printColoredToken( buffer, tok );
 					}
 				}
 				else
@@ -123,12 +183,13 @@ namespace
 					to_timestamp( seg.time.end ).c_str(),
 					speaker.c_str() );
 
+				std::string buffer;
 				for( uint32_t j = 0; j < seg.countTokens; j++ )
 				{
 					const sToken& tok = tokens[ seg.firstToken + j ];
 					if( !params.print_special && ( tok.flags & eTokenFlags::Special ) )
 						continue;
-					wprintf( L"%S%s%S", k_colors[ colorIndex( tok ) ], utf16( tok.text ).c_str(), "\033[0m" );
+					printColoredToken( buffer, tok );
 				}
 				printf( "\n" );
 			}