diff --git a/runtime/docs/SDK_tutorial_online.md b/runtime/docs/SDK_tutorial_online.md index 4683761da..c8e78be67 100644 --- a/runtime/docs/SDK_tutorial_online.md +++ b/runtime/docs/SDK_tutorial_online.md @@ -59,7 +59,7 @@ For more client version support, please refer to the [websocket_protocol](./webs If you want to run the client directly for testing, you can refer to the following simple instructions, using the Python version as an example: ```shell -python3 funasr_wss_client.py --host "127.0.0.1" --port 10095 --mode offline --audio_in "../audio/asr_example.wav" +python3 funasr_wss_client.py --host "127.0.0.1" --port 10095 --mode offline --audio_in "../audio/asr_example.pcm" ``` Command parameter instructions: @@ -79,7 +79,7 @@ Command parameter instructions: After entering the samples/cpp directory, you can test it with CPP. The command is as follows: ```shell -./funasr-wss-client-2pass --server-ip 127.0.0.1 --port 10095 --wav-path ../audio/asr_example.wav +./funasr-wss-client-2pass --server-ip 127.0.0.1 --port 10095 --wav-path ../audio/asr_example.pcm ``` Command parameter description: diff --git a/runtime/docs/SDK_tutorial_online_zh.md b/runtime/docs/SDK_tutorial_online_zh.md index e6705de95..ab48ec760 100644 --- a/runtime/docs/SDK_tutorial_online_zh.md +++ b/runtime/docs/SDK_tutorial_online_zh.md @@ -84,7 +84,7 @@ python3 funasr_wss_client.py --host "127.0.0.1" --port 10095 --mode 2pass 进入samples/cpp目录后,可以用cpp进行测试,指令如下: ```shell ./funasr-wss-client-2pass --server-ip 127.0.0.1 --port 10095 --mode 2pass \ - --wav-path ../audio/asr_example.wav + --wav-path ../audio/asr_example.pcm ``` 命令参数说明: diff --git a/runtime/onnxruntime/include/audio.h b/runtime/onnxruntime/include/audio.h index 5194aa226..98f21693b 100644 --- a/runtime/onnxruntime/include/audio.h +++ b/runtime/onnxruntime/include/audio.h @@ -76,6 +76,7 @@ class Audio { int Fetch(float *&dout, int &len, int &flag, float &start_time); void Padding(); void Split(OfflineStream* offline_streamj); + void CutSplit(OfflineStream* offline_streamj); void Split(VadModel* vad_obj, vector>& vad_segments, bool input_finished=true); void Split(VadModel* vad_obj, int chunk_len, bool input_finished=true, ASR_TYPE asr_mode=ASR_TWO_PASS); float GetTimeLen(); diff --git a/runtime/onnxruntime/src/audio.cpp b/runtime/onnxruntime/src/audio.cpp index 6f829ccc9..9b93dc83b 100644 --- a/runtime/onnxruntime/src/audio.cpp +++ b/runtime/onnxruntime/src/audio.cpp @@ -1085,6 +1085,60 @@ void Audio::Split(OfflineStream* offline_stream) } } +void Audio::CutSplit(OfflineStream* offline_stream) +{ + std::unique_ptr vad_online_handle = make_unique((FsmnVad*)(offline_stream->vad_handle).get()); + AudioFrame *frame; + + frame = frame_queue.front(); + frame_queue.pop(); + int sp_len = frame->GetLen(); + delete frame; + frame = nullptr; + + int step = dest_sample_rate*10; + bool is_final=false; + vector> vad_segments; + for (int sample_offset = 0; sample_offset < speech_len; sample_offset += std::min(step, speech_len - sample_offset)) { + if (sample_offset + step >= speech_len - 1) { + step = speech_len - sample_offset; + is_final = true; + } else { + is_final = false; + } + std::vector pcm_data(speech_data+sample_offset, speech_data+sample_offset+step); + vector> cut_segments = vad_online_handle->Infer(pcm_data, is_final); + vad_segments.insert(vad_segments.end(), cut_segments.begin(), cut_segments.end()); + } + + int speech_start_i = -1, speech_end_i =-1; + for(vector vad_segment:vad_segments) + { + if(vad_segment.size() != 2){ + LOG(ERROR) << "Size of vad_segment is not 2."; + break; + } + if(vad_segment[0] != -1){ + speech_start_i = vad_segment[0]; + } + if(vad_segment[1] != -1){ + speech_end_i = vad_segment[1]; + } + + if(speech_start_i!=-1 && speech_end_i!=-1){ + frame = new AudioFrame(); + int start = speech_start_i*seg_sample; + int end = speech_end_i*seg_sample; + frame->SetStart(start); + frame->SetEnd(end); + frame_queue.push(frame); + frame = nullptr; + speech_start_i=-1; + speech_end_i=-1; + } + } +} + void Audio::Split(VadModel* vad_obj, vector>& vad_segments, bool input_finished) { AudioFrame *frame; diff --git a/runtime/onnxruntime/src/funasrruntime.cpp b/runtime/onnxruntime/src/funasrruntime.cpp index 0ca4dedb0..68a9f099a 100644 --- a/runtime/onnxruntime/src/funasrruntime.cpp +++ b/runtime/onnxruntime/src/funasrruntime.cpp @@ -245,7 +245,7 @@ return p_result; } if(offline_stream->UseVad()){ - audio.Split(offline_stream); + audio.CutSplit(offline_stream); } float* buff; diff --git a/runtime/onnxruntime/src/util.cpp b/runtime/onnxruntime/src/util.cpp index 039fa90be..a12570b86 100644 --- a/runtime/onnxruntime/src/util.cpp +++ b/runtime/onnxruntime/src/util.cpp @@ -590,8 +590,8 @@ std::string TimestampSentence(std::string &text, std::string &str_time){ // format ts_sent += "{\"text_seg\":\"" + text_seg + "\","; ts_sent += "\"punc\":\"" + characters[idx_str] + "\","; - ts_sent += "\"start\":\"" + to_string(start) + "\","; - ts_sent += "\"end\":\"" + to_string(end) + "\","; + ts_sent += "\"start\":" + to_string(start) + ","; + ts_sent += "\"end\":" + to_string(end) + ","; ts_sent += "\"ts_list\":" + VectorToString(ts_seg, false) + "}"; if (idx_str == characters.size()-1){ @@ -627,8 +627,8 @@ std::string TimestampSentence(std::string &text, std::string &str_time){ // format ts_sent += "{\"text_seg\":\"" + text_seg + "\","; ts_sent += "\"punc\":\"\","; - ts_sent += "\"start\":\"" + to_string(start) + "\","; - ts_sent += "\"end\":\"" + to_string(end) + "\","; + ts_sent += "\"start\":" + to_string(start) + ","; + ts_sent += "\"end\":" + to_string(end) + ","; ts_sent += "\"ts_list\":" + VectorToString(ts_seg, false) + "}"; ts_sentences += ts_sent; } diff --git a/runtime/websocket/CMakeLists.txt b/runtime/websocket/CMakeLists.txt index 13472da14..ba6497a2e 100644 --- a/runtime/websocket/CMakeLists.txt +++ b/runtime/websocket/CMakeLists.txt @@ -31,7 +31,7 @@ if(ENABLE_WEBSOCKET) # cmake_policy(SET CMP0135 NEW) include(FetchContent) - if(NOT EXISTS ${PROJECT_SOURCE_DIR}/third_party/websocket ) + if(NOT EXISTS ${PROJECT_SOURCE_DIR}/third_party/websocket/websocketpp ) FetchContent_Declare(websocketpp GIT_REPOSITORY https://github.com/zaphoyd/websocketpp.git GIT_TAG 0.8.2 @@ -42,7 +42,7 @@ if(ENABLE_WEBSOCKET) endif() include_directories(${PROJECT_SOURCE_DIR}/third_party/websocket) - if(NOT EXISTS ${PROJECT_SOURCE_DIR}/third_party/asio ) + if(NOT EXISTS ${PROJECT_SOURCE_DIR}/third_party/asio/asio ) FetchContent_Declare(asio URL https://github.com/chriskohlhoff/asio/archive/refs/tags/asio-1-24-0.tar.gz SOURCE_DIR ${PROJECT_SOURCE_DIR}/third_party/asio @@ -52,7 +52,7 @@ if(ENABLE_WEBSOCKET) endif() include_directories(${PROJECT_SOURCE_DIR}/third_party/asio/asio/include) - if(NOT EXISTS ${PROJECT_SOURCE_DIR}/third_party/json ) + if(NOT EXISTS ${PROJECT_SOURCE_DIR}/third_party/json/ChangeLog.md ) FetchContent_Declare(json URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.2.tar.gz SOURCE_DIR ${PROJECT_SOURCE_DIR}/third_party/json diff --git a/runtime/websocket/bin/funasr-wss-client-2pass.cpp b/runtime/websocket/bin/funasr-wss-client-2pass.cpp index 0cbd10e23..6533dd556 100644 --- a/runtime/websocket/bin/funasr-wss-client-2pass.cpp +++ b/runtime/websocket/bin/funasr-wss-client-2pass.cpp @@ -192,10 +192,7 @@ class WebsocketClient { funasr::Audio audio(1); int32_t sampling_rate = audio_fs; std::string wav_format = "pcm"; - if (funasr::IsTargetFile(wav_path.c_str(), "wav")) { - if (!audio.LoadWav(wav_path.c_str(), &sampling_rate, false)) - return; - } else if (funasr::IsTargetFile(wav_path.c_str(), "pcm")) { + if (funasr::IsTargetFile(wav_path.c_str(), "pcm")) { if (!audio.LoadPcmwav(wav_path.c_str(), &sampling_rate, false)) return; } else { wav_format = "others"; diff --git a/runtime/websocket/bin/funasr-wss-client.cpp b/runtime/websocket/bin/funasr-wss-client.cpp index 1dc9e3e33..7af3fbbda 100644 --- a/runtime/websocket/bin/funasr-wss-client.cpp +++ b/runtime/websocket/bin/funasr-wss-client.cpp @@ -193,10 +193,7 @@ class WebsocketClient { funasr::Audio audio(1); int32_t sampling_rate = audio_fs; std::string wav_format = "pcm"; - if (funasr::IsTargetFile(wav_path.c_str(), "wav")) { - if (!audio.LoadWav(wav_path.c_str(), &sampling_rate, false)) - return; - } else if(funasr::IsTargetFile(wav_path.c_str(), "pcm")){ + if(funasr::IsTargetFile(wav_path.c_str(), "pcm")){ if (!audio.LoadPcmwav(wav_path.c_str(), &sampling_rate, false)) return ; }else{ diff --git a/runtime/websocket/bin/websocket-server-2pass.cpp b/runtime/websocket/bin/websocket-server-2pass.cpp index 954ffae9d..8c8cab419 100644 --- a/runtime/websocket/bin/websocket-server-2pass.cpp +++ b/runtime/websocket/bin/websocket-server-2pass.cpp @@ -211,7 +211,7 @@ void WebSocketServer::do_decoder( if(wav_format != "pcm" && wav_format != "PCM"){ websocketpp::lib::error_code ec; nlohmann::json jsonresult; - jsonresult["text"] = "ERROR. Real-time transcription service ONLY SUPPORT wav_format pcm."; + jsonresult["text"] = "ERROR. Real-time transcription service ONLY SUPPORT PCM stream."; jsonresult["wav_name"] = wav_name; jsonresult["is_final"] = true; if (is_ssl) {