From e66be32046616c3368b0ff327b6431813a17eb8e Mon Sep 17 00:00:00 2001 From: yirongjie Date: Mon, 15 Jul 2024 16:09:24 +0000 Subject: [PATCH 1/3] feat: demo_imagebind_1mod --- CMakeLists.txt | 14 ++++++ examples/demo_imagebind_1mod.cpp | 54 +++++++++++++++++++++ src/Module.hpp | 20 ++++++-- src/Tensor.hpp | 2 + src/models/imagebind/modeling_imagebind.hpp | 12 +++++ 5 files changed, 97 insertions(+), 5 deletions(-) create mode 100644 examples/demo_imagebind_1mod.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index d9e0bdb2..22d2b686 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -384,6 +384,20 @@ else () target_link_libraries(demo_imagebind MLLM_CPU) endif () +add_executable(demo_imagebind_1mod ${PROJECT_SOURCE_DIR}/examples/demo_imagebind_1mod.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC} # ${DIR_SRC_QUANT} + src/tokenizers/Tokenizer.cpp + src/tokenizers/BPE/Bpe.cpp + ${DIR_SRC_PROCESSOE} + ${DIR_THIRDPARTY_AUDIO} + src/processor/PreProcess.cpp +) +if (ARM AND NOT APK) + target_compile_options(demo_imagebind_1mod PRIVATE -fopenmp) + target_link_libraries(demo_imagebind_1mod PUBLIC MLLM_CPU -fopenmp -static-openmp) +else () + target_link_libraries(demo_imagebind_1mod MLLM_CPU) +endif () + add_executable(demo_tinyllama ${PROJECT_SOURCE_DIR}/examples/demo_tinyllama.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC} # ${DIR_SRC_QUANT} src/tokenizers/Tokenizer.cpp src/tokenizers/Tokenizer.hpp diff --git a/examples/demo_imagebind_1mod.cpp b/examples/demo_imagebind_1mod.cpp new file mode 100644 index 00000000..8d81472a --- /dev/null +++ b/examples/demo_imagebind_1mod.cpp @@ -0,0 +1,54 @@ +// +// Created by Rongjie Yi on 24-7-15. +// +#include "cmdline.h" +#include "models/imagebind/modeling_imagebind.hpp" +#include "models/imagebind/processing_imagebind.hpp" + +using namespace mllm; + +int main(int argc, char **argv) { + cmdline::parser cmdParser; + cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/clip_vocab.mllm"); + cmdParser.add("model", 'm', "specify mllm model path", false, "../models/imagebind_huge-q4_k.mllm"); + cmdParser.add("merges", 'f', "specify mllm tokenizer merges.txt path", false, "../vocab/clip_merges.txt"); + cmdParser.add("thread", 't', "num of threads", false, 4); + cmdParser.parse_check(argc, argv); + + string vocab_path = cmdParser.get("vocab"); + string model_path = cmdParser.get("model"); + string merges_path = cmdParser.get("merges"); + CPUBackend::cpu_threads = cmdParser.get("thread"); + + auto processor = ImagebindProcessor(vocab_path, merges_path); + + ImagebindConfig config("huge"); + + // auto input_tensors = processor.process( + // {"a dog.", "A car", "A bird"},config.max_position_embeddings, + // {"../assets/dog_image.jpg", "../assets/car_image.jpg", "../assets/bird_image.jpg"}, config.img_hw, + // {"../assets/dog_audio.wav", "../assets/car_audio.wav", "../assets/bird_audio.wav"}); + + auto input_tensors = processor.process( + {"a dog."},config.max_position_embeddings, + {"../assets/dog_image.jpg"}, config.img_hw, + {"../assets/dog_audio.wav"}); + + std::cout<<"Text| input_shape:["< tmps; @@ -172,17 +178,21 @@ class Module { return modules; } - void profiling() { - printf("\n"); + void profiling(string name = "") { + // printf("\n"); std::cout << "===========================================" << std::endl; - std::cout << " Load time: " << load_time_/1000.0F << " s" << std::endl; - if(prefilling_token_size_){ - std::cout << " Prefilling speed: " << 1000 * prefilling_token_size_ / inference_times_[0] << " tokens/s" << std::endl; + if (name != "") { + std::cout << " " << name << std::endl; + std::cout << "-------------------------------------------" << std::endl; } + std::cout << " Load time: " << load_time_/1000.0F << " s" << std::endl; if(inference_times_.size()>1){ + std::cout << " Prefilling speed: " << 1000 * prefilling_token_size_ / inference_times_[0] << " tokens/s" << std::endl; double sum_decoding_time = std::accumulate(std::begin(inference_times_)+1, std::end(inference_times_), 0.0); double mean_decoding_time = sum_decoding_time / (inference_times_.size()-1); std::cout << " Decoding speed: " << 1000 / mean_decoding_time << " tokens/s" << std::endl; + } else{ + std::cout << " Inference latency: " << inference_times_[0]/1000.0F << " s" << std::endl; } std::cout << "===========================================" << std::endl; } diff --git a/src/Tensor.hpp b/src/Tensor.hpp index b39c8ade..d37546ca 100644 --- a/src/Tensor.hpp +++ b/src/Tensor.hpp @@ -62,12 +62,14 @@ class Tensor { Tensor(Backend *bn) : backend_(bn), host_ptr_(), capacity_(0), dtype_(MLLM_TYPE_F32) { } + /* ~Tensor() { if (host_ptr_ != nullptr && masterTensor() == nullptr && !aggregated_&& gph_.find(name_) == gph_.end()) { backend_->free(host_ptr_); host_ptr_ = nullptr; } } + */ static map gph_; std::map& chls() { return chls_; diff --git a/src/models/imagebind/modeling_imagebind.hpp b/src/models/imagebind/modeling_imagebind.hpp index 883e6349..02750a4e 100644 --- a/src/models/imagebind/modeling_imagebind.hpp +++ b/src/models/imagebind/modeling_imagebind.hpp @@ -80,6 +80,10 @@ class ImagebindVisionModel final : public Module { public: ImagebindVisionModel() = default; + ImagebindVisionModel(const ImagebindConfig &config): + ImagebindVisionModel(config.vision_hidden_dim, config.vision_head_size, config.vision_ffn_hidden, config.head_hidden_dim, + config.patch, config.patch_time, config.img_hw, config.vision_block_num, + config.names_config){}; ImagebindVisionModel(int hidden_dim, int head_size, int ffn_hidden, int head_hidden_dim, int patch, int patch_time, int img_hw, int block_num, const ImagebindNameConfig &names) { @@ -128,6 +132,10 @@ class ImagebindTextModel final : public Module { public: ImagebindTextModel() = default; + ImagebindTextModel(const ImagebindConfig &config): + ImagebindTextModel(config.text_hidden_dim, config.text_head_size, config.text_ffn_hidden, config.head_hidden_dim, + config.vocab_size, config.max_position_embeddings, config.text_block_num, + config.names_config){}; ImagebindTextModel(int hidden_dim, int head_size, int ffn_hidden, int head_hidden_dim, int vocab_size, int max_position_embeddings, int block_num, const ImagebindNameConfig &names) { @@ -186,6 +194,10 @@ class ImagebindAudioModel final : public Module { public: ImagebindAudioModel() = default; + ImagebindAudioModel(const ImagebindConfig &config): + ImagebindAudioModel(config.audio_hidden_dim, config.audio_head_size, config.audio_ffn_hidden, config.head_hidden_dim, + config.audio_kernal, config.audio_stride, config.audio_h, config.audio_w, config.audio_block_num, + config.names_config){}; ImagebindAudioModel(int hidden_dim, int head_size, int ffn_hidden, int head_hidden_dim, int patch, int stride, int img_h, int img_w, int block_num, const ImagebindNameConfig &names) { From e4e4e017e5826d38956355586e5f18f9f621fd1e Mon Sep 17 00:00:00 2001 From: yirongjie Date: Tue, 16 Jul 2024 02:19:54 +0000 Subject: [PATCH 2/3] fix: profiling --- examples/demo_imagebind_1mod.cpp | 16 ++++++++++---- src/Module.hpp | 37 ++++++++++++++++++++++++++------ 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/examples/demo_imagebind_1mod.cpp b/examples/demo_imagebind_1mod.cpp index 8d81472a..38782ff2 100644 --- a/examples/demo_imagebind_1mod.cpp +++ b/examples/demo_imagebind_1mod.cpp @@ -24,6 +24,8 @@ int main(int argc, char **argv) { ImagebindConfig config("huge"); + int loop_times = 10; + // auto input_tensors = processor.process( // {"a dog.", "A car", "A bird"},config.max_position_embeddings, // {"../assets/dog_image.jpg", "../assets/car_image.jpg", "../assets/bird_image.jpg"}, config.img_hw, @@ -37,18 +39,24 @@ int main(int argc, char **argv) { std::cout<<"Text| input_shape:["< inference_times_; + vector> last_shape_bshd_; public: static map backends; @@ -108,28 +110,49 @@ class Module { return Forward(inputs, anyArgs); } if (inputs[0].ttype() == TensorType::INPUT_TENSOR) { - for (auto &input : inputs) { + bool need_setup = true; + for (int i = 0; i < inputs.size(); i++) { + auto &input = inputs[i]; input.setTtype(TensorType::NORMAL_TENSOR); input.status() = TENSOR_STATIC_INIT; if(input.batch() == 0){ Tensor::gph_[input.name()] = input; } + if(input.sequence()!=1 && !last_shape_bshd_.empty()){ + // if LLM/VLLM model, the `need_setup` should be `true` + if(input.batch() == last_shape_bshd_[i][0] & + input.sequence() == last_shape_bshd_[i][1] & + input.head() == last_shape_bshd_[i][2] & + input.dimension() == last_shape_bshd_[i][3]){ + need_setup = false; + } + } } tensor_status = TENSOR_STATIC_INIT; uint64_t time_start = mllm_time_us(); - Forward(inputs, anyArgs); + if(need_setup){ + Forward(inputs, anyArgs); + } for (auto &input : inputs) { input.status() = TENSOR_STATIC_READY; } tensor_status = TENSOR_STATIC_READY; auto output = Forward(inputs, anyArgs); uint64_t time_end = mllm_time_us(); + + double inference_time_ = (time_end - time_start) / 1000.0F;//ms + inference_times_.push_back(inference_time_); if(prefilling_token_size_==0){ prefilling_token_size_ = inputs[0].sequence(); + }else if(decoding_token_size_==0){ + decoding_token_size_ = inputs[0].sequence(); + } + last_shape_bshd_.clear(); + for (auto &input : inputs) { + last_shape_bshd_.push_back({input.batch(), input.sequence(), + input.head(), input.dimension()}); } - double inference_time_ = (time_end - time_start) / 1000.0F;//ms - inference_times_.push_back(inference_time_); return output; } else { @@ -186,13 +209,15 @@ class Module { std::cout << "-------------------------------------------" << std::endl; } std::cout << " Load time: " << load_time_/1000.0F << " s" << std::endl; - if(inference_times_.size()>1){ + if(inference_times_.size()>1 && decoding_token_size_ != prefilling_token_size_){ std::cout << " Prefilling speed: " << 1000 * prefilling_token_size_ / inference_times_[0] << " tokens/s" << std::endl; double sum_decoding_time = std::accumulate(std::begin(inference_times_)+1, std::end(inference_times_), 0.0); double mean_decoding_time = sum_decoding_time / (inference_times_.size()-1); std::cout << " Decoding speed: " << 1000 / mean_decoding_time << " tokens/s" << std::endl; } else{ - std::cout << " Inference latency: " << inference_times_[0]/1000.0F << " s" << std::endl; + double sum_time = std::accumulate(std::begin(inference_times_), std::end(inference_times_), 0.0); + double mean_time = sum_time / (inference_times_.size()); + std::cout << " Inference latency: " << mean_time/1000.0F << " s" << std::endl; } std::cout << "===========================================" << std::endl; } From 4800360626d450d457a1f71f59f90993a26e343d Mon Sep 17 00:00:00 2001 From: yirongjie Date: Tue, 16 Jul 2024 02:49:28 +0000 Subject: [PATCH 3/3] fix: error print if load --- src/Module.hpp | 13 ++++++++----- src/ParamLoader.cpp | 6 ++++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/Module.hpp b/src/Module.hpp index 517aaf07..12c57d59 100644 --- a/src/Module.hpp +++ b/src/Module.hpp @@ -110,6 +110,14 @@ class Module { return Forward(inputs, anyArgs); } if (inputs[0].ttype() == TensorType::INPUT_TENSOR) { + if(prefilling_token_size_==0){ // first time init + if(!Tensor::gph_.empty()){ + Tensor::gph_.clear(); + } + prefilling_token_size_ = inputs[0].sequence(); + }else if(decoding_token_size_==0){ + decoding_token_size_ = inputs[0].sequence(); + } bool need_setup = true; for (int i = 0; i < inputs.size(); i++) { auto &input = inputs[i]; @@ -143,11 +151,6 @@ class Module { double inference_time_ = (time_end - time_start) / 1000.0F;//ms inference_times_.push_back(inference_time_); - if(prefilling_token_size_==0){ - prefilling_token_size_ = inputs[0].sequence(); - }else if(decoding_token_size_==0){ - decoding_token_size_ = inputs[0].sequence(); - } last_shape_bshd_.clear(); for (auto &input : inputs) { last_shape_bshd_.push_back({input.batch(), input.sequence(), diff --git a/src/ParamLoader.cpp b/src/ParamLoader.cpp index d27bc537..f9029d8b 100644 --- a/src/ParamLoader.cpp +++ b/src/ParamLoader.cpp @@ -61,7 +61,7 @@ ParamLoader::ParamLoader(std::string filename, bool use_mmap) : // #endif if (this->fp_ == nullptr) { - std::cout << "param open file failed" << std::endl; + // std::cout << "param open file failed" << std::endl; return; int errorCode = errno; char *errorMsg = strerror(errorCode); @@ -123,7 +123,9 @@ std::tuple ParamLoader::load(string name) { } DataType ParamLoader::getDataType(string name) { if (data_type_.count(name) != 1) { - std::cerr<fp_ != nullptr) { + std::cerr<