From e66be32046616c3368b0ff327b6431813a17eb8e Mon Sep 17 00:00:00 2001
From: yirongjie <yirj0809@gmail.com>
Date: Mon, 15 Jul 2024 16:09:24 +0000
Subject: [PATCH 1/3] feat: demo_imagebind_1mod

---
 CMakeLists.txt                              | 14 ++++++
 examples/demo_imagebind_1mod.cpp            | 54 +++++++++++++++++++++
 src/Module.hpp                              | 20 ++++++--
 src/Tensor.hpp                              |  2 +
 src/models/imagebind/modeling_imagebind.hpp | 12 +++++
 5 files changed, 97 insertions(+), 5 deletions(-)
 create mode 100644 examples/demo_imagebind_1mod.cpp
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d9e0bdb2..22d2b686 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -384,6 +384,20 @@ else ()
     target_link_libraries(demo_imagebind MLLM_CPU)
 endif ()
 
+add_executable(demo_imagebind_1mod ${PROJECT_SOURCE_DIR}/examples/demo_imagebind_1mod.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC} # ${DIR_SRC_QUANT}
+        src/tokenizers/Tokenizer.cpp
+        src/tokenizers/BPE/Bpe.cpp
+        ${DIR_SRC_PROCESSOE}
+        ${DIR_THIRDPARTY_AUDIO}
+        src/processor/PreProcess.cpp
+)
+if (ARM AND NOT APK) # non-APK ARM build: compile and link with OpenMP (-static-openmp on the link line)
+    target_compile_options(demo_imagebind_1mod PRIVATE -fopenmp)
+    target_link_libraries(demo_imagebind_1mod PUBLIC MLLM_CPU -fopenmp -static-openmp)
+else () # every other configuration links MLLM_CPU only
+    target_link_libraries(demo_imagebind_1mod MLLM_CPU)
+endif ()
+
 add_executable(demo_tinyllama ${PROJECT_SOURCE_DIR}/examples/demo_tinyllama.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC} # ${DIR_SRC_QUANT}
         src/tokenizers/Tokenizer.cpp
         src/tokenizers/Tokenizer.hpp
diff --git a/examples/demo_imagebind_1mod.cpp b/examples/demo_imagebind_1mod.cpp
new file mode 100644
index 00000000..8d81472a
--- /dev/null
+++ b/examples/demo_imagebind_1mod.cpp
@@ -0,0 +1,54 @@
+//
+// Created by Rongjie Yi on 24-7-15.
+//
+#include "cmdline.h"
+#include "models/imagebind/modeling_imagebind.hpp"
+#include "models/imagebind/processing_imagebind.hpp"
+
+using namespace mllm;
+
+int main(int argc, char **argv) { // Demo: runs the ImageBind text, vision and audio models one after another and prints profiling() for each.
+    cmdline::parser cmdParser;
+    cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/clip_vocab.mllm");
+    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/imagebind_huge-q4_k.mllm");
+    cmdParser.add<string>("merges", 'f', "specify mllm tokenizer merges.txt path", false, "../vocab/clip_merges.txt");
+    cmdParser.add<int>("thread", 't', "num of threads", false, 4);
+    cmdParser.parse_check(argc, argv);
+
+    string vocab_path = cmdParser.get<string>("vocab");
+    string model_path = cmdParser.get<string>("model");
+    string merges_path = cmdParser.get<string>("merges");
+    CPUBackend::cpu_threads = cmdParser.get<int>("thread"); // thread count comes from -t/--thread (default 4)
+
+    auto processor = ImagebindProcessor(vocab_path, merges_path);
+
+    ImagebindConfig config("huge");
+
+    // auto input_tensors = processor.process(
+    //     {"a dog.", "A car", "A bird"},config.max_position_embeddings,
+    //     {"../assets/dog_image.jpg", "../assets/car_image.jpg", "../assets/bird_image.jpg"}, config.img_hw,
+    //     {"../assets/dog_audio.wav", "../assets/car_audio.wav", "../assets/bird_audio.wav"});
+    
+    auto input_tensors = processor.process( // one text, one image and one audio sample
+        {"a dog."},config.max_position_embeddings,
+        {"../assets/dog_image.jpg"}, config.img_hw,
+        {"../assets/dog_audio.wav"});
+    
+    std::cout<<"Text| input_shape:["<<input_tensors.text_tensors.batch()<<", "<<input_tensors.text_tensors.sequence()<<", "<<input_tensors.text_tensors.head()<<", "<<input_tensors.text_tensors.dimension()<<"]"<<std::endl;
+    auto text_model = ImagebindTextModel(config);
+    text_model.load(model_path);
+    auto result = text_model({input_tensors.text_tensors}, input_tensors.in_len);
+    text_model.profiling();
+
+    std::cout<<"Vision| input_shape:["<<input_tensors.img_tensors.batch()<<", "<<input_tensors.img_tensors.channel()<<", "<<input_tensors.img_tensors.time()<<", "<<input_tensors.img_tensors.height()<<", "<<input_tensors.img_tensors.width()<<"]"<<std::endl;
+    auto vision_model = ImagebindVisionModel(config);
+    vision_model.load(model_path);
+    result = vision_model({input_tensors.img_tensors});
+    vision_model.profiling();
+
+    std::cout<<"Audio| input_shape:["<<input_tensors.audio_tensors.batch()<<", "<<input_tensors.audio_tensors.sequence()<<", "<<input_tensors.audio_tensors.head()<<", "<<input_tensors.audio_tensors.dimension()<<"]"<<std::endl;
+    auto audio_model = ImagebindAudioModel(config);
+    audio_model.load(model_path);
+    result = audio_model({input_tensors.audio_tensors});
+    audio_model.profiling();
+}
\ No newline at end of file
diff --git a/src/Module.hpp b/src/Module.hpp
index 802147ad..a2ad21f1 100644
--- a/src/Module.hpp
+++ b/src/Module.hpp
@@ -56,6 +56,9 @@ class Module {
     }
 
     void load(string path) {
+        Tensor::gph_.clear();
+        Module::tensor_status = TENSOR_STATIC_INIT;
+
         mllm_time_init();
         initLoader(path);
         Module::doLoad = true;
@@ -75,6 +78,9 @@ class Module {
     }
 
     void load(AbstructLoader &param_loader) {
+        Tensor::gph_.clear();
+        Module::tensor_status = TENSOR_STATIC_INIT;
+        
         loader = &param_loader;
         Module::doLoad = true;
         vector<Tensor> tmps;
@@ -172,17 +178,21 @@ class Module {
         return modules;
     }
 
-    void profiling() {
-        printf("\n");
+    void profiling(string name = "") { // name: optional title; when non-empty it is printed above the statistics
+        // printf("\n");
         std::cout << "===========================================" << std::endl;
-        std::cout << "  Load time: " << load_time_/1000.0F << " s" << std::endl;
-        if(prefilling_token_size_){
-            std::cout << "  Prefilling speed: " << 1000 * prefilling_token_size_ / inference_times_[0] << " tokens/s" << std::endl;
+        if (name != "") {
+            std::cout << "            " << name << std::endl;
+            std::cout << "-------------------------------------------" << std::endl;
         }
+        std::cout << "  Load time: " << load_time_/1000.0F << " s" << std::endl;
         if(inference_times_.size()>1){
+            std::cout << "  Prefilling speed: " << 1000 * prefilling_token_size_ / inference_times_[0] << " tokens/s" << std::endl;
             double sum_decoding_time = std::accumulate(std::begin(inference_times_)+1, std::end(inference_times_), 0.0);
             double mean_decoding_time = sum_decoding_time / (inference_times_.size()-1);
             std::cout << "  Decoding speed: " << 1000 / mean_decoding_time << " tokens/s" << std::endl;
+        } else{
+            std::cout << "  Inference latency: " << inference_times_[0]/1000.0F << " s" << std::endl;
         }
         std::cout << "===========================================" << std::endl;
     }
diff --git a/src/Tensor.hpp b/src/Tensor.hpp
index b39c8ade..d37546ca 100644
--- a/src/Tensor.hpp
+++ b/src/Tensor.hpp
@@ -62,12 +62,14 @@ class Tensor {
     Tensor(Backend *bn) :
         backend_(bn), host_ptr_(), capacity_(0), dtype_(MLLM_TYPE_F32) {
     }
+    /* NOTE(review): destructor disabled by this change; it holds the only visible backend_->free(host_ptr_) call, so confirm host buffers are still released elsewhere.
     ~Tensor() {
         if (host_ptr_ != nullptr && masterTensor() == nullptr && !aggregated_&& gph_.find(name_) == gph_.end()) {
             backend_->free(host_ptr_);
             host_ptr_ = nullptr;
         }
     }
+    */
     static map<string, Tensor> gph_;
     std::map<Chl, int>& chls() {
         return chls_;
diff --git a/src/models/imagebind/modeling_imagebind.hpp b/src/models/imagebind/modeling_imagebind.hpp
index 883e6349..02750a4e 100644
--- a/src/models/imagebind/modeling_imagebind.hpp
+++ b/src/models/imagebind/modeling_imagebind.hpp
@@ -80,6 +80,10 @@ class ImagebindVisionModel final : public Module {
 
 public:
     ImagebindVisionModel() = default;
+    ImagebindVisionModel(const ImagebindConfig &config) :
+        ImagebindVisionModel(config.vision_hidden_dim, config.vision_head_size, config.vision_ffn_hidden,
+                             config.head_hidden_dim, config.patch, config.patch_time, config.img_hw,
+                             config.vision_block_num, config.names_config) {} // delegates to the field-wise constructor
     ImagebindVisionModel(int hidden_dim, int head_size, int ffn_hidden, int head_hidden_dim,
                          int patch, int patch_time, int img_hw, int block_num,
                          const ImagebindNameConfig &names) {
@@ -128,6 +132,10 @@ class ImagebindTextModel final : public Module {
 
 public:
     ImagebindTextModel() = default;
+    ImagebindTextModel(const ImagebindConfig &config) :
+        ImagebindTextModel(config.text_hidden_dim, config.text_head_size, config.text_ffn_hidden,
+                           config.head_hidden_dim, config.vocab_size, config.max_position_embeddings,
+                           config.text_block_num, config.names_config) {} // delegates to the field-wise constructor
     ImagebindTextModel(int hidden_dim, int head_size, int ffn_hidden, int head_hidden_dim,
                        int vocab_size, int max_position_embeddings, int block_num,
                        const ImagebindNameConfig &names) {
@@ -186,6 +194,10 @@ class ImagebindAudioModel final : public Module {
 
 public:
     ImagebindAudioModel() = default;
+    ImagebindAudioModel(const ImagebindConfig &config) :
+        ImagebindAudioModel(config.audio_hidden_dim, config.audio_head_size, config.audio_ffn_hidden,
+                            config.head_hidden_dim, config.audio_kernal, config.audio_stride, config.audio_h,
+                            config.audio_w, config.audio_block_num, config.names_config) {} // delegates to the field-wise constructor
     ImagebindAudioModel(int hidden_dim, int head_size, int ffn_hidden, int head_hidden_dim,
                          int patch, int stride, int img_h, int img_w, int block_num,
                          const ImagebindNameConfig &names) {

From e4e4e017e5826d38956355586e5f18f9f621fd1e Mon Sep 17 00:00:00 2001
From: yirongjie <yirj0809@gmail.com>
Date: Tue, 16 Jul 2024 02:19:54 +0000
Subject: [PATCH 2/3] fix: profiling

---
 examples/demo_imagebind_1mod.cpp | 16 ++++++++++----
 src/Module.hpp                   | 37 ++++++++++++++++++++++++++------
 2 files changed, 43 insertions(+), 10 deletions(-)

diff --git a/examples/demo_imagebind_1mod.cpp b/examples/demo_imagebind_1mod.cpp
index 8d81472a..38782ff2 100644
--- a/examples/demo_imagebind_1mod.cpp
+++ b/examples/demo_imagebind_1mod.cpp
@@ -24,6 +24,8 @@ int main(int argc, char **argv) {
 
     ImagebindConfig config("huge");
 
+    int loop_times = 10;
+
     // auto input_tensors = processor.process(
     //     {"a dog.", "A car", "A bird"},config.max_position_embeddings,
     //     {"../assets/dog_image.jpg", "../assets/car_image.jpg", "../assets/bird_image.jpg"}, config.img_hw,
@@ -37,18 +39,24 @@ int main(int argc, char **argv) {
     std::cout<<"Text| input_shape:["<<input_tensors.text_tensors.batch()<<", "<<input_tensors.text_tensors.sequence()<<", "<<input_tensors.text_tensors.head()<<", "<<input_tensors.text_tensors.dimension()<<"]"<<std::endl;
     auto text_model = ImagebindTextModel(config);
     text_model.load(model_path);
-    auto result = text_model({input_tensors.text_tensors}, input_tensors.in_len);
+    for (int step = 0; step < loop_times; step++) {
+        auto result = text_model({input_tensors.text_tensors}, input_tensors.in_len);
+    }
     text_model.profiling();
 
     std::cout<<"Vision| input_shape:["<<input_tensors.img_tensors.batch()<<", "<<input_tensors.img_tensors.channel()<<", "<<input_tensors.img_tensors.time()<<", "<<input_tensors.img_tensors.height()<<", "<<input_tensors.img_tensors.width()<<"]"<<std::endl;
     auto vision_model = ImagebindVisionModel(config);
-    vision_model.load(model_path);
-    result = vision_model({input_tensors.img_tensors});
+    vision_model.load(model_path);    
+    for (int step = 0; step < loop_times; step++) {
+        auto result = vision_model({input_tensors.img_tensors});
+    }
     vision_model.profiling();
 
     std::cout<<"Audio| input_shape:["<<input_tensors.audio_tensors.batch()<<", "<<input_tensors.audio_tensors.sequence()<<", "<<input_tensors.audio_tensors.head()<<", "<<input_tensors.audio_tensors.dimension()<<"]"<<std::endl;
     auto audio_model = ImagebindAudioModel(config);
     audio_model.load(model_path);
-    result = audio_model({input_tensors.audio_tensors});
+    for (int step = 0; step < loop_times; step++) {
+        auto result = audio_model({input_tensors.audio_tensors});
+    }
     audio_model.profiling();
 }
\ No newline at end of file
diff --git a/src/Module.hpp b/src/Module.hpp
index a2ad21f1..517aaf07 100644
--- a/src/Module.hpp
+++ b/src/Module.hpp
@@ -23,7 +23,9 @@ class Module {
 private:
     double load_time_;
     int prefilling_token_size_=0;
+    int decoding_token_size_=0;
     vector<double> inference_times_;
+    vector<vector<int>> last_shape_bshd_;
     
 public:
     static map<BackendType, Backend *> backends;
@@ -108,28 +110,49 @@ class Module {
             return Forward(inputs, anyArgs);
         }
         if (inputs[0].ttype() == TensorType::INPUT_TENSOR) {
-            for (auto &input : inputs) {
+            bool need_setup = true;
+            for (int i = 0; i < inputs.size(); i++) {
+                auto &input = inputs[i];
                 input.setTtype(TensorType::NORMAL_TENSOR);
                 input.status() = TENSOR_STATIC_INIT;
                 if(input.batch() == 0){
                     Tensor::gph_[input.name()] = input;
                 }
+                // Setup may be skipped only when every input repeats the shape it had on the previous call.
+                // if LLM/VLLM model, the `need_setup` should be `true`
+                const bool same_shape = input.sequence() != 1 && i < static_cast<int>(last_shape_bshd_.size()) &&
+                                        input.batch() == last_shape_bshd_[i][0] &&
+                                        input.sequence() == last_shape_bshd_[i][1] &&
+                                        input.head() == last_shape_bshd_[i][2] &&
+                                        input.dimension() == last_shape_bshd_[i][3];
+                // The first input decides; a later input can only switch setup back on, never off.
+                need_setup = (i > 0 && need_setup) || !same_shape;
             }
             tensor_status = TENSOR_STATIC_INIT;
 
             uint64_t time_start = mllm_time_us();
-            Forward(inputs, anyArgs);
+            if(need_setup){
+                Forward(inputs, anyArgs);
+            }
             for (auto &input : inputs) {
                 input.status() = TENSOR_STATIC_READY;
             }
             tensor_status = TENSOR_STATIC_READY;
             auto output = Forward(inputs, anyArgs);
             uint64_t time_end = mllm_time_us();
+
+            double inference_time_ = (time_end - time_start) / 1000.0F;//ms
+            inference_times_.push_back(inference_time_);
             if(prefilling_token_size_==0){
                 prefilling_token_size_ = inputs[0].sequence();
+            }else if(decoding_token_size_==0){
+                decoding_token_size_ = inputs[0].sequence();
+            }
+            last_shape_bshd_.clear();
+            for (auto &input : inputs) {
+                last_shape_bshd_.push_back({input.batch(), input.sequence(), 
+                                            input.head(), input.dimension()});
             }
-            double inference_time_ = (time_end - time_start) / 1000.0F;//ms
-            inference_times_.push_back(inference_time_);
 
             return output;
         } else {
@@ -186,13 +209,15 @@ class Module {
             std::cout << "-------------------------------------------" << std::endl;
         }
         std::cout << "  Load time: " << load_time_/1000.0F << " s" << std::endl;
-        if(inference_times_.size()>1){
+        if(inference_times_.size()>1 && decoding_token_size_ != prefilling_token_size_){
             std::cout << "  Prefilling speed: " << 1000 * prefilling_token_size_ / inference_times_[0] << " tokens/s" << std::endl;
             double sum_decoding_time = std::accumulate(std::begin(inference_times_)+1, std::end(inference_times_), 0.0);
             double mean_decoding_time = sum_decoding_time / (inference_times_.size()-1);
             std::cout << "  Decoding speed: " << 1000 / mean_decoding_time << " tokens/s" << std::endl;
         } else{
-            std::cout << "  Inference latency: " << inference_times_[0]/1000.0F << " s" << std::endl;
+            double sum_time = std::accumulate(std::begin(inference_times_), std::end(inference_times_), 0.0);
+            double mean_time = inference_times_.empty() ? 0.0 : sum_time / (inference_times_.size()); // no recorded run yet: report 0 instead of NaN
+            std::cout << "  Inference latency: " << mean_time/1000.0F << " s" << std::endl;
         }
         std::cout << "===========================================" << std::endl;
     }

From 4800360626d450d457a1f71f59f90993a26e343d Mon Sep 17 00:00:00 2001
From: yirongjie <yirj0809@gmail.com>
Date: Tue, 16 Jul 2024 02:49:28 +0000
Subject: [PATCH 3/3] fix: suppress spurious error prints when loading

---
 src/Module.hpp      | 13 ++++++++-----
 src/ParamLoader.cpp |  6 ++++--
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/src/Module.hpp b/src/Module.hpp
index 517aaf07..12c57d59 100644
--- a/src/Module.hpp
+++ b/src/Module.hpp
@@ -110,6 +110,14 @@ class Module {
             return Forward(inputs, anyArgs);
         }
         if (inputs[0].ttype() == TensorType::INPUT_TENSOR) {
+            if(prefilling_token_size_==0){ // first time init
+                if(!Tensor::gph_.empty()){
+                    Tensor::gph_.clear();
+                }
+                prefilling_token_size_ = inputs[0].sequence();
+            }else if(decoding_token_size_==0){
+                decoding_token_size_ = inputs[0].sequence();
+            }
             bool need_setup = true;
             for (int i = 0; i < inputs.size(); i++) {
                 auto &input = inputs[i];
@@ -143,11 +151,6 @@ class Module {
 
             double inference_time_ = (time_end - time_start) / 1000.0F;//ms
             inference_times_.push_back(inference_time_);
-            if(prefilling_token_size_==0){
-                prefilling_token_size_ = inputs[0].sequence();
-            }else if(decoding_token_size_==0){
-                decoding_token_size_ = inputs[0].sequence();
-            }
             last_shape_bshd_.clear();
             for (auto &input : inputs) {
                 last_shape_bshd_.push_back({input.batch(), input.sequence(), 
diff --git a/src/ParamLoader.cpp b/src/ParamLoader.cpp
index d27bc537..f9029d8b 100644
--- a/src/ParamLoader.cpp
+++ b/src/ParamLoader.cpp
@@ -61,7 +61,7 @@ ParamLoader::ParamLoader(std::string filename, bool use_mmap) :
     // #endif
 
     if (this->fp_ == nullptr) {
-        std::cout << "param open file failed" << std::endl;
+        // std::cout << "param open file failed" << std::endl;
         return;
         int errorCode = errno;
         char *errorMsg = strerror(errorCode);
@@ -123,7 +123,9 @@ std::tuple<uint8_t *, uint64_t> ParamLoader::load(string name) {
 }
 DataType ParamLoader::getDataType(string name) {
     if (data_type_.count(name) != 1) {
-        std::cerr<<name<<" not found"<<std::endl;
+        if (this->fp_ != nullptr) {
+            std::cerr<<name<<" not found"<<std::endl;
+        }
         return DataType::MLLM_TYPE_COUNT;
     }
     int type = data_type_[name];