Commit
feat add dealloc for activation_tensors for only CPU Backend. (#201)
yirongjie authored Nov 27, 2024
1 parent 2a58575 commit c921e90
Showing 21 changed files with 222 additions and 88 deletions.
13 changes: 8 additions & 5 deletions examples/demo_bert.cpp
@@ -5,6 +5,7 @@
 #include "models/bert/modeling_bert.hpp"
 #include "models/bert/tokenization_bert.hpp"
 #include "cmdline.h"
+#include <vector>
 
 /*
  * an intent to support gte-small BertModel to do text embedding
@@ -24,15 +25,17 @@ int main(int argc, char *argv[]) {
     CPUBackend::cpu_threads = cmdParser.get<int>("thread");
 
     BertTokenizer tokenizer(vocab_path, true);
-    string text = "Help me set an alarm at 21:30";
-    auto inputs = tokenizer.tokenizes(text);
     auto config = BertConfig();
     auto model = BertModel(config);
     model.load(model_path);
 
-    auto res = model({inputs[0], inputs[1], inputs[2]})[0];
-
-    res.printData<float>();
+    string text = "Help me set an alarm at 21:30";
+    vector<string> texts = {text, text};
+    for (auto &text : texts) {
+        auto inputs = tokenizer.tokenizes(text);
+        auto res = model({inputs[0], inputs[1], inputs[2]})[0];
+        res.printData<float>();
+    }
 
     return 0;
 }
1 change: 1 addition & 0 deletions examples/demo_gemma.cpp
@@ -54,6 +54,7 @@ int main(int argc, char **argv) {
             chatPostProcessing(out_token, input_tensor, {});
         }
         printf("\n");
+        model.clear_kvcache();
    }

     return 0;
2 changes: 1 addition & 1 deletion examples/demo_imagebind_1mod.cpp
@@ -13,7 +13,7 @@ int main(int argc, char **argv) {
     cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/imagebind_huge-q4_k.mllm");
     cmdParser.add<string>("merges", 'f', "specify mllm tokenizer merges.txt path", false, "../vocab/clip_merges.txt");
     cmdParser.add<int>("thread", 't', "num of threads", false, 4);
-    cmdParser.add<int>("loop_times", 'l', "number of inference loops", false, 10);
+    cmdParser.add<int>("loop_times", 'l', "number of inference loops", false, 2);
     cmdParser.add<string>("modality", 'o', "inference modality (text/vision/audio/all)", false, "all");
     cmdParser.parse_check(argc, argv);
 
9 changes: 5 additions & 4 deletions examples/demo_openelm.cpp
@@ -48,10 +48,10 @@ int main(int argc, char **argv) {
 
         LlmTextGeneratorOpts opt{
             .max_new_tokens = 100,
-            .do_sample = true,
-            .temperature = 0.3F,
-            .top_k = 50,
-            .top_p = 0.F,
+            .do_sample = false,
+            // .temperature = 0.3F,
+            // .top_k = 50,
+            // .top_p = 0.F,
         };
         model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
             auto out_string = tokenizer.detokenize({out_token});
@@ -61,5 +61,6 @@ int main(int argc, char **argv) {
             return true;
         });
         std::cout << "\n";
+        model.clear_kvcache();
     }
 }
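Note on the options disabled above: with .do_sample = false the generator simply takes the highest-scoring token at every step (greedy decoding), while temperature, top_k and top_p only matter when sampling is enabled, where they reshape and truncate the distribution that the next token is drawn from. A compact sketch of the difference over one step of toy logits (this is not the mllm sampler; the numbers and variable names are illustrative only):

    #include <algorithm>
    #include <cmath>
    #include <iostream>
    #include <random>
    #include <utility>
    #include <vector>

    // One decoding step over toy logits: greedy pick vs. temperature + top-k sampling.
    int main() {
        std::vector<float> logits = {2.0f, 1.0f, 0.5f, -1.0f}; // arbitrary scores for 4 tokens

        // do_sample = false: always take the argmax.
        int greedy = int(std::max_element(logits.begin(), logits.end()) - logits.begin());
        std::cout << "greedy token id: " << greedy << "\n";

        // do_sample = true: sharpen with temperature, keep the top_k candidates, draw one.
        float temperature = 0.3f;
        int top_k = 2;
        std::vector<std::pair<float, int>> scored;
        for (int i = 0; i < int(logits.size()); ++i) scored.push_back({logits[i] / temperature, i});
        std::sort(scored.begin(), scored.end(),
                  [](const auto &a, const auto &b) { return a.first > b.first; });
        scored.resize(top_k);
        std::vector<float> probs;
        float denom = 0.0f;
        for (const auto &s : scored) { probs.push_back(std::exp(s.first - scored[0].first)); denom += probs.back(); }
        for (auto &p : probs) p /= denom; // softmax over the kept candidates
        std::mt19937 rng(42);
        std::discrete_distribution<int> pick(probs.begin(), probs.end());
        std::cout << "sampled token id: " << scored[pick(rng)].second << "\n";
    }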
2 changes: 1 addition & 1 deletion examples/demo_phi3v.cpp
@@ -49,7 +49,7 @@ int main(int argc, char **argv) {
             auto [not_end, output_string] = processor.tokenizer->postprocess(out_string);
             if (!not_end) { break; }
             std::cout << output_string << std::flush;
-            chatPostProcessing(out_token, input_tensor[0], {});
+            chatPostProcessing(out_token, input_tensor[0], {&input_tensor[1], &input_tensor[2]});
         }
         printf("\n");
     }
2 changes: 1 addition & 1 deletion examples/demo_stablelm.cpp
@@ -11,7 +11,7 @@ int main(int argc, char **argv) {
     cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/stablelm_vocab.mllm");
     cmdParser.add<string>("merge", 'e', "specify mllm merge path", false, "../vocab/stablelm_merges.txt");
     cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/stablelm-2-1.6b-chat-q4_k.mllm");
-    cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
+    cmdParser.add<int>("limits", 'l', "max KV cache size", false, 600);
     cmdParser.add<int>("thread", 't', "num of threads", false, 4);
     cmdParser.parse_check(argc, argv);
 
16 changes: 12 additions & 4 deletions examples/demo_vit.cpp
@@ -1,4 +1,5 @@
 #include <iostream>
+#include <vector>
 #include "cmdline.h"
 #include "models/vit/modeling_vit.hpp"
 #include "models/vit/labels_vit.hpp"
@@ -21,8 +22,15 @@ int main(int argc, char **argv) {
     auto model = ViTModel(config);
     model.load(model_path);
 
-    auto input_tensor = processor.process("../assets/cat.jpg", 224);
-    auto result = model({input_tensor});
-    auto token_idx = processor.postProcess(result[0]);
-    std::cout << imagenet_id2label[token_idx] << std::endl;
+    vector<string> imgs = {"../assets/cat.jpg",
+                           "../assets/dog_image.jpg",
+                           "../assets/bird_image.jpg",
+                           "../assets/car_image.jpg",
+                           "../assets/bus.png"};
+    for (auto &img : imgs) {
+        auto input_tensor = processor.process(img, 224);
+        auto result = model({input_tensor});
+        auto token_idx = processor.postProcess(result[0]);
+        std::cout << imagenet_id2label[token_idx] << std::endl;
+    }
 }
2 changes: 1 addition & 1 deletion examples/demo_yi.cpp
@@ -20,7 +20,7 @@ int main(int argc, char **argv) {
     cmdline::parser cmdParser;
     cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/yi_vocab.mllm");
     cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/yi-1.5-6b-chat-q4_k.mllm");
-    cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
+    cmdParser.add<int>("limits", 'l', "max KV cache size", false, 600);
     cmdParser.add<int>("thread", 't', "num of threads", false, 4);
     cmdParser.parse_check(argc, argv);
 
24 changes: 24 additions & 0 deletions src/Layer.hpp
@@ -117,6 +117,7 @@ class Layer {
             module = Module::llm_model_ptr;
         }
         map<string, shared_ptr<Tensor>> &activation_tensors = module->activation_tensors;
+        auto &activation_tensors_num = module->activation_tensors_num;
         Module::runlistIdx = saved_list_idx;
         bool do_init = false;
         // set backend to current module device and try to create op
@@ -182,6 +183,7 @@ class Layer {
                 activation_tensors[next_name] = std::make_shared<Tensor>(backend_);
                 activation_tensors[next_name]->setName(next_name);
                 activation_tensors[next_name]->setModule(module);
+                activation_tensors_num[next_name] = 0;
             }
         }
         if (module->doLoad) {
@@ -237,6 +239,28 @@ class Layer {
                 break;
             }
         }
+        if (Backend::global_backends.size() == 1) {
+            for (auto input_tensor : input_tensors) {
+                if ((activation_tensors_num.find(input_tensor->name()) != activation_tensors_num.end())) {
+                    switch (Tensor::tensor_status) {
+                    case TENSOR_STATIC_INIT: {
+                        activation_tensors_num[input_tensor->name()] += 1;
+                        break;
+                    }
+                    case TENSOR_STATIC_READY: {
+                        activation_tensors_num[input_tensor->name()] -= 1;
+                        break;
+                    }
+                    default: {
+                    }
+                    }
+                    if (activation_tensors_num[input_tensor->name()] == 0 && activation_tensors[input_tensor->name()]->sequence() > 1) {
+                        activation_tensors[input_tensor->name()]->dealloc();
+                        // std::cout << input_tensor->name() << "|" << std::endl;
+                    }
+                }
+            }
+        }
 #ifdef DEBUGOPTIME
         if (Tensor::tensor_status == TENSOR_STATIC_READY) {
             auto end_t = mllm_time_us();
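The block added above amounts to a per-tensor reference count, and it only runs when a single backend (the CPU backend) is registered. During the TENSOR_STATIC_INIT pass each op that consumes an activation tensor increments the tensor's counter; during the TENSOR_STATIC_READY pass each consumer decrements it; when the counter returns to zero and the tensor's sequence() is greater than 1, its buffer is released with dealloc(). A minimal self-contained sketch of the same idea, using plain standard-library types rather than mllm's Tensor/Backend classes (Buffer, Phase and the consumer count below are illustrative stand-ins, not part of the mllm API):

    #include <iostream>
    #include <map>
    #include <memory>
    #include <string>
    #include <vector>

    // Hypothetical stand-ins for an activation tensor's storage and the two
    // execution phases; the real classes live in mllm's Tensor/Backend code.
    struct Buffer {
        std::vector<float> data;
        void dealloc() { data.clear(); data.shrink_to_fit(); } // drop the payload, keep the object
    };

    enum Phase { STATIC_INIT, STATIC_READY };

    int main() {
        std::map<std::string, std::shared_ptr<Buffer>> activation_tensors;
        std::map<std::string, int> activation_tensors_num; // per-tensor consumer count

        activation_tensors["x"] = std::make_shared<Buffer>();
        activation_tensors["x"]->data.resize(1024);
        activation_tensors_num["x"] = 0;

        const int consumers = 2; // two downstream ops read "x"
        for (Phase phase : {STATIC_INIT, STATIC_READY}) {
            for (int op = 0; op < consumers; ++op) {
                if (phase == STATIC_INIT) {
                    activation_tensors_num["x"] += 1; // setup pass: count each consumer
                } else {
                    activation_tensors_num["x"] -= 1; // execution pass: this consumer is done with "x"
                }
                if (activation_tensors_num["x"] == 0 && phase == STATIC_READY) {
                    activation_tensors["x"]->dealloc(); // last consumer ran, buffer can go
                    std::cout << "freed x\n";
                }
            }
        }
        return 0;
    }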
20 changes: 10 additions & 10 deletions src/Module.cpp
@@ -25,33 +25,33 @@ std::unordered_map<string, shared_ptr<Op>> Module::tensor_func_ops;
 vector<double> Module::profiling(string name) {
     vector<double> output;
     // printf("\n");
-    MLLM_LOG_INFO_STREAM << "===========================================" << std::endl;
+    std::cout << "===========================================" << std::endl;
     if (!name.empty()) {
-        MLLM_LOG_INFO_STREAM << " " << name << std::endl;
-        MLLM_LOG_INFO_STREAM << "-------------------------------------------" << std::endl;
+        std::cout << " " << name << std::endl;
+        std::cout << "-------------------------------------------" << std::endl;
     }
     double load_time_s = load_time_ / 1000.0F;
-    MLLM_LOG_INFO_STREAM << " Load time: " << load_time_ / 1000.0F << " s" << std::endl;
+    std::cout << " Load time: " << load_time_ / 1000.0F << " s" << std::endl;
     if (inference_times_.size() > 1 && decoding_token_size_ != prefilling_token_size_) {
         double prefile_speed = 1000 * prefilling_token_size_ / inference_times_[0];
-        MLLM_LOG_INFO_STREAM << " Prefilling speed: " << prefile_speed << " tokens/s" << std::endl;
+        std::cout << " Prefilling speed: " << prefile_speed << " tokens/s" << std::endl;
         double sum_decoding_time = std::accumulate(std::begin(inference_times_) + 1, std::end(inference_times_), 0.0);
         double mean_decoding_time = sum_decoding_time / (inference_times_.size() - 1);
         double decoding_speed = 1000 / mean_decoding_time;
-        MLLM_LOG_INFO_STREAM << " Decoding speed: " << decoding_speed << " tokens/s" << std::endl;
+        std::cout << " Decoding speed: " << decoding_speed << " tokens/s" << std::endl;
         output = {load_time_s, prefile_speed, decoding_speed};
     } else {
         double sum_time = std::accumulate(std::begin(inference_times_), std::end(inference_times_), 0.0);
         double mean_time = sum_time / (inference_times_.size());
         double inference_time_s = mean_time / 1000.0F;
-        MLLM_LOG_INFO_STREAM << " Inference latency: " << mean_time / 1000.0F << " s" << std::endl;
+        std::cout << " Inference latency: " << mean_time / 1000.0F << " s" << std::endl;
         output = {load_time_s, inference_time_s};
     }
     // double sum_time = std::accumulate(std::begin(inference_times_), std::end(inference_times_), 0.0);
-    // MLLM_LOG_INFO_STREAM<<sum_time<< " - "<<Tensor::forward_times<<" = "<<sum_time-Tensor::forward_times<<std::endl;
-    // MLLM_LOG_INFO_STREAM<<Tensor::forward_times<< " - "<<Tensor::forward_times_2<<" = "<<Tensor::forward_times-Tensor::forward_times_2<<std::endl;
+    // std::cout<<sum_time<< " - "<<Tensor::forward_times<<" = "<<sum_time-Tensor::forward_times<<std::endl;
+    // std::cout<<Tensor::forward_times<< " - "<<Tensor::forward_times_2<<" = "<<Tensor::forward_times-Tensor::forward_times_2<<std::endl;
 
-    MLLM_LOG_INFO_STREAM << "===========================================" << std::endl;
+    std::cout << "===========================================" << std::endl;
 
     prefilling_token_size_ = 0;
     decoding_token_size_ = 0;
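For reference, the arithmetic this function reports: inference_times_ holds per-call latencies in milliseconds, so prefilling speed is 1000 * prefilling_token_size_ / inference_times_[0] tokens/s, and decoding speed is 1000 divided by the mean of the remaining entries, since each decode call produces one token. A small standalone sketch of the same computation with made-up timings (the numbers are illustrative only):

    #include <iostream>
    #include <numeric>
    #include <vector>

    int main() {
        // Hypothetical measurements: the first entry is the prefill call, the
        // rest are single-token decode steps, all in milliseconds.
        std::vector<double> inference_times_ms = {180.0, 42.0, 40.0, 38.0, 40.0};
        int prefilling_token_size = 64;

        double prefill_speed = 1000.0 * prefilling_token_size / inference_times_ms[0];
        double sum_decode = std::accumulate(inference_times_ms.begin() + 1, inference_times_ms.end(), 0.0);
        double mean_decode = sum_decode / (inference_times_ms.size() - 1);
        double decode_speed = 1000.0 / mean_decode; // one token per decode call

        std::cout << "Prefilling speed: " << prefill_speed << " tokens/s\n"; // ~355.6
        std::cout << "Decoding speed: " << decode_speed << " tokens/s\n";    // 25
    }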
22 changes: 5 additions & 17 deletions src/Module.hpp
@@ -36,8 +36,10 @@ class Module {
 
 public:
     map<string, shared_ptr<Tensor>> activation_tensors;
+    map<string, int> activation_tensors_num;
     AbstructLoader *loader;
     bool doLoad = false;
+    bool op_transposed_flag = false;
 
     static Module *llm_model_ptr;
     // tag to indicate the multi-chunk prefilling
@@ -183,33 +185,19 @@ class Module {
         } else if (decoding_token_size_ == 0) {
             decoding_token_size_ = inputs[0].sequence();
         }
-        bool need_setup = true;
         for (int i = 0; i < inputs.size(); i++) {
             auto &input = inputs[i];
             input.setName("input" + std::to_string(i));
             input.setTtype(TensorType::NORMAL_TENSOR);
             activation_tensors[input.name()] = std::shared_ptr<Tensor>(&input, [](Tensor *) {});
             activation_tensors[input.name()]->setName(input.name());
             activation_tensors[input.name()]->setModule(this);
-            llm_model_ptr = this;
-            if (inputs[0].sequence() != 1 && !last_shape_bshd_.empty()) {
-                // if LLM/VLLM model, the `need_setup` should be `true`
-                if (input.batch() == last_shape_bshd_[i][0] & input.sequence() == last_shape_bshd_[i][1] & input.head() == last_shape_bshd_[i][2] & input.dimension() == last_shape_bshd_[i][3]) {
-                    // if it is the QNN multi-chunk prefilling, the `need_setup` should be `true` to reshape & setUp CPU Ops
-                    if (Module::isMultiChunkPrefilling) {
-                        need_setup = true;
-                        break;
-                    }
-                    need_setup = false;
-                }
-            }
         }
+        llm_model_ptr = this;
         Tensor::tensor_status = TENSOR_STATIC_INIT;
 
         uint64_t time_start = mllm_time_us();
-        if (need_setup) {
-            Forward(inputs, anyArgs);
-        }
+        Forward(inputs, anyArgs);
         Tensor::tensor_status = TENSOR_STATIC_READY;
         // uint64_t time_start = mllm_time_us();
         auto output = Forward(inputs, anyArgs);
@@ -222,7 +210,7 @@ class Module {
                 last_shape_bshd_.push_back({input.batch(), input.sequence(),
                                             input.head(), input.dimension()});
             }
-
+            llm_model_ptr->op_transposed_flag = true;
             return output;
         } else { // inner Modules
             // offload according to the backends' info inited during loading
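With need_setup removed, the top-level call now always makes two passes over Forward: an INIT pass (Tensor::tensor_status = TENSOR_STATIC_INIT) in which ops are reshaped and set up and, via the Layer.hpp change above, the consumers of each activation tensor are counted, followed by a READY pass (TENSOR_STATIC_READY) that executes the ops and lets the last consumer free its inputs. A toy sketch of that two-pass control flow (ToyModule and its members are illustrative stand-ins, not mllm types):

    #include <iostream>
    #include <vector>

    // Toy two-pass driver mirroring the flow of Module::operator() after this
    // change: every call runs an INIT pass (setup / consumer counting) and then
    // a READY pass (execution).
    enum TensorStatus { TENSOR_STATIC_INIT, TENSOR_STATIC_READY };

    struct ToyModule {
        TensorStatus tensor_status = TENSOR_STATIC_INIT;

        std::vector<float> Forward(const std::vector<float> &inputs) {
            if (tensor_status == TENSOR_STATIC_INIT) {
                std::cout << "INIT pass: reshape/setUp ops, count activation consumers\n";
                return {};
            }
            std::cout << "READY pass: execute ops, free activations after last use\n";
            std::vector<float> out;
            for (float v : inputs) out.push_back(v * 2.0f);
            return out;
        }

        std::vector<float> operator()(const std::vector<float> &inputs) {
            tensor_status = TENSOR_STATIC_INIT;
            Forward(inputs);              // setup pass, now unconditional (need_setup is gone)
            tensor_status = TENSOR_STATIC_READY;
            return Forward(inputs);       // execution pass produces the output
        }
    };

    int main() {
        ToyModule m;
        auto out = m({1.0f, 2.0f, 3.0f});
        std::cout << "output size: " << out.size() << "\n";
    }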