Commit
feat add dealloc for activation_tensors for only CPU Backend. (#201)
yirongjie authored Nov 27, 2024
1 parent 2a58575 commit c921e90
Showing 21 changed files with 222 additions and 88 deletions.
13 changes: 8 additions & 5 deletions examples/demo_bert.cpp
@@ -5,6 +5,7 @@
 #include "models/bert/modeling_bert.hpp"
 #include "models/bert/tokenization_bert.hpp"
 #include "cmdline.h"
+#include <vector>
 
 /*
  * an intent to support gte-small BertModel to do text embedding
@@ -24,15 +25,17 @@ int main(int argc, char *argv[]) {
     CPUBackend::cpu_threads = cmdParser.get<int>("thread");
 
     BertTokenizer tokenizer(vocab_path, true);
-    string text = "Help me set an alarm at 21:30";
-    auto inputs = tokenizer.tokenizes(text);
     auto config = BertConfig();
     auto model = BertModel(config);
     model.load(model_path);
 
-    auto res = model({inputs[0], inputs[1], inputs[2]})[0];
-
-    res.printData<float>();
+    string text = "Help me set an alarm at 21:30";
+    vector<string> texts = {text, text};
+    for (auto &text : texts) {
+        auto inputs = tokenizer.tokenizes(text);
+        auto res = model({inputs[0], inputs[1], inputs[2]})[0];
+        res.printData<float>();
+    }
 
     return 0;
 }
1 change: 1 addition & 0 deletions examples/demo_gemma.cpp
@@ -54,6 +54,7 @@ int main(int argc, char **argv) {
             chatPostProcessing(out_token, input_tensor, {});
         }
         printf("\n");
+        model.clear_kvcache();
    }

     return 0;
2 changes: 1 addition & 1 deletion examples/demo_imagebind_1mod.cpp
@@ -13,7 +13,7 @@ int main(int argc, char **argv) {
     cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/imagebind_huge-q4_k.mllm");
     cmdParser.add<string>("merges", 'f', "specify mllm tokenizer merges.txt path", false, "../vocab/clip_merges.txt");
     cmdParser.add<int>("thread", 't', "num of threads", false, 4);
-    cmdParser.add<int>("loop_times", 'l', "number of inference loops", false, 10);
+    cmdParser.add<int>("loop_times", 'l', "number of inference loops", false, 2);
     cmdParser.add<string>("modality", 'o', "inference modality (text/vision/audio/all)", false, "all");
     cmdParser.parse_check(argc, argv);
 
9 changes: 5 additions & 4 deletions examples/demo_openelm.cpp
@@ -48,10 +48,10 @@ int main(int argc, char **argv) {
 
         LlmTextGeneratorOpts opt{
             .max_new_tokens = 100,
-            .do_sample = true,
-            .temperature = 0.3F,
-            .top_k = 50,
-            .top_p = 0.F,
+            .do_sample = false,
+            // .temperature = 0.3F,
+            // .top_k = 50,
+            // .top_p = 0.F,
         };
         model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
             auto out_string = tokenizer.detokenize({out_token});
@@ -61,5 +61,6 @@ int main(int argc, char **argv) {
             return true;
         });
         std::cout << "\n";
+        model.clear_kvcache();
     }
 }
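Note on the options disabled above: with .do_sample = false the generator simply takes the highest-scoring token at every step (greedy decoding), while temperature, top_k and top_p only matter when sampling is enabled, where they reshape and truncate the distribution that the next token is drawn from. A compact sketch of the difference over one step of toy logits (this is not the mllm sampler; the numbers and variable names are illustrative only):

    #include <algorithm>
    #include <cmath>
    #include <iostream>
    #include <random>
    #include <utility>
    #include <vector>

    // One decoding step over toy logits: greedy pick vs. temperature + top-k sampling.
    int main() {
        std::vector<float> logits = {2.0f, 1.0f, 0.5f, -1.0f}; // arbitrary scores for 4 tokens

        // do_sample = false: always take the argmax.
        int greedy = int(std::max_element(logits.begin(), logits.end()) - logits.begin());
        std::cout << "greedy token id: " << greedy << "\n";

        // do_sample = true: sharpen with temperature, keep the top_k candidates, draw one.
        float temperature = 0.3f;
        int top_k = 2;
        std::vector<std::pair<float, int>> scored;
        for (int i = 0; i < int(logits.size()); ++i) scored.push_back({logits[i] / temperature, i});
        std::sort(scored.begin(), scored.end(),
                  [](const auto &a, const auto &b) { return a.first > b.first; });
        scored.resize(top_k);
        std::vector<float> probs;
        float denom = 0.0f;
        for (const auto &s : scored) { probs.push_back(std::exp(s.first - scored[0].first)); denom += probs.back(); }
        for (auto &p : probs) p /= denom; // softmax over the kept candidates
        std::mt19937 rng(42);
        std::discrete_distribution<int> pick(probs.begin(), probs.end());
        std::cout << "sampled token id: " << scored[pick(rng)].second << "\n";
    }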
2 changes: 1 addition & 1 deletion examples/demo_phi3v.cpp
@@ -49,7 +49,7 @@ int main(int argc, char **argv) {
             auto [not_end, output_string] = processor.tokenizer->postprocess(out_string);
             if (!not_end) { break; }
             std::cout << output_string << std::flush;
-            chatPostProcessing(out_token, input_tensor[0], {});
+            chatPostProcessing(out_token, input_tensor[0], {&input_tensor[1], &input_tensor[2]});
         }
         printf("\n");
     }
2 changes: 1 addition & 1 deletion examples/demo_stablelm.cpp
@@ -11,7 +11,7 @@ int main(int argc, char **argv) {
     cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/stablelm_vocab.mllm");
     cmdParser.add<string>("merge", 'e', "specify mllm merge path", false, "../vocab/stablelm_merges.txt");
     cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/stablelm-2-1.6b-chat-q4_k.mllm");
-    cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
+    cmdParser.add<int>("limits", 'l', "max KV cache size", false, 600);
     cmdParser.add<int>("thread", 't', "num of threads", false, 4);
     cmdParser.parse_check(argc, argv);
 
16 changes: 12 additions & 4 deletions examples/demo_vit.cpp
@@ -1,4 +1,5 @@
 #include <iostream>
+#include <vector>
 #include "cmdline.h"
 #include "models/vit/modeling_vit.hpp"
 #include "models/vit/labels_vit.hpp"
@@ -21,8 +22,15 @@ int main(int argc, char **argv) {
     auto model = ViTModel(config);
     model.load(model_path);
 
-    auto input_tensor = processor.process("../assets/cat.jpg", 224);
-    auto result = model({input_tensor});
-    auto token_idx = processor.postProcess(result[0]);
-    std::cout << imagenet_id2label[token_idx] << std::endl;
+    vector<string> imgs = {"../assets/cat.jpg",
+                           "../assets/dog_image.jpg",
+                           "../assets/bird_image.jpg",
+                           "../assets/car_image.jpg",
+                           "../assets/bus.png"};
+    for (auto &img : imgs) {
+        auto input_tensor = processor.process(img, 224);
+        auto result = model({input_tensor});
+        auto token_idx = processor.postProcess(result[0]);
+        std::cout << imagenet_id2label[token_idx] << std::endl;
+    }
 }
2 changes: 1 addition & 1 deletion examples/demo_yi.cpp
@@ -20,7 +20,7 @@ int main(int argc, char **argv) {
     cmdline::parser cmdParser;
     cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/yi_vocab.mllm");
     cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/yi-1.5-6b-chat-q4_k.mllm");
-    cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
+    cmdParser.add<int>("limits", 'l', "max KV cache size", false, 600);
     cmdParser.add<int>("thread", 't', "num of threads", false, 4);
     cmdParser.parse_check(argc, argv);
 
24 changes: 24 additions & 0 deletions src/Layer.hpp
@@ -117,6 +117,7 @@ class Layer {
             module = Module::llm_model_ptr;
         }
         map<string, shared_ptr<Tensor>> &activation_tensors = module->activation_tensors;
+        auto &activation_tensors_num = module->activation_tensors_num;
         Module::runlistIdx = saved_list_idx;
         bool do_init = false;
         // set backend to current module device and try to create op
@@ -182,6 +183,7 @@ class Layer {
                 activation_tensors[next_name] = std::make_shared<Tensor>(backend_);
                 activation_tensors[next_name]->setName(next_name);
                 activation_tensors[next_name]->setModule(module);
+                activation_tensors_num[next_name] = 0;
             }
         }
         if (module->doLoad) {
@@ -237,6 +239,28 @@ class Layer {
                 break;
             }
         }
+        if (Backend::global_backends.size() == 1) {
+            for (auto input_tensor : input_tensors) {
+                if ((activation_tensors_num.find(input_tensor->name()) != activation_tensors_num.end())) {
+                    switch (Tensor::tensor_status) {
+                    case TENSOR_STATIC_INIT: {
+                        activation_tensors_num[input_tensor->name()] += 1;
+                        break;
+                    }
+                    case TENSOR_STATIC_READY: {
+                        activation_tensors_num[input_tensor->name()] -= 1;
+                        break;
+                    }
+                    default: {
+                    }
+                    }
+                    if (activation_tensors_num[input_tensor->name()] == 0 && activation_tensors[input_tensor->name()]->sequence() > 1) {
+                        activation_tensors[input_tensor->name()]->dealloc();
+                        // std::cout << input_tensor->name() << "|" << std::endl;
+                    }
+                }
+            }
+        }
 #ifdef DEBUGOPTIME
         if (Tensor::tensor_status == TENSOR_STATIC_READY) {
             auto end_t = mllm_time_us();
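The block added above amounts to a per-tensor reference count, and it only runs when a single backend (the CPU backend) is registered. During the TENSOR_STATIC_INIT pass each op that consumes an activation tensor increments the tensor's counter; during the TENSOR_STATIC_READY pass each consumer decrements it; when the counter returns to zero and the tensor's sequence() is greater than 1, its buffer is released with dealloc(). A minimal self-contained sketch of the same idea, using plain standard-library types rather than mllm's Tensor/Backend classes (Buffer, Phase and the consumer count below are illustrative stand-ins, not part of the mllm API):

    #include <iostream>
    #include <map>
    #include <memory>
    #include <string>
    #include <vector>

    // Hypothetical stand-ins for an activation tensor's storage and the two
    // execution phases; the real classes live in mllm's Tensor/Backend code.
    struct Buffer {
        std::vector<float> data;
        void dealloc() { data.clear(); data.shrink_to_fit(); } // drop the payload, keep the object
    };

    enum Phase { STATIC_INIT, STATIC_READY };

    int main() {
        std::map<std::string, std::shared_ptr<Buffer>> activation_tensors;
        std::map<std::string, int> activation_tensors_num; // per-tensor consumer count

        activation_tensors["x"] = std::make_shared<Buffer>();
        activation_tensors["x"]->data.resize(1024);
        activation_tensors_num["x"] = 0;

        const int consumers = 2; // two downstream ops read "x"
        for (Phase phase : {STATIC_INIT, STATIC_READY}) {
            for (int op = 0; op < consumers; ++op) {
                if (phase == STATIC_INIT) {
                    activation_tensors_num["x"] += 1; // setup pass: count each consumer
                } else {
                    activation_tensors_num["x"] -= 1; // execution pass: this consumer is done with "x"
                }
                if (activation_tensors_num["x"] == 0 && phase == STATIC_READY) {
                    activation_tensors["x"]->dealloc(); // last consumer ran, buffer can go
                    std::cout << "freed x\n";
                }
            }
        }
        return 0;
    }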
20 changes: 10 additions & 10 deletions src/Module.cpp
@@ -25,33 +25,33 @@ std::unordered_map<string, shared_ptr<Op>> Module::tensor_func_ops;
 vector<double> Module::profiling(string name) {
     vector<double> output;
     // printf("\n");
-    MLLM_LOG_INFO_STREAM << "===========================================" << std::endl;
+    std::cout << "===========================================" << std::endl;
     if (!name.empty()) {
-        MLLM_LOG_INFO_STREAM << " " << name << std::endl;
-        MLLM_LOG_INFO_STREAM << "-------------------------------------------" << std::endl;
+        std::cout << " " << name << std::endl;
+        std::cout << "-------------------------------------------" << std::endl;
     }
     double load_time_s = load_time_ / 1000.0F;
-    MLLM_LOG_INFO_STREAM << " Load time: " << load_time_ / 1000.0F << " s" << std::endl;
+    std::cout << " Load time: " << load_time_ / 1000.0F << " s" << std::endl;
     if (inference_times_.size() > 1 && decoding_token_size_ != prefilling_token_size_) {
         double prefile_speed = 1000 * prefilling_token_size_ / inference_times_[0];
-        MLLM_LOG_INFO_STREAM << " Prefilling speed: " << prefile_speed << " tokens/s" << std::endl;
+        std::cout << " Prefilling speed: " << prefile_speed << " tokens/s" << std::endl;
         double sum_decoding_time = std::accumulate(std::begin(inference_times_) + 1, std::end(inference_times_), 0.0);
         double mean_decoding_time = sum_decoding_time / (inference_times_.size() - 1);
         double decoding_speed = 1000 / mean_decoding_time;
-        MLLM_LOG_INFO_STREAM << " Decoding speed: " << decoding_speed << " tokens/s" << std::endl;
+        std::cout << " Decoding speed: " << decoding_speed << " tokens/s" << std::endl;
         output = {load_time_s, prefile_speed, decoding_speed};
     } else {
         double sum_time = std::accumulate(std::begin(inference_times_), std::end(inference_times_), 0.0);
         double mean_time = sum_time / (inference_times_.size());
         double inference_time_s = mean_time / 1000.0F;
-        MLLM_LOG_INFO_STREAM << " Inference latency: " << mean_time / 1000.0F << " s" << std::endl;
+        std::cout << " Inference latency: " << mean_time / 1000.0F << " s" << std::endl;
         output = {load_time_s, inference_time_s};
     }
     // double sum_time = std::accumulate(std::begin(inference_times_), std::end(inference_times_), 0.0);
-    // MLLM_LOG_INFO_STREAM<<sum_time<< " - "<<Tensor::forward_times<<" = "<<sum_time-Tensor::forward_times<<std::endl;
-    // MLLM_LOG_INFO_STREAM<<Tensor::forward_times<< " - "<<Tensor::forward_times_2<<" = "<<Tensor::forward_times-Tensor::forward_times_2<<std::endl;
+    // std::cout<<sum_time<< " - "<<Tensor::forward_times<<" = "<<sum_time-Tensor::forward_times<<std::endl;
+    // std::cout<<Tensor::forward_times<< " - "<<Tensor::forward_times_2<<" = "<<Tensor::forward_times-Tensor::forward_times_2<<std::endl;
 
-    MLLM_LOG_INFO_STREAM << "===========================================" << std::endl;
+    std::cout << "===========================================" << std::endl;
 
     prefilling_token_size_ = 0;
     decoding_token_size_ = 0;
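For reference, the arithmetic this function reports: inference_times_ holds per-call latencies in milliseconds, so prefilling speed is 1000 * prefilling_token_size_ / inference_times_[0] tokens/s, and decoding speed is 1000 divided by the mean of the remaining entries, since each decode call produces one token. A small standalone sketch of the same computation with made-up timings (the numbers are illustrative only):

    #include <iostream>
    #include <numeric>
    #include <vector>

    int main() {
        // Hypothetical measurements: the first entry is the prefill call, the
        // rest are single-token decode steps, all in milliseconds.
        std::vector<double> inference_times_ms = {180.0, 42.0, 40.0, 38.0, 40.0};
        int prefilling_token_size = 64;

        double prefill_speed = 1000.0 * prefilling_token_size / inference_times_ms[0];
        double sum_decode = std::accumulate(inference_times_ms.begin() + 1, inference_times_ms.end(), 0.0);
        double mean_decode = sum_decode / (inference_times_ms.size() - 1);
        double decode_speed = 1000.0 / mean_decode; // one token per decode call

        std::cout << "Prefilling speed: " << prefill_speed << " tokens/s\n"; // ~355.6
        std::cout << "Decoding speed: " << decode_speed << " tokens/s\n";    // 25
    }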
22 changes: 5 additions & 17 deletions src/Module.hpp
@@ -36,8 +36,10 @@ class Module {
 
 public:
     map<string, shared_ptr<Tensor>> activation_tensors;
+    map<string, int> activation_tensors_num;
     AbstructLoader *loader;
     bool doLoad = false;
+    bool op_transposed_flag = false;
 
     static Module *llm_model_ptr;
     // tag to indicate the multi-chunk prefilling
@@ -183,33 +185,19 @@ class Module {
         } else if (decoding_token_size_ == 0) {
             decoding_token_size_ = inputs[0].sequence();
         }
-        bool need_setup = true;
         for (int i = 0; i < inputs.size(); i++) {
             auto &input = inputs[i];
             input.setName("input" + std::to_string(i));
             input.setTtype(TensorType::NORMAL_TENSOR);
             activation_tensors[input.name()] = std::shared_ptr<Tensor>(&input, [](Tensor *) {});
             activation_tensors[input.name()]->setName(input.name());
             activation_tensors[input.name()]->setModule(this);
-            llm_model_ptr = this;
-            if (inputs[0].sequence() != 1 && !last_shape_bshd_.empty()) {
-                // if LLM/VLLM model, the `need_setup` should be `true`
-                if (input.batch() == last_shape_bshd_[i][0] & input.sequence() == last_shape_bshd_[i][1] & input.head() == last_shape_bshd_[i][2] & input.dimension() == last_shape_bshd_[i][3]) {
-                    // if it is the QNN multi-chunk prefilling, the `need_setup` should be `true` to reshape & setUp CPU Ops
-                    if (Module::isMultiChunkPrefilling) {
-                        need_setup = true;
-                        break;
-                    }
-                    need_setup = false;
-                }
-            }
         }
+        llm_model_ptr = this;
         Tensor::tensor_status = TENSOR_STATIC_INIT;
 
         uint64_t time_start = mllm_time_us();
-        if (need_setup) {
-            Forward(inputs, anyArgs);
-        }
+        Forward(inputs, anyArgs);
         Tensor::tensor_status = TENSOR_STATIC_READY;
         // uint64_t time_start = mllm_time_us();
         auto output = Forward(inputs, anyArgs);
@@ -222,7 +210,7 @@ class Module {
                 last_shape_bshd_.push_back({input.batch(), input.sequence(),
                                             input.head(), input.dimension()});
             }
-
+            llm_model_ptr->op_transposed_flag = true;
             return output;
         } else { // inner Modules
             // offload according to the backends' info inited during loading
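With need_setup removed, the top-level call now always makes two passes over Forward: an INIT pass (Tensor::tensor_status = TENSOR_STATIC_INIT) in which ops are reshaped and set up and, via the Layer.hpp change above, the consumers of each activation tensor are counted, followed by a READY pass (TENSOR_STATIC_READY) that executes the ops and lets the last consumer free its inputs. A toy sketch of that two-pass control flow (ToyModule and its members are illustrative stand-ins, not mllm types):

    #include <iostream>
    #include <vector>

    // Toy two-pass driver mirroring the flow of Module::operator() after this
    // change: every call runs an INIT pass (setup / consumer counting) and then
    // a READY pass (execution).
    enum TensorStatus { TENSOR_STATIC_INIT, TENSOR_STATIC_READY };

    struct ToyModule {
        TensorStatus tensor_status = TENSOR_STATIC_INIT;

        std::vector<float> Forward(const std::vector<float> &inputs) {
            if (tensor_status == TENSOR_STATIC_INIT) {
                std::cout << "INIT pass: reshape/setUp ops, count activation consumers\n";
                return {};
            }
            std::cout << "READY pass: execute ops, free activations after last use\n";
            std::vector<float> out;
            for (float v : inputs) out.push_back(v * 2.0f);
            return out;
        }

        std::vector<float> operator()(const std::vector<float> &inputs) {
            tensor_status = TENSOR_STATIC_INIT;
            Forward(inputs);              // setup pass, now unconditional (need_setup is gone)
            tensor_status = TENSOR_STATIC_READY;
            return Forward(inputs);       // execution pass produces the output
        }
    };

    int main() {
        ToyModule m;
        auto out = m({1.0f, 2.0f, 3.0f});
        std::cout << "output size: " << out.size() << "\n";
    }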