Skip to content

Commit

Permalink
Merge pull request #92 from yirongjie/main
Browse files Browse the repository at this point in the history
feat: Add new demo: demo_imagebind_1mod
  • Loading branch information
yirongjie authored Jul 16, 2024
2 parents ec3360d + 4800360 commit be80c5c
Show file tree
Hide file tree
Showing 6 changed files with 142 additions and 12 deletions.
14 changes: 14 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,20 @@ else ()
target_link_libraries(demo_imagebind MLLM_CPU)
endif ()

# Executable target for the demo_imagebind_1mod example
# (examples/demo_imagebind_1mod.cpp). Besides the example source it compiles
# the source lists held in the DIR_SRC_* variables, the tokenizer / BPE
# sources, the processor sources and the third-party audio sources directly
# into the executable. ${DIR_SRC_QUANT} is intentionally left commented out.
# NOTE(review): DIR_SRC_PROCESSOE is presumably defined under that exact
# spelling elsewhere in this file — confirm before renaming it.
add_executable(demo_imagebind_1mod ${PROJECT_SOURCE_DIR}/examples/demo_imagebind_1mod.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC} # ${DIR_SRC_QUANT}
src/tokenizers/Tokenizer.cpp
src/tokenizers/BPE/Bpe.cpp
${DIR_SRC_PROCESSOE}
${DIR_THIRDPARTY_AUDIO}
src/processor/PreProcess.cpp
)
# When ARM is set and APK is not, compile with -fopenmp and link MLLM_CPU
# together with -fopenmp and -static-openmp; in every other configuration
# only MLLM_CPU is linked.
if (ARM AND NOT APK)
target_compile_options(demo_imagebind_1mod PRIVATE -fopenmp)
target_link_libraries(demo_imagebind_1mod PUBLIC MLLM_CPU -fopenmp -static-openmp)
else ()
target_link_libraries(demo_imagebind_1mod MLLM_CPU)
endif ()

add_executable(demo_tinyllama ${PROJECT_SOURCE_DIR}/examples/demo_tinyllama.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC} # ${DIR_SRC_QUANT}
src/tokenizers/Tokenizer.cpp
src/tokenizers/Tokenizer.hpp
Expand Down
62 changes: 62 additions & 0 deletions examples/demo_imagebind_1mod.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
//
// Created by Rongjie Yi on 24-7-15.
//
#include "cmdline.h"
#include "models/imagebind/modeling_imagebind.hpp"
#include "models/imagebind/processing_imagebind.hpp"

using namespace mllm;

int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/clip_vocab.mllm");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/imagebind_huge-q4_k.mllm");
cmdParser.add<string>("merges", 'f', "specify mllm tokenizer merges.txt path", false, "../vocab/clip_merges.txt");
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);

string vocab_path = cmdParser.get<string>("vocab");
string model_path = cmdParser.get<string>("model");
string merges_path = cmdParser.get<string>("merges");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto processor = ImagebindProcessor(vocab_path, merges_path);

ImagebindConfig config("huge");

int loop_times = 10;

// auto input_tensors = processor.process(
// {"a dog.", "A car", "A bird"},config.max_position_embeddings,
// {"../assets/dog_image.jpg", "../assets/car_image.jpg", "../assets/bird_image.jpg"}, config.img_hw,
// {"../assets/dog_audio.wav", "../assets/car_audio.wav", "../assets/bird_audio.wav"});

auto input_tensors = processor.process(
{"a dog."},config.max_position_embeddings,
{"../assets/dog_image.jpg"}, config.img_hw,
{"../assets/dog_audio.wav"});

std::cout<<"Text| input_shape:["<<input_tensors.text_tensors.batch()<<", "<<input_tensors.text_tensors.sequence()<<", "<<input_tensors.text_tensors.head()<<", "<<input_tensors.text_tensors.dimension()<<"]"<<std::endl;
auto text_model = ImagebindTextModel(config);
text_model.load(model_path);
for (int step = 0; step < loop_times; step++) {
auto result = text_model({input_tensors.text_tensors}, input_tensors.in_len);
}
text_model.profiling();

std::cout<<"Vision| input_shape:["<<input_tensors.img_tensors.batch()<<", "<<input_tensors.img_tensors.channel()<<", "<<input_tensors.img_tensors.time()<<", "<<input_tensors.img_tensors.height()<<", "<<input_tensors.img_tensors.width()<<"]"<<std::endl;
auto vision_model = ImagebindVisionModel(config);
vision_model.load(model_path);
for (int step = 0; step < loop_times; step++) {
auto result = vision_model({input_tensors.img_tensors});
}
vision_model.profiling();

std::cout<<"Audio| input_shape:["<<input_tensors.audio_tensors.batch()<<", "<<input_tensors.audio_tensors.sequence()<<", "<<input_tensors.audio_tensors.head()<<", "<<input_tensors.audio_tensors.dimension()<<"]"<<std::endl;
auto audio_model = ImagebindAudioModel(config);
audio_model.load(model_path);
for (int step = 0; step < loop_times; step++) {
auto result = audio_model({input_tensors.audio_tensors});
}
audio_model.profiling();
}
58 changes: 48 additions & 10 deletions src/Module.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ class Module {
private:
double load_time_;
int prefilling_token_size_=0;
int decoding_token_size_=0;
vector<double> inference_times_;
vector<vector<int>> last_shape_bshd_;

public:
static map<BackendType, Backend *> backends;
Expand Down Expand Up @@ -56,6 +58,9 @@ class Module {
}

void load(string path) {
Tensor::gph_.clear();
Module::tensor_status = TENSOR_STATIC_INIT;

mllm_time_init();
initLoader(path);
Module::doLoad = true;
Expand All @@ -75,6 +80,9 @@ class Module {
}

void load(AbstructLoader &param_loader) {
Tensor::gph_.clear();
Module::tensor_status = TENSOR_STATIC_INIT;

loader = &param_loader;
Module::doLoad = true;
vector<Tensor> tmps;
Expand Down Expand Up @@ -102,28 +110,52 @@ class Module {
return Forward(inputs, anyArgs);
}
if (inputs[0].ttype() == TensorType::INPUT_TENSOR) {
for (auto &input : inputs) {
if(prefilling_token_size_==0){ // first time init
if(!Tensor::gph_.empty()){
Tensor::gph_.clear();
}
prefilling_token_size_ = inputs[0].sequence();
}else if(decoding_token_size_==0){
decoding_token_size_ = inputs[0].sequence();
}
bool need_setup = true;
for (int i = 0; i < inputs.size(); i++) {
auto &input = inputs[i];
input.setTtype(TensorType::NORMAL_TENSOR);
input.status() = TENSOR_STATIC_INIT;
if(input.batch() == 0){
Tensor::gph_[input.name()] = input;
}
if(input.sequence()!=1 && !last_shape_bshd_.empty()){
// if LLM/VLLM model, the `need_setup` should be `true`
if(input.batch() == last_shape_bshd_[i][0] &
input.sequence() == last_shape_bshd_[i][1] &
input.head() == last_shape_bshd_[i][2] &
input.dimension() == last_shape_bshd_[i][3]){
need_setup = false;
}
}
}
tensor_status = TENSOR_STATIC_INIT;

uint64_t time_start = mllm_time_us();
Forward(inputs, anyArgs);
if(need_setup){
Forward(inputs, anyArgs);
}
for (auto &input : inputs) {
input.status() = TENSOR_STATIC_READY;
}
tensor_status = TENSOR_STATIC_READY;
auto output = Forward(inputs, anyArgs);
uint64_t time_end = mllm_time_us();
if(prefilling_token_size_==0){
prefilling_token_size_ = inputs[0].sequence();
}

double inference_time_ = (time_end - time_start) / 1000.0F;//ms
inference_times_.push_back(inference_time_);
last_shape_bshd_.clear();
for (auto &input : inputs) {
last_shape_bshd_.push_back({input.batch(), input.sequence(),
input.head(), input.dimension()});
}

return output;
} else {
Expand Down Expand Up @@ -172,17 +204,23 @@ class Module {
return modules;
}

void profiling() {
printf("\n");
void profiling(string name = "") {
// printf("\n");
std::cout << "===========================================" << std::endl;
if (name != "") {
std::cout << " " << name << std::endl;
std::cout << "-------------------------------------------" << std::endl;
}
std::cout << " Load time: " << load_time_/1000.0F << " s" << std::endl;
if(prefilling_token_size_){
if(inference_times_.size()>1 && decoding_token_size_ != prefilling_token_size_){
std::cout << " Prefilling speed: " << 1000 * prefilling_token_size_ / inference_times_[0] << " tokens/s" << std::endl;
}
if(inference_times_.size()>1){
double sum_decoding_time = std::accumulate(std::begin(inference_times_)+1, std::end(inference_times_), 0.0);
double mean_decoding_time = sum_decoding_time / (inference_times_.size()-1);
std::cout << " Decoding speed: " << 1000 / mean_decoding_time << " tokens/s" << std::endl;
} else{
double sum_time = std::accumulate(std::begin(inference_times_), std::end(inference_times_), 0.0);
double mean_time = sum_time / (inference_times_.size());
std::cout << " Inference latency: " << mean_time/1000.0F << " s" << std::endl;
}
std::cout << "===========================================" << std::endl;
}
Expand Down
6 changes: 4 additions & 2 deletions src/ParamLoader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ ParamLoader::ParamLoader(std::string filename, bool use_mmap) :
// #endif

if (this->fp_ == nullptr) {
std::cout << "param open file failed" << std::endl;
// std::cout << "param open file failed" << std::endl;
return;
int errorCode = errno;
char *errorMsg = strerror(errorCode);
Expand Down Expand Up @@ -123,7 +123,9 @@ std::tuple<uint8_t *, uint64_t> ParamLoader::load(string name) {
}
DataType ParamLoader::getDataType(string name) {
if (data_type_.count(name) != 1) {
std::cerr<<name<<" not found"<<std::endl;
if (this->fp_ != nullptr) {
std::cerr<<name<<" not found"<<std::endl;
}
return DataType::MLLM_TYPE_COUNT;
}
int type = data_type_[name];
Expand Down
2 changes: 2 additions & 0 deletions src/Tensor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,14 @@ class Tensor {
Tensor(Backend *bn) :
backend_(bn), host_ptr_(), capacity_(0), dtype_(MLLM_TYPE_F32) {
}
/*
~Tensor() {
if (host_ptr_ != nullptr && masterTensor() == nullptr && !aggregated_&& gph_.find(name_) == gph_.end()) {
backend_->free(host_ptr_);
host_ptr_ = nullptr;
}
}
*/
static map<string, Tensor> gph_;
std::map<Chl, int>& chls() {
return chls_;
Expand Down
12 changes: 12 additions & 0 deletions src/models/imagebind/modeling_imagebind.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ class ImagebindVisionModel final : public Module {

public:
ImagebindVisionModel() = default;
// Convenience constructor: pulls the fields the vision model needs out of an
// ImagebindConfig and delegates to the constructor that takes them one by one.
ImagebindVisionModel(const ImagebindConfig &config) :
    ImagebindVisionModel(
        config.vision_hidden_dim,
        config.vision_head_size,
        config.vision_ffn_hidden,
        config.head_hidden_dim,
        config.patch,
        config.patch_time,
        config.img_hw,
        config.vision_block_num,
        config.names_config) {
}
ImagebindVisionModel(int hidden_dim, int head_size, int ffn_hidden, int head_hidden_dim,
int patch, int patch_time, int img_hw, int block_num,
const ImagebindNameConfig &names) {
Expand Down Expand Up @@ -128,6 +132,10 @@ class ImagebindTextModel final : public Module {

public:
ImagebindTextModel() = default;
// Convenience constructor: pulls the fields the text model needs out of an
// ImagebindConfig and delegates to the constructor that takes them one by one.
ImagebindTextModel(const ImagebindConfig &config) :
    ImagebindTextModel(
        config.text_hidden_dim,
        config.text_head_size,
        config.text_ffn_hidden,
        config.head_hidden_dim,
        config.vocab_size,
        config.max_position_embeddings,
        config.text_block_num,
        config.names_config) {
}
ImagebindTextModel(int hidden_dim, int head_size, int ffn_hidden, int head_hidden_dim,
int vocab_size, int max_position_embeddings, int block_num,
const ImagebindNameConfig &names) {
Expand Down Expand Up @@ -186,6 +194,10 @@ class ImagebindAudioModel final : public Module {

public:
ImagebindAudioModel() = default;
// Convenience constructor: pulls the fields the audio model needs out of an
// ImagebindConfig and delegates to the constructor that takes them one by one.
ImagebindAudioModel(const ImagebindConfig &config) :
    ImagebindAudioModel(
        config.audio_hidden_dim,
        config.audio_head_size,
        config.audio_ffn_hidden,
        config.head_hidden_dim,
        config.audio_kernal,
        config.audio_stride,
        config.audio_h,
        config.audio_w,
        config.audio_block_num,
        config.names_config) {
}
ImagebindAudioModel(int hidden_dim, int head_size, int ffn_hidden, int head_hidden_dim,
int patch, int stride, int img_h, int img_w, int block_num,
const ImagebindNameConfig &names) {
Expand Down

0 comments on commit be80c5c

Please sign in to comment.