Merge branch 'main' into main
chenghuaWang authored Nov 22, 2024
2 parents e3302ee + b9772d0 commit ff0b4c5
Showing 5 changed files with 94 additions and 30 deletions.
17 changes: 13 additions & 4 deletions README.md
@@ -1,5 +1,5 @@
<h1 align="center">
MLLM
mllm
</h1>

<h3 align="center">
@@ -14,6 +14,7 @@ fast and lightweight <ins>multimodal LLM</ins> inference engine for mobile and e

[![Website](https://img.shields.io/badge/website-visit-green)](https://ubiquitouslearning.github.io/mllm_website/)
[![Documentation](https://img.shields.io/badge/view-docs-blue)](https://ubiquitouslearning.github.io/mllm_website/introduction/getstarted/)
[![Android App](https://img.shields.io/badge/android-app-pink)](https://github.com/lx200916/ChatBotApp/)
[![Actions Status](https://github.com/UbiquitousLearning/mllm/workflows/Tests/badge.svg)](https://github.com/UbiquitousLearning/mllm/actions)
</h4>

@@ -55,14 +56,22 @@ Wait.. why on-device multimodal LLM? - It's a key building block for [intelligen

<table>
<tr>
<td>Chatting</td>
<!-- <td>Chatting</td> -->
<td>Android Intent Invocation</td>
<td>Image Understanding</td>
</tr>
<tr>
<td> <video src="https://github.com/user-attachments/assets/972b3bad-d659-4d76-9141-64ad0ad34d64"> </td>
<!-- <td> <video src="https://github.com/user-attachments/assets/972b3bad-d659-4d76-9141-64ad0ad34d64"> </td> -->
<td> <video src="https://github.com/user-attachments/assets/deb99f8d-9727-4519-9ca7-c39deb7c5b47"> </td>
<td> <video src="https://github.com/user-attachments/assets/55321a43-8484-4f74-b7b2-d4495f3626d9"> </td>
<td> <video src="https://github.com/user-attachments/assets/55321a43-8484-4f74-b7b2-d4495f3626d9"> </td>
</tr>
<tr>
<td>Chat CPU</td>
<td>Chat NPU</td>
</tr>
<tr>
<td> <video src="https://github.com/user-attachments/assets/2b0ab0d6-6727-4b85-9ee3-b39d23de5dde"> </td>
<td> <video src="https://github.com/user-attachments/assets/395f8e6e-2ab9-40bc-bf26-164ba5695c64"> </td>
</tr>
</table>

83 changes: 68 additions & 15 deletions examples/demo_qwen_npu.cpp
@@ -23,6 +23,7 @@ int main(int argc, char **argv) {
string model_path = cmdParser.get<string>("model");
string model_billion = cmdParser.get<string>("billion");
int tokens_limit = cmdParser.get<int>("limits");
const int chunk_size = 64;
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto tokenizer = QWenTokenizer(vocab_path, merge_path);
@@ -32,34 +33,81 @@ int main(int argc, char **argv) {
auto decoding_model = QWenForCausalLM(config);
decoding_model.load("../models/qwen-1.5-1.8b-chat-q4k.mllm");

// warmup START
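// The warm-up runs a single dummy chunk with max_new_tokens = 1 so the prefill
// model executes once; the CPU backend's sequence length, execution type, and
// stage-switching state are then reset before the real prompt is handled.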
std::string input_str = " ";
auto [real_seq_length, input_tensor] = tokenizer.tokenizePaddingByChunk(input_str, chunk_size, config.vocab_size);
LlmTextGeneratorOpts opt{
.max_new_tokens = 1,
.do_sample = false,
.is_padding = true,
.seq_before_padding = real_seq_length,
.chunk_size = chunk_size,
};
model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
auto out_string = tokenizer.detokenize({out_token});
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
return true;
});
Module::isFirstChunk = false;
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(0);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
// turn on the multi-chunk prefilling
Module::isMultiChunkPrefilling = true;
// warmup END
std::cout << "Warmup finished." << std::endl;

vector<string> in_strs = {
" Give me a short introduction to large language model.",
};
"\"Large Language Models (LLMs) are advanced artificial intelligence systems designed to understand and generate human-like text. These models are trained on vast amounts of data, enabling them to perform a wide range of tasks, from answering questions and summarizing text to generating creative content and engaging in conversational dialogue. LLMs like GPT-3 and GPT-4, developed by OpenAI, have set new benchmarks in natural language processing by leveraging deep learning architectures, particularly transformer models, which excel at capturing context and relationships within text. The scalability and versatility of LLMs make them invaluable tools for applications in education, customer service, content creation, and more. However, their deployment also raises ethical considerations, including issues of bias, misinformation, and the potential for misuse. As the field continues to evolve, ongoing research and responsible deployment strategies are essential to harnessing the full potential of these powerful AI systems while mitigating their risks.\"\nGenerate a title based on the above text."};

for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
auto [real_seq_length, input_tensor] = tokenizer.tokenizeWithPadding(input_str, 64, config.vocab_size);
auto [real_seq_length, input_tensor] = tokenizer.tokenizePaddingByChunk(input_str, chunk_size, config.vocab_size);
const int seq_length_padding = (chunk_size - real_seq_length % chunk_size) + real_seq_length;
const int chunk_num = seq_length_padding / chunk_size;
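// e.g. (hypothetical numbers) with chunk_size = 64 and real_seq_length = 70:
// seq_length_padding = (64 - 70 % 64) + 70 = 128 and chunk_num = 2, i.e. the
// prompt is padded up to the next multiple of chunk_size and prefilled chunk by chunk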

std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;

LlmTextGeneratorOpts opt{
.max_new_tokens = 1,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
.top_p = 0.f,
.is_padding = true,
.seq_before_padding = real_seq_length,
.chunk_size = chunk_size,
};
model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
auto out_string = tokenizer.detokenize({out_token});
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
std::cout << output_string << std::flush;
return true;
});

// vector holding the chunked tensors that form the QNN prefilling input
bool isSwitched = false;
vector<Tensor> chunked_tensors(chunk_num);
for (int chunk_id = 0; chunk_id < chunk_num; ++chunk_id) {
chunked_tensors[chunk_id].setBackend(Backend::global_backends[MLLM_CPU]);
chunked_tensors[chunk_id].setTtype(INPUT_TENSOR);
chunked_tensors[chunk_id].reshape(1, 1, chunk_size, 1);
chunked_tensors[chunk_id].setName("input-chunk-" + to_string(chunk_id));
chunked_tensors[chunk_id].deepCopyFrom(&input_tensor, false, {0, 0, chunk_id * chunk_size, 0});
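// each chunk maps to the chunk_size-long region of the padded input that starts
// at sequence offset chunk_id * chunk_size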

model.generate(chunked_tensors[chunk_id], opt, [&](unsigned int out_token) -> bool {
if (!isSwitched && chunk_id == 0 && static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->isStageSwitching()) {
// turn off stage switching at the first chunk of subsequent inputs
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
if (chunk_id == chunk_num - 1) { // print the output of the last chunk
std::cout << output_string << std::flush;
}
return true;
});
Module::isFirstChunk = false;
}

static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(AUTOREGRESSIVE);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();

LlmTextGeneratorOpts decoding_opt{
@@ -70,8 +118,8 @@ int main(int argc, char **argv) {
.top_p = 0.f,
.is_padding = false,
};
bool isSwitched = false;
decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool {
isSwitched = false;
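// decoding runs on the CPU model and is fed the last prefill chunk; the callback
// calls toggleSwitching() only once, on the first generated token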
decoding_model.generate(chunked_tensors.back(), decoding_opt, [&](unsigned int out_token) -> bool {
// call switchDecodeTag only once
if (!isSwitched) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
@@ -86,6 +134,11 @@ }
}
return true;
});
std::cout << "\n---------------" << std::endl;

// turn on switching, set sequence length and execution type
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(0);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
std::cout << "\n";
}
}
15 changes: 8 additions & 7 deletions tools/jni/LibHelper.cpp
@@ -66,7 +66,10 @@ bool LibHelper::setUp(const std::string &base_path, std::string weights_path, st
LOGI("Loading model from %s", weights_path.c_str());

switch (model) {
case QWEN:
case QWEN25:
qwconfig = QWenConfig(tokens_limit, "1.5B");
case QWEN15:
qwconfig = QWenConfig(tokens_limit, "1.8B");
tokenizer_ = make_shared<QWenTokenizer>(vocab_path, merge_path);
module_ = make_shared<QWenForCausalLM>(qwconfig);
#ifdef USE_QNN
@@ -78,7 +81,7 @@ bool LibHelper::setUp(const std::string &base_path, std::string weights_path, st
// warmup START
std::string input_str = " ";
int chunk_size = 64;
auto res = tokenizer->tokenizePaddingByChunk(input_str, chunk_size, 49152);
auto res = tokenizer->tokenizePaddingByChunk(input_str, chunk_size, 151936);
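// 151936 corresponds to the Qwen tokenizer vocabulary size used for padding
// (replacing the previous 49152)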
auto input_tensor = res.second;
auto real_seq_length = res.first;
LlmTextGeneratorOpts opt{
@@ -169,12 +172,12 @@ void LibHelper::run(std::string &input_str, uint8_t *image, unsigned max_step, u
unsigned max_new_tokens = 500;
LOGE("Running backend %d", backend_);

if (model_ == QWEN) {
if (model_ == QWEN15 || model_ == QWEN25) {
auto tokenizer = dynamic_pointer_cast<QWenTokenizer>(tokenizer_);
if (chat_template) input_str = tokenizer_->apply_chat_template(input_str);
if (backend_ == MLLMBackendType::QNN) {
int chunk_size = 64;
auto res = tokenizer->tokenizePaddingByChunk(input_str, chunk_size, 49152);
auto res = tokenizer->tokenizePaddingByChunk(input_str, chunk_size, 151936);
auto input_tensor = res.second;
max_new_tokens = tokens_limit - input_tensor.sequence();
auto real_seq_length = res.first;
@@ -198,8 +201,7 @@ void LibHelper::run(std::string &input_str, uint8_t *image, unsigned max_step, u
chunked_tensors[chunk_id].deepCopyFrom(&input_tensor, false, {0, 0, chunk_id * chunk_size, 0});

prefill_module_->generate(chunked_tensors[chunk_id], opt, [&](unsigned int out_token) -> bool {
// if (switch_flag && !isSwitched && chunk_id == 0) {
if (!isSwitched && chunk_id == 0) {
if (!isSwitched && chunk_id == 0 && static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->isStageSwitching()) {
// turn off stage switching at the first chunk of subsequent inputs
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
Expand All @@ -211,7 +213,6 @@ void LibHelper::run(std::string &input_str, uint8_t *image, unsigned max_step, u
output_string_ += output_string;
callback_(output_string_, !not_end);
}
if (!not_end) { return false; }
return true;
});
Module::isFirstChunk = false;
7 changes: 4 additions & 3 deletions tools/jni/LibHelper.hpp
@@ -22,10 +22,11 @@ class PreProcessor;
class Module;
class Tensor;
enum PreDefinedModel {
QWEN = 0,
QWEN25 = 0,
FUYU,
Bert,
PhoneLM
PhoneLM,
QWEN15
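// QWEN15 is appended after the existing entries, presumably so the values of
// QWEN25/FUYU/Bert/PhoneLM remain unchanged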
};

enum MLLMBackendType {
Expand All @@ -49,7 +50,7 @@ class LibHelper {

// Tokenizer *tokenizer_ = nullptr;
unsigned int eos_id_ = 2;
PreDefinedModel model_ = PreDefinedModel::QWEN;
PreDefinedModel model_ = PreDefinedModel::QWEN25;
MLLMBackendType backend_ = MLLMBackendType::CPU;
bool is_first_run_cond_ = true;
int tokens_limit = 4000;