Merge branch 'main' into main
chenghuaWang authored Nov 22, 2024
2 parents e3302ee + b9772d0 commit ff0b4c5
Showing 5 changed files with 94 additions and 30 deletions.
17 changes: 13 additions & 4 deletions README.md
@@ -1,5 +1,5 @@
<h1 align="center">
MLLM
mllm
</h1>

<h3 align="center">
@@ -14,6 +14,7 @@ fast and lightweight <ins>multimodal LLM</ins> inference engine for mobile and e

[![Website](https://img.shields.io/badge/website-visit-green)](https://ubiquitouslearning.github.io/mllm_website/)
[![Documentation](https://img.shields.io/badge/view-docs-blue)](https://ubiquitouslearning.github.io/mllm_website/introduction/getstarted/)
[![Android App](https://img.shields.io/badge/android-app-pink)](https://github.com/lx200916/ChatBotApp/)
[![Actions Status](https://github.com/UbiquitousLearning/mllm/workflows/Tests/badge.svg)](https://github.com/UbiquitousLearning/mllm/actions)
</h4>

@@ -55,14 +56,22 @@ Wait.. why on-device multimodal LLM? - It's a key building block for [intelligen

<table>
<tr>
<td>Chatting</td>
<!-- <td>Chatting</td> -->
<td>Android Intent Invocation</td>
<td>Image Understanding</td>
</tr>
<tr>
<td> <video src="https://github.com/user-attachments/assets/972b3bad-d659-4d76-9141-64ad0ad34d64"> </td>
<!-- <td> <video src="https://github.com/user-attachments/assets/972b3bad-d659-4d76-9141-64ad0ad34d64"> </td> -->
<td> <video src="https://github.com/user-attachments/assets/deb99f8d-9727-4519-9ca7-c39deb7c5b47"> </td>
<td> <video src="https://github.com/user-attachments/assets/55321a43-8484-4f74-b7b2-d4495f3626d9"> </td>
<td> <video src="https://github.com/user-attachments/assets/55321a43-8484-4f74-b7b2-d4495f3626d9"> </td>
</tr>
<tr>
<td>Chat CPU</td>
<td>Chat NPU</td>
</tr>
<tr>
<td> <video src="https://github.com/user-attachments/assets/2b0ab0d6-6727-4b85-9ee3-b39d23de5dde"> </td>
<td> <video src="https://github.com/user-attachments/assets/395f8e6e-2ab9-40bc-bf26-164ba5695c64"> </td>
</tr>
</table>

83 changes: 68 additions & 15 deletions examples/demo_qwen_npu.cpp
@@ -23,6 +23,7 @@ int main(int argc, char **argv) {
string model_path = cmdParser.get<string>("model");
string model_billion = cmdParser.get<string>("billion");
int tokens_limit = cmdParser.get<int>("limits");
const int chunk_size = 64;
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto tokenizer = QWenTokenizer(vocab_path, merge_path);
@@ -32,34 +33,81 @@ int main(int argc, char **argv) {
auto decoding_model = QWenForCausalLM(config);
decoding_model.load("../models/qwen-1.5-1.8b-chat-q4k.mllm");

// warmup START
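// The warm-up runs a single dummy chunk with max_new_tokens = 1 so the prefill
// model executes once; the CPU backend's sequence length, execution type, and
// stage-switching state are then reset before the real prompt is handled.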
std::string input_str = " ";
auto [real_seq_length, input_tensor] = tokenizer.tokenizePaddingByChunk(input_str, chunk_size, config.vocab_size);
LlmTextGeneratorOpts opt{
.max_new_tokens = 1,
.do_sample = false,
.is_padding = true,
.seq_before_padding = real_seq_length,
.chunk_size = chunk_size,
};
model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
auto out_string = tokenizer.detokenize({out_token});
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
return true;
});
Module::isFirstChunk = false;
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(0);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
// turn on the multi-chunk prefilling
Module::isMultiChunkPrefilling = true;
// warmup END
std::cout << "Warmup finished." << std::endl;

vector<string> in_strs = {
" Give me a short introduction to large language model.",
};
"\"Large Language Models (LLMs) are advanced artificial intelligence systems designed to understand and generate human-like text. These models are trained on vast amounts of data, enabling them to perform a wide range of tasks, from answering questions and summarizing text to generating creative content and engaging in conversational dialogue. LLMs like GPT-3 and GPT-4, developed by OpenAI, have set new benchmarks in natural language processing by leveraging deep learning architectures, particularly transformer models, which excel at capturing context and relationships within text. The scalability and versatility of LLMs make them invaluable tools for applications in education, customer service, content creation, and more. However, their deployment also raises ethical considerations, including issues of bias, misinformation, and the potential for misuse. As the field continues to evolve, ongoing research and responsible deployment strategies are essential to harnessing the full potential of these powerful AI systems while mitigating their risks.\"\nGenerate a title based on the above text."};

for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
auto [real_seq_length, input_tensor] = tokenizer.tokenizeWithPadding(input_str, 64, config.vocab_size);
auto [real_seq_length, input_tensor] = tokenizer.tokenizePaddingByChunk(input_str, chunk_size, config.vocab_size);
const int seq_length_padding = (chunk_size - real_seq_length % chunk_size) + real_seq_length;
const int chunk_num = seq_length_padding / chunk_size;
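// e.g. (hypothetical numbers) with chunk_size = 64 and real_seq_length = 70:
// seq_length_padding = (64 - 70 % 64) + 70 = 128 and chunk_num = 2, i.e. the
// prompt is padded up to the next multiple of chunk_size and prefilled chunk by chunk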

std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;

LlmTextGeneratorOpts opt{
.max_new_tokens = 1,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
.top_p = 0.f,
.is_padding = true,
.seq_before_padding = real_seq_length,
.chunk_size = chunk_size,
};
model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
auto out_string = tokenizer.detokenize({out_token});
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
std::cout << output_string << std::flush;
return true;
});

// vector holding the chunked tensors that form the QNN prefilling input
bool isSwitched = false;
vector<Tensor> chunked_tensors(chunk_num);
for (int chunk_id = 0; chunk_id < chunk_num; ++chunk_id) {
chunked_tensors[chunk_id].setBackend(Backend::global_backends[MLLM_CPU]);
chunked_tensors[chunk_id].setTtype(INPUT_TENSOR);
chunked_tensors[chunk_id].reshape(1, 1, chunk_size, 1);
chunked_tensors[chunk_id].setName("input-chunk-" + to_string(chunk_id));
chunked_tensors[chunk_id].deepCopyFrom(&input_tensor, false, {0, 0, chunk_id * chunk_size, 0});
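// each chunk maps to the chunk_size-long region of the padded input that starts
// at sequence offset chunk_id * chunk_size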

model.generate(chunked_tensors[chunk_id], opt, [&](unsigned int out_token) -> bool {
if (!isSwitched && chunk_id == 0 && static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->isStageSwitching()) {
// turn off stage switching at the first chunk of subsequent inputs
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
if (chunk_id == chunk_num - 1) { // print the output of the last chunk
std::cout << output_string << std::flush;
}
return true;
});
Module::isFirstChunk = false;
}

static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(AUTOREGRESSIVE);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();

LlmTextGeneratorOpts decoding_opt{
@@ -70,8 +118,8 @@ int main(int argc, char **argv) {
.top_p = 0.f,
.is_padding = false,
};
bool isSwitched = false;
decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool {
isSwitched = false;
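// decoding runs on the CPU model and is fed the last prefill chunk; the callback
// calls toggleSwitching() only once, on the first generated token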
decoding_model.generate(chunked_tensors.back(), decoding_opt, [&](unsigned int out_token) -> bool {
// call switchDecodeTag only once
if (!isSwitched) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
@@ -86,6 +134,11 @@ }
}
return true;
});
std::cout << "\n---------------" << std::endl;

// turn on switching, set sequence length and execution type
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(0);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
std::cout << "\n";
}
}
15 changes: 8 additions & 7 deletions tools/jni/LibHelper.cpp
@@ -66,7 +66,10 @@ bool LibHelper::setUp(const std::string &base_path, std::string weights_path, st
LOGI("Loading model from %s", weights_path.c_str());

switch (model) {
case QWEN:
case QWEN25:
qwconfig = QWenConfig(tokens_limit, "1.5B");
case QWEN15:
qwconfig = QWenConfig(tokens_limit, "1.8B");
tokenizer_ = make_shared<QWenTokenizer>(vocab_path, merge_path);
module_ = make_shared<QWenForCausalLM>(qwconfig);
#ifdef USE_QNN
@@ -78,7 +81,7 @@ bool LibHelper::setUp(const std::string &base_path, std::string weights_path, st
// warmup START
std::string input_str = " ";
int chunk_size = 64;
auto res = tokenizer->tokenizePaddingByChunk(input_str, chunk_size, 49152);
auto res = tokenizer->tokenizePaddingByChunk(input_str, chunk_size, 151936);
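// 151936 corresponds to the Qwen tokenizer vocabulary size used for padding
// (replacing the previous 49152)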
auto input_tensor = res.second;
auto real_seq_length = res.first;
LlmTextGeneratorOpts opt{
@@ -169,12 +172,12 @@ void LibHelper::run(std::string &input_str, uint8_t *image, unsigned max_step, u
unsigned max_new_tokens = 500;
LOGE("Running backend %d", backend_);

if (model_ == QWEN) {
if (model_ == QWEN15 || model_ == QWEN25) {
auto tokenizer = dynamic_pointer_cast<QWenTokenizer>(tokenizer_);
if (chat_template) input_str = tokenizer_->apply_chat_template(input_str);
if (backend_ == MLLMBackendType::QNN) {
int chunk_size = 64;
auto res = tokenizer->tokenizePaddingByChunk(input_str, chunk_size, 49152);
auto res = tokenizer->tokenizePaddingByChunk(input_str, chunk_size, 151936);
auto input_tensor = res.second;
max_new_tokens = tokens_limit - input_tensor.sequence();
auto real_seq_length = res.first;
@@ -198,8 +201,7 @@ void LibHelper::run(std::string &input_str, uint8_t *image, unsigned max_step, u
chunked_tensors[chunk_id].deepCopyFrom(&input_tensor, false, {0, 0, chunk_id * chunk_size, 0});

prefill_module_->generate(chunked_tensors[chunk_id], opt, [&](unsigned int out_token) -> bool {
// if (switch_flag && !isSwitched && chunk_id == 0) {
if (!isSwitched && chunk_id == 0) {
if (!isSwitched && chunk_id == 0 && static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->isStageSwitching()) {
// turn off stage switching at the first chunk of subsequent inputs
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
Expand All @@ -211,7 +213,6 @@ void LibHelper::run(std::string &input_str, uint8_t *image, unsigned max_step, u
output_string_ += output_string;
callback_(output_string_, !not_end);
}
if (!not_end) { return false; }
return true;
});
Module::isFirstChunk = false;
7 changes: 4 additions & 3 deletions tools/jni/LibHelper.hpp
@@ -22,10 +22,11 @@ class PreProcessor;
class Module;
class Tensor;
enum PreDefinedModel {
QWEN = 0,
QWEN25 = 0,
FUYU,
Bert,
PhoneLM
PhoneLM,
QWEN15
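// QWEN15 is appended after the existing entries, presumably so the values of
// QWEN25/FUYU/Bert/PhoneLM remain unchanged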
};

enum MLLMBackendType {
Expand All @@ -49,7 +50,7 @@ class LibHelper {

// Tokenizer *tokenizer_ = nullptr;
unsigned int eos_id_ = 2;
PreDefinedModel model_ = PreDefinedModel::QWEN;
PreDefinedModel model_ = PreDefinedModel::QWEN25;
MLLMBackendType backend_ = MLLMBackendType::CPU;
bool is_first_run_cond_ = true;
int tokens_limit = 4000;