UbiquitousLearning · yirongjie · Nov 12, 2024 · Nov 11, 2024 · Nov 11, 2024 · Nov 11, 2024
diff --git a/README.md b/README.md
@@ -59,17 +59,17 @@ Wait.. why on-device multimodal LLM? - It's a key building block for [intelligen
 
 <table>
     <tr>
-        <td>Demo of LLM chatting</td>
-        <td>Demo of image understanding</td>
-        <td>Demo of UI screen understanding</td>
+        <td>Chatting</td>
+        <td>Android Intent Invocation</td>
+        <td>Image Understanding</td>
     </tr>
     <tr>
-        <td> <video src="https://github.com/UbiquitousLearning/mllm/assets/38753457/7a1eb892-8259-41ff-8c97-b773d16fce7f"> </td>
-        <td> <video src="https://github.com/UbiquitousLearning/mllm/assets/38753457/32549658-5c74-4ce0-962f-6621c919faad"> </td>
-        <td>  <video src="https://github.com/UbiquitousLearning/mllm/assets/38753457/fe234f27-1393-4ee2-84ce-254cee91a27f"> </td>
+        <td>  <video src="https://github.com/user-attachments/assets/972b3bad-d659-4d76-9141-64ad0ad34d64"> </td>
+        <td>  <video src="https://github.com/user-attachments/assets/deb99f8d-9727-4519-9ca7-c39deb7c5b47"> </td>
+        <td>  <video src="https://github.com/user-attachments/assets/55321a43-8484-4f74-b7b2-d4495f3626d9">  </td>
     </tr>
 </table>
-            
+
 ## Support models
 
 [//]: # (* ✔️ : Support and test well on mobile devices.)
@@ -391,3 +391,15 @@ These component is clearly identified in their respective subdirectories along w
 For the full text of the Apache License 2.0, please refer to the [LICENSE-APACHE](third_party/wenet_audio/LICENSE) file
 located in the relevant subdirectories.
 
+## Citation
+```bibtex
+@misc{yi2023mllm,
+  title = {mllm: fast and lightweight multimodal LLM inference engine for mobile and edge devices},
+  author = {Rongjie Yi and Xiang Li and Qichen Qiu and Zhenyan Lu and Hao Zhang and Daliang Xu and Liming Yang and Weikai Xie and Chenghua Wang and Mengwei Xu},
+  year = {2023},
+  publisher = {mllm Team},
+  url = {https://github.com/UbiquitousLearning/mllm}
+}
+```
+
+
diff --git a/android b/android
diff --git a/src/models/fuyu/modeling_fuyu.hpp b/src/models/fuyu/modeling_fuyu.hpp
@@ -29,7 +29,7 @@ class PersimmonBlock final : public Module {
         norm1 = LayerNorm(hidden_dim, true, 1e-6, base_name + names._attn_norm_name);
         norm2 = LayerNorm(hidden_dim, true, 1e-6, base_name + names._ffn_norm_name);
     }
-    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override  {
+    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
         auto x = norm1(inputs[0]);
         x = attention({x, x, x})[0];
         auto tmp = x + inputs[0];
@@ -38,6 +38,10 @@ class PersimmonBlock final : public Module {
         x = x + tmp;
         return {x};
     }
+
+    MultiHeadAttention &get_attention() { // Mutable reference to this block's attention layer (used by Persimmon::clear_kvcache).
+        return attention;
+    }
 };
 
 class Persimmon final : public Module {
@@ -48,11 +52,11 @@ class Persimmon final : public Module {
 public:
     Persimmon() = default;
     Persimmon(int hidden_dim, int head_size, int ffn_hidden, float rope_theta, int max_position_embeddings, int cache_limit, int block_num, int vocab_size, const FuyuNameConfig &names) {
-        blocks = List<PersimmonBlock>(block_num, hidden_dim, head_size, ffn_hidden,  rope_theta, max_position_embeddings, cache_limit, names, names.blk_name);
+        blocks = List<PersimmonBlock>(block_num, hidden_dim, head_size, ffn_hidden, rope_theta, max_position_embeddings, cache_limit, names, names.blk_name);
         norm = LayerNorm(hidden_dim, true, 1e-6, names.post_norm_name);
         lm_head = Linear(hidden_dim, vocab_size, false, names.lm_head_name);
     }
-    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override  {
+    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
         auto x = inputs[0];
         for (auto &block : blocks) {
             x = block({x})[0];
@@ -61,6 +65,12 @@ class Persimmon final : public Module {
         x = lm_head(x);
         return {x};
     }
+    void clear_kvcache() override { // Calls clearCache() on every cache object exposed by each block's attention layer.
+        for (auto &blk : blocks) {
+            auto kv_caches = blk.get_attention().get_cache();
+            for (auto &kv : kv_caches) { kv->clearCache(); }
+        }
+    }
 };
 
 class FuyuGather final : public Layer {
@@ -84,7 +94,7 @@ class FuyuModel final : public Module {
 public:
     explicit FuyuModel(const FuyuConfig &config) :
         FuyuModel(config.vocab_size, config.hidden_dim, config.head_size, config.ffn_hidden, config.block_num,
-                  config.rope_theta, config.max_position_embeddings, 
+                  config.rope_theta, config.max_position_embeddings,
                   config.cache_limit, config.patch_size, config.chl_size,
                   config.name_config) {
     }
@@ -95,16 +105,19 @@ class FuyuModel final : public Module {
         embed_tokens = Embedding(vocab_size, hidden_dim, names.token_embd_name);
         vision_embed_tokens = Linear(patch_size * patch_size * chl_size, hidden_dim, true, names.vision_embed_tokens_name);
         fuyu_gather = FuyuGather("gather");
-        persimmon = Persimmon(hidden_dim, head_size, ffn_hidden, rope_theta, max_position_embeddings,cache_limit, block_num, vocab_size, names);
+        persimmon = Persimmon(hidden_dim, head_size, ffn_hidden, rope_theta, max_position_embeddings, cache_limit, block_num, vocab_size, names);
     }
-    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override  {
+    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
         auto input_ids = embed_tokens(inputs[0]);
         if (inputs[1].batch() > 0) {
             auto image_patches = vision_embed_tokens(inputs[1]);
             input_ids = fuyu_gather(input_ids, image_patches, inputs[2]);
         }
         return persimmon({input_ids});
     }
+    void clear_kvcache() override { // Delegates to the Persimmon decoder's clear_kvcache().
+        persimmon.clear_kvcache();
+    }
 };
 
 #endif // MODELING_FUYU_HPP
diff --git a/src/models/fuyu/processing_fuyu.hpp b/src/models/fuyu/processing_fuyu.hpp
@@ -6,6 +6,7 @@
 #define TOKENIZATION_FUYU_HPP
 
 #include <vector>
+#include "Log.h"
 #include "Tensor.hpp"
 #include <utility>
 // #include "processor/FuyuPreProcess.hpp"
@@ -221,8 +222,8 @@ class FuyuProcessor final : public PreProcessor {
     }
 
 public:
-    explicit FuyuProcessor(const std::string &vocab_file) :
-        PreProcessor(1080, 1920, true, true, true, true, {0.5}, {0.5}) {
+    explicit FuyuProcessor(const std::string &vocab_file, int image_height = 1080, int image_width = 1920) :
+        PreProcessor(image_height, image_width, true, true, true, true, {0.5}, {0.5}) {
         Module::initBackend(MLLM_CPU);
         tokenizer_ = new UnigramTokenizer(vocab_file);
         auto tmp_token = vector<token_id_t>();
@@ -247,6 +248,10 @@ class FuyuProcessor final : public PreProcessor {
                 MLLM_LOG_ERROR_STREAM << "load image failed" << std::endl;
                 exit(-1);
             }
+            if (channels_ != 3) {
+                MLLM_LOG_ERROR("Image data channel not 3 but {},change to 3", channels_);
+                channels_ = 3;
+            }
             auto float_data = RescaleImage(data, 255.0, width_ * height_ * channels_);
             images_.emplace_back(float_data, width_, height_, channels_);
         }

diff --git a/src/models/phonelm/configuration_phonelm.hpp b/src/models/phonelm/configuration_phonelm.hpp
@@ -99,7 +99,7 @@ struct PhoneLMConfig : public TransformerConfig {
     int hidden_size = 1024;
     float initializer_range = 0.02;
     int intermediate_size = 2816;
-    int max_position_embeddings = 2048;
+    int max_position_embeddings = 32768;
     int max_window_layers = 21;
     int num_attention_heads = 16;
     int num_hidden_layers = 24;

diff --git a/src/models/qwen/tokenization_qwen.hpp b/src/models/qwen/tokenization_qwen.hpp
@@ -250,6 +250,7 @@ class QWenTokenizer final : public BPETokenizer {
         return {_byte_decode_(BPETokenizer::detokenize({token_idx})), token_idx};
     }
     std::pair<bool, std::string> postprocess(std::string &text) override {
+        if (text == "<|im_end|>") return {false, ""};
         if (text == "<|im_start|>" || text == "<|im_end|>" || text == "<unk>") return {true, ""};
         if (text == "<|endoftext|>") return {false, ""};
         return {true, text};

diff --git a/tools/jni/LibHelper.cpp b/tools/jni/LibHelper.cpp
@@ -23,8 +23,9 @@
 #include "models/qwen/tokenization_qwen.hpp"
 #include "models/smollm/tokenization_smollm.hpp"
 #include "tokenizers/Unigram/Unigram.hpp"
-using namespace mllm;
 #include "models/fuyu/processing_fuyu.hpp"
+#include "processor/PostProcess.hpp"
+using namespace mllm;
 
 #ifdef USE_QNN
 #include "models/qwen/modeling_qwen_npu.hpp"
@@ -51,7 +52,7 @@ unsigned int LibHelper::postProcessing(shared_ptr<Tensor> result, shared_ptr<Ten
 
 bool LibHelper::setUp(const std::string &base_path, std::string weights_path, std::string vocab_path, std::string merge_path, PreDefinedModel model, MLLMBackendType backend_type) {
     FuyuConfig fuyuconfig(tokens_limit, "8B");
-    QWenConfig qwconfig(tokens_limit, "1.8B");
+    QWenConfig qwconfig(tokens_limit, "1.5B");
     BertConfig bertconfig;
     PhoneLMConfig phone_config(tokens_limit, "1.5B");
     vocab_path = base_path + vocab_path;
@@ -75,7 +76,7 @@ bool LibHelper::setUp(const std::string &base_path, std::string weights_path, st
         break;
 
     case FUYU:
-        processor_ = new FuyuProcessor(vocab_path);
+        processor_ = new FuyuProcessor(vocab_path, 224, 224);
         module_ = make_shared<FuyuModel>(fuyuconfig);
         break;
     case Bert:
@@ -107,18 +108,17 @@ void LibHelper::setCallback(callback_t callback) {
 void LibHelper::run(std::string &input_str, uint8_t *image, unsigned max_step, unsigned int image_length, bool chat_template) {
     std::string output_string_;
     LOGE("Running model %d", model_);
-    bool isSwitched = false;
+    unsigned max_new_tokens = 500;
+    LOGE("Running backend %d", backend_);
 
     if (model_ == QWEN) {
         auto tokenizer = dynamic_pointer_cast<QWenTokenizer>(tokenizer_);
         if (chat_template) input_str = tokenizer_->apply_chat_template(input_str);
         auto input_tensor = tokenizer_->tokenize(input_str);
+        max_new_tokens = tokens_limit - input_tensor.sequence();
         LlmTextGeneratorOpts opt{
-            .max_new_tokens = max_step,
-            .do_sample = true,
-            .temperature = 0.3F,
-            .top_k = 50,
-            .top_p = 0.F,
+            .max_new_tokens = max_new_tokens,
+            .do_sample = false,
         };
         if (backend_ == MLLMBackendType::QNN) {
             auto res = tokenizer->tokenizeWithPadding(input_str, 64, 151936);
@@ -139,14 +139,15 @@ void LibHelper::run(std::string &input_str, uint8_t *image, unsigned max_step, u
             static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length);
             static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
             opt = LlmTextGeneratorOpts{
-                .max_new_tokens = 100,
+                .max_new_tokens = max_new_tokens,
                 .do_sample = false,
                 .temperature = 0.3f,
                 .top_k = 50,
                 .top_p = 0.f,
                 .is_padding = false,
             };
         }
+        bool isSwitched = false;
         module_->generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
             if (!isSwitched && backend_ == MLLMBackendType::QNN) {
                 static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
@@ -165,30 +166,28 @@ void LibHelper::run(std::string &input_str, uint8_t *image, unsigned max_step, u
     } else if (model_ == FUYU) {
         auto processor = dynamic_cast<FuyuProcessor *>(processor_);
         auto input_tensors = processor->process(input_str, {image}, {image_length});
-        for (int step = 0; step < max_step; step++) {
+        for (int step = 0; step < 100; step++) {
             auto result = (*module_)({input_tensors[0], input_tensors[1], input_tensors[2]});
             auto outputs = processor->detokenize(result[0]);
             auto out_string = outputs.first;
             auto out_token = outputs.second;
             auto [end, string] = processor->postprocess(out_string);
             output_string_ += string;
-
             callback_(output_string_, !end);
             if (!end) { break; }
+            chatPostProcessing(out_token, input_tensors[0], {&input_tensors[1], &input_tensors[2]});
         }
+        module_->clear_kvcache();
     } else if (model_ == Bert) {
         LOGE("Bert model is not supported in this version.");
     } else if (model_ == PhoneLM) {
         auto tokenizer = dynamic_pointer_cast<SmolLMTokenizer>(tokenizer_);
-
         if (chat_template) input_str = tokenizer_->apply_chat_template(input_str);
         auto input_tensor = tokenizer_->tokenize(input_str);
+        max_new_tokens = tokens_limit - input_tensor.sequence();
         LlmTextGeneratorOpts opt{
-            .max_new_tokens = 100,
+            .max_new_tokens = max_new_tokens,
             .do_sample = false,
-            // .temperature = 0.3F,
-            // .top_k = 50,
-            // .top_p = 0.F,
         };
         if (backend_ == MLLMBackendType::QNN) {
             auto res = tokenizer->tokenizeWithPadding(input_str, 64, 49152);
@@ -210,17 +209,17 @@ void LibHelper::run(std::string &input_str, uint8_t *image, unsigned max_step, u
             static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
 
             opt = LlmTextGeneratorOpts{
-                .max_new_tokens = 100,
+                .max_new_tokens = max_new_tokens,
                 .do_sample = false,
                 .temperature = 0.3f,
                 .top_k = 50,
                 .top_p = 0.f,
                 .is_padding = false,
             };
         }
-
+        bool isSwitched = false;
         module_->generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
-            if (!isSwitched) {
+            if (!isSwitched && backend_ == MLLMBackendType::QNN) {
                 static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
                 isSwitched = true;
             }
+4 −0		.gitattributes
+14 −11		app/build.gradle
+44 −0		app/objectbox-models/default.json
+22 −0		app/src/main/AndroidManifest.xml
+24 −0		app/src/main/assets/api_vec.jsonl
+26 −25		app/src/main/cpp/LibHelper.hpp
+90 −21		app/src/main/cpp/chatbot.cpp
+ −		app/src/main/cpp/libs/libmllm_lib.a
+27 −5		app/src/main/java/org/saltedfish/chatbot/JNIBridge.kt
+770 −47		app/src/main/java/org/saltedfish/chatbot/MainActivity.kt
+336 −0		app/src/main/java/org/saltedfish/chatbot/PhoneAPI.kt
+263 −0		app/src/main/java/org/saltedfish/chatbot/RAGDB.kt
+180 −13		app/src/main/java/org/saltedfish/chatbot/viewModel.kt
+3 −0		app/src/main/jniLibs/arm64-v8a/libQnnHtp.so
+3 −0		app/src/main/jniLibs/arm64-v8a/libQnnHtpOptraceProfilingReader.so
+3 −0		app/src/main/jniLibs/arm64-v8a/libQnnHtpPrepare.so
+3 −0		app/src/main/jniLibs/arm64-v8a/libQnnHtpProfilingReader.so
+3 −0		app/src/main/jniLibs/arm64-v8a/libQnnHtpV75CalculatorStub.so
+3 −0		app/src/main/jniLibs/arm64-v8a/libQnnHtpV75Skel.so
+3 −0		app/src/main/jniLibs/arm64-v8a/libQnnHtpV75Stub.so
+3 −0		app/src/main/jniLibs/arm64-v8a/libQnnLLaMAPackage_CPU.so
+3 −0		app/src/main/jniLibs/arm64-v8a/libQnnLLaMAPackage_HTP.so
+9 −0		app/src/main/res/drawable/tools.xml
+14 −3		build.gradle
+1 −1		gradle/wrapper/gradle-wrapper.properties