feat: support PhoneLM-1.5B-Call demo in Android Demo #193

Merged 1 commit on Nov 16, 2024
examples/demo_phonelm_npu.cpp: 35 changes (29 additions & 6 deletions)
@@ -13,7 +13,7 @@ int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/phonelm_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/phonelm_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/PhoneLM-1.5B-Instruct-128.mllm");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/phonelm-1.5b-instruct-int8.mllm");
cmdParser.add<string>("decoding", 'd', "specify mllm decoding model path", false, "../models/phonelm-1.5b-instruct-q4_0_4_4.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
@@ -35,17 +35,39 @@ int main(int argc, char **argv) {
auto decoding_model = PhoneLMForCausalLM(config);
decoding_model.load(decoding_path);

// warmup START
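// a one-shot dummy generation; the assumption is that the first generate() call pays
// one-time backend setup cost (e.g. building the QNN graphs), so spending it on a
// single-space prompt keeps that cost out of the real requests below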
std::string input_str = " ";
auto [real_seq_length, input_tensor] = tokenizer.tokenizePaddingByChunk(input_str, chunk_size, config.vocab_size);
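// tokenizePaddingByChunk (added to tokenization_qwen.hpp in this PR) pads the token
// sequence to a multiple of chunk_size with vocab_size as the pad id and returns the
// unpadded length alongside the padded tensor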
LlmTextGeneratorOpts opt{
.max_new_tokens = 1,
.do_sample = false,
.is_padding = true,
.seq_before_padding = real_seq_length,
.chunk_size = chunk_size,
};
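// is_padding / seq_before_padding hand the true prompt length to the generator so the
// padded tail can be told apart from real tokens (assumed semantics of these options)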
model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
auto out_string = tokenizer.detokenize({out_token});
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
return true;
});
Module::isFirstChunk = false;
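// rewind the CPU backend to sequence position 0 and prefill (PROMPT) mode so the real
// inputs below start from a clean state after the warmup generation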
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(0);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
// turn on the multi-chunk prefilling
Module::isMultiChunkPrefilling = true;
// warmup END
std::cout << "Warmup finished." << std::endl;

vector<string> in_strs = {
-    "Give me a short introduction to large language model.",
-    "What is the Beijing University of Posts and Telecommunications.",
-    "What is the meaning of life?",
-    "\"Large Language Models (LLMs) are advanced artificial intelligence systems designed to understand and generate human-like text. These models are trained on vast amounts of data, enabling them to perform a wide range of tasks, from answering questions and summarizing text to generating creative content and engaging in conversational dialogue. LLMs like GPT-3 and GPT-4, developed by OpenAI, have set new benchmarks in natural language processing by leveraging deep learning architectures, particularly transformer models, which excel at capturing context and relationships within text. The scalability and versatility of LLMs make them invaluable tools for applications in education, customer service, content creation, and more. However, their deployment also raises ethical considerations, including issues of bias, misinformation, and the potential for misuse. As the field continues to evolve, ongoing research and responsible deployment strategies are essential to harnessing the full potential of these powerful AI systems while mitigating their risks.\"\nGenerate a title based on the above text."};
+    "Hello, who are you?",
+    "What can you do?",
+    "Please introduce Beijing University of Posts and Telecommunications.",
+};

-// turn on the multi-chunk prefilling
-Module::isMultiChunkPrefilling = true;

for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
@@ -75,7 +97,8 @@ int main(int argc, char **argv) {
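// each chunk is set up as a view into input_tensor at sequence offset
// chunk_id * chunk_size (assuming deepCopyFrom with copy=false aliases the
// underlying buffer rather than duplicating it)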
chunked_tensors[chunk_id].deepCopyFrom(&input_tensor, false, {0, 0, chunk_id * chunk_size, 0});

model.generate(chunked_tensors[chunk_id], opt, [&](unsigned int out_token) -> bool {
-if (i != 0 && !isSwitched && chunk_id == 0) {
+// if (i != 0 && !isSwitched && chunk_id == 0) {
+if (!isSwitched && chunk_id == 0) {
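// the i != 0 guard is dropped because the warmup pass above already ran a full
// generation, so the first real input (i == 0) also needs the decode-to-prefill switch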
// turn off switching at the first chunk of following inputs
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
examples/main_phonelm_npu.cpp: 2 changes (1 addition & 1 deletion)
@@ -67,7 +67,7 @@ int main(int argc, char **argv) {

cmdParser.parse_check(argc, argv);

-const string npu_model_path = "../models/PhoneLM-1.5B-Instruct-128.mllm";
+const string npu_model_path = "../models/phonelm-1.5b-instruct-int8.mllm";
const string cpu_model_path = "../models/phonelm-with-head-q4k.mllm";
const string merge_file_path = "../vocab/phonelm_merges.txt";

scripts/run_phonelm_qnn.sh: 6 changes (3 additions & 3 deletions)
@@ -6,10 +6,10 @@ adb shell mkdir -p /data/local/tmp/mllm/qnn-lib
adb push ../vocab/phonelm_vocab.mllm /data/local/tmp/mllm/vocab/


-if ! adb shell [ -f "/data/local/tmp/mllm/models/PhoneLM-1.5B-Instruct-128.mllm" ]; then
-adb push ../models/PhoneLM-1.5B-Instruct-128.mllm "/data/local/tmp/mllm/models/PhoneLM-1.5B-Instruct-128.mllm"
+if ! adb shell [ -f "/data/local/tmp/mllm/models/phonelm-1.5b-instruct-int8.mllm" ]; then
+adb push ../models/phonelm-1.5b-instruct-int8.mllm "/data/local/tmp/mllm/models/phonelm-1.5b-instruct-int8.mllm"
else
-echo "PhoneLM-1.5B-Instruct-128 file already exists"
+echo "phonelm-1.5b-instruct-int8 file already exists"
fi


src/models/qwen/tokenization_qwen.hpp: 51 changes (51 additions & 0 deletions)
@@ -226,6 +226,57 @@ class QWenTokenizer final : public BPETokenizer {
ret.resize(seqLength, vocab_size);
return std::make_pair(realLength, Tokenizer::tokens2Input(ret));
}
// pad the input to the nearest multiple of chunk_size
std::pair<int, Tensor> tokenizePaddingByChunk(std::string &text, int chunk_size, int vocab_size) {
std::vector<token_id_t> ret;

if (split_special_tokens_) {
const auto word_collection = unicode_regex_split(text, FIXED_PAT_STRS);
for (auto &piece : word_collection) {
// look up table
// std::string token;
// for (auto b : UTF8(piece)) token += byte_encoder_[b];

// using bpe
std::vector<token_id_t> tmp;
BPETokenizer::tokenize(piece, tmp, false, true, "");
ret.insert(ret.end(), tmp.begin(), tmp.end() - 1);
}
} else {
auto parts = _splitWithDelimiters(text, special_tokens);
// for (auto p : parts) {
// std::cout << "\"" << p << "\"" << std::endl;
// }
for (auto &p : parts) {
if (std::find(special_tokens.begin(), special_tokens.end(), p) != special_tokens.end()) {
std::string token;
for (auto b : UTF8(p)) token += byte_encoder_[b];

std::vector<token_id_t> tmp;
BPETokenizer::tokenize(token, tmp, false, special_tokens, true);
ret.insert(ret.end(), tmp.begin(), tmp.end() - 1);
} else {
const auto word_collection = unicode_regex_split(p, FIXED_PAT_STRS);
for (auto &piece : word_collection) {
// look up table
// std::string token;
// for (auto b : UTF8(piece)) token += byte_encoder_[b];

// using bpe
std::vector<token_id_t> tmp;
BPETokenizer::tokenize(piece, tmp, false, true, "");
assert(!tmp.empty());
ret.insert(ret.end(), tmp.begin(), tmp.end() - 1);
}
}
}
}

auto realLength = ret.size();
int paddingLength = (chunk_size - realLength % chunk_size) % chunk_size;
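// e.g. with chunk_size = 64: realLength = 70 gives paddingLength = 58 (padded to 128),
// while realLength = 128 gives paddingLength = 0 (the outer % avoids adding a full chunk)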
ret.resize(realLength + paddingLength, vocab_size);
return std::make_pair(realLength, Tokenizer::tokens2Input(ret));
}
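// a minimal usage sketch (hypothetical values; in examples/demo_phonelm_npu.cpp the
// chunk size comes from the command line):
//   std::string prompt = "Hello, who are you?";
//   auto [realLen, padded] = tokenizer.tokenizePaddingByChunk(prompt, 64, config.vocab_size);
//   // padded now holds realLen real ids followed by pad ids (== vocab_size),
//   // and its sequence length is a multiple of 64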

std::string _byte_decode_(const std::string &text) {
std::string ret;