Support PhoneLM decoding configuration #190

Merged · 2 commits · Nov 12, 2024
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -94,6 +94,7 @@ if(QNN)
func_llm_add_executable(main_qwen_npu)
func_llm_add_executable(demo_phonelm_npu)
func_llm_add_executable(main_phonelm_npu)
func_llm_add_executable(demo_qwen2.5_npu)
endif()


4 changes: 3 additions & 1 deletion examples/demo_phonelm_npu.cpp
@@ -12,13 +12,15 @@ int main(int argc, char **argv) {
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/phonelm_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/phonelm_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/PhoneLM-1.5B-Instruct-128.mllm");
cmdParser.add<string>("decoding", 'd', "specify mllm decoding model path", false, "../models/phonelm-1.5b-droidcall-q4_0_4_4.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);

string vocab_path = cmdParser.get<string>("vocab");
string merge_path = cmdParser.get<string>("merge");
string model_path = cmdParser.get<string>("model");
string decoding_path = cmdParser.get<string>("decoding");
int tokens_limit = cmdParser.get<int>("limits");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

@@ -27,7 +29,7 @@ int main(int argc, char **argv) {
auto model = PhoneLMForCausalLM_NPU(config);
model.load(model_path);
auto decoding_model = PhoneLMForCausalLM(config);
decoding_model.load("../models/phonelm-1.5b-instruct-q4_0_4_4.mllm");
decoding_model.load(decoding_path);

vector<string> in_strs = {
"Give me a short introduction to large language model.",
93 changes: 93 additions & 0 deletions examples/demo_qwen2.5_npu.cpp
@@ -0,0 +1,93 @@
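// Hybrid Qwen2.5 demo: the QNN/NPU model runs the padded prefill pass, then a plain CPU model takes over for autoregressive decoding.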
#ifdef USE_QNN
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/qwen/configuration_qwen.hpp"
#include "models/qwen/modeling_qwen_npu.hpp"
#include "models/qwen/modeling_qwen.hpp"
#include "models/qwen/tokenization_qwen.hpp"
#include "processor/PostProcess.hpp"

using namespace mllm;

int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen2.5_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen2.5_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/Qwen2.5-1.5B-Instruct.mllm");
cmdParser.add<string>("billion", 'b', "[0.5B | 1.8B | 1.5B]", false, "1.8B");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);

string vocab_path = cmdParser.get<string>("vocab");
string merge_path = cmdParser.get<string>("merge");
string model_path = cmdParser.get<string>("model");
string model_billion = cmdParser.get<string>("billion");
int tokens_limit = cmdParser.get<int>("limits");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

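// Build both models from the same config: the NPU variant for prefill and the CPU variant for decoding.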
auto tokenizer = QWenTokenizer(vocab_path, merge_path);
QWenConfig config(tokens_limit, "1.5B", RoPEType::HFHUBROPE);
auto model = QWenForCausalLM_NPU(config);
model.load(model_path);
auto decoding_model = QWenForCausalLM(config);
decoding_model.load("../models/qwen-2.5-1.5b-instruct-q4_0_4_4.mllm");

vector<string> in_strs = {
" Give me a short introduction to large language model.",
};

for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
auto [real_seq_length, input_tensor] = tokenizer.tokenizeWithPadding(input_str, 64, config.vocab_size);
std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;

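// Prefill on the NPU model: the prompt is padded to 64 tokens and only the first new token is generated here.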
LlmTextGeneratorOpts opt{
.max_new_tokens = 1,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
.top_p = 0.f,
.is_padding = true,
.seq_before_padding = real_seq_length,
};
model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
auto out_string = tokenizer.detokenize({out_token});
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
std::cout << output_string << std::flush;
return true;
});

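// Pass the real (un-padded) prompt length to the CPU backend and toggle its decode tag before the CPU model takes over.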
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();

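// Autoregressive decoding on the CPU model: up to 100 new tokens, no padding.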
LlmTextGeneratorOpts decoding_opt{
.max_new_tokens = 100,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
.top_p = 0.f,
.is_padding = false,
};
bool isSwitched = false;
decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool {
// switchDecodeTag is called only once, on the first decoded token
if (!isSwitched) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
auto [isOk, print_string] = tokenizer.postprocess(out_string);
if (isOk) {
std::cout << print_string << std::flush;
} else {
return false;
}
return true;
});
std::cout << "\n---------------" << std::endl;
}
}
#endif
6 changes: 3 additions & 3 deletions src/models/qwen/modeling_qwen_npu.hpp
@@ -50,8 +50,8 @@ class QwenDecoderNPUPart1 final : public Module {
v_proj = Linear(hidden_size, num_key_value_heads * head_dim, true, base_name + names._v_proj_name);

q_view = View(-1, num_heads, -1, head_dim, base_name + names._q_proj_name + "-00_view_");
- k_view = View(-1, num_heads, -1, head_dim, base_name + names._k_proj_name + "-00_view_");
- v_view = View(-1, num_heads, -1, head_dim, base_name + names._v_proj_name + "-00_view_");
+ k_view = View(-1, num_key_value_heads, -1, head_dim, base_name + names._k_proj_name + "-00_view_");
+ v_view = View(-1, num_key_value_heads, -1, head_dim, base_name + names._v_proj_name + "-00_view_");

q_dequant = Dequantize(true, base_name + names._q_proj_name + ".dequantize");
k_dequant = Dequantize(true, base_name + names._k_proj_name + ".dequantize", false);
@@ -489,7 +489,7 @@ class QWenModel_NPU final : public Module {
static_assert(std::is_base_of<Module, SHADOW>::value, "SHADOW must be a subclass of Module");
listIdx = 0;
vector<unique_ptr<Module>> modules;
- std::set shadowLayers = {1, 2, 6};
+ std::set shadowLayers = {1, 2, 26};
// for index in shadowLayers, create shadow decoder, for others, create normal decoder
for (int i = 0; i < n; i++) {
auto new_args = change_last(args...); // build a new argument pack whose last argument is the original value + std::to_string(listIdx) + "."