feat: QNN New Frontend Phonelm Support and Refactors #179

Merged · 55 commits · Nov 7, 2024
Commits
af9ba7d
Merge pull request #7 from liang1232018/develop-merge
liang1232018 Oct 29, 2024
71778d1
feat : support QNN RoPE now, but in reference mode.
liang1232018 Oct 29, 2024
b0d5c9a
fix: qnn old frontend pipeline
oreomaker Oct 31, 2024
3e67102
Merge pull request #8 from liang1232018/develop-zh
liang1232018 Oct 31, 2024
43c25aa
feat: support qnn 1000 token prefilling
oreomaker Oct 31, 2024
c153e05
Merge pull request #9 from liang1232018/develop-zh
liang1232018 Oct 31, 2024
448f26a
feat : rope support h_cnt as input.
liang1232018 Oct 31, 2024
ecd5476
feat : use HVX acceleration for RoPE.
liang1232018 Oct 31, 2024
35af2e7
feat : optimize RoPE FP performance and fix sin cos weights load bugs.
liang1232018 Oct 31, 2024
5ea7068
feat : optimize ROPE Memory.
liang1232018 Nov 1, 2024
9b50cba
feat : optimize RoPE execution.
liang1232018 Nov 1, 2024
5d96b9f
feat : support QUINT8 ROPE as input.
liang1232018 Nov 2, 2024
3b4cedd
refactor: qnn old frontend use nocopy merge&split, unify old and new e…
oreomaker Nov 3, 2024
b7154f0
fix: embedding padding memset
oreomaker Nov 3, 2024
fe60480
feat : init implementation for phonelm.
liang1232018 Nov 3, 2024
308aa8b
fix : relu reference implementation bugs.
liang1232018 Nov 3, 2024
b15b831
Merge branch 'develop-xdl' into develop-merge
oreomaker Nov 4, 2024
ad2101d
fix : use mllm embedding weights as cpu linear head weights.
liang1232018 Nov 4, 2024
8af0f33
fix : mllm input file and lm_head layer weights.
liang1232018 Nov 4, 2024
9516a38
fix : embedding bugs.
liang1232018 Nov 4, 2024
bcfb676
fix : reference code and Quantize execution.
liang1232018 Nov 4, 2024
4fb33d7
fix : phoneLM buffer size setting in shadow execution.
liang1232018 Nov 4, 2024
0fddd14
fix : change all the operator back to FP32.
liang1232018 Nov 4, 2024
247ee00
feat : use shadow = 64 as the NPU model.
liang1232018 Nov 4, 2024
c5daff8
fix : quantization execution -128 setting error.
liang1232018 Nov 4, 2024
b453f9a
fix : shadow execution error in input1 and input2 shapes.
liang1232018 Nov 5, 2024
2324761
feat : use NPU acceleration implementation.
liang1232018 Nov 5, 2024
6d091e8
feat : use QUINT8 Relu and mul in FFN.
liang1232018 Nov 5, 2024
3343d7d
feat : using fp16 kvcache.
liang1232018 Nov 5, 2024
3790f39
feat : support shadow execution with fp16 type and front-end config.
liang1232018 Nov 5, 2024
549f329
feat: qwen npu no copy with graph fusion
oreomaker Nov 6, 2024
5740b53
Merge branch 'develop-phonelm' into develop-merge
oreomaker Nov 6, 2024
bc4f8a1
refactor: rewrite npu phonelm modeling
oreomaker Nov 6, 2024
3c50a0c
chore: remove unused npu view, add qnn node validation option
oreomaker Nov 6, 2024
5479c8f
fix: npu phonelm shadow merge and split
oreomaker Nov 6, 2024
64f02d7
feat: npu demo phonelm
oreomaker Nov 6, 2024
6eea490
feat : add QNNIRoPE.
liang1232018 Nov 6, 2024
6338d89
fix : QNNIRope express.
liang1232018 Nov 6, 2024
1bfbfd3
fix : CPU Linear int8 shadow bugs.
liang1232018 Nov 6, 2024
1d2d004
fix : fix phonelm new front end bugs.
liang1232018 Nov 6, 2024
b25daf7
fix : use the fastest qwen and phonelm model structure.
liang1232018 Nov 6, 2024
f968bf1
fix: qnn pipeline run setDataLoader
oreomaker Nov 6, 2024
e5b3dc7
Merge branches 'develop-merge' and 'develop-merge' of github.com:lian…
oreomaker Nov 6, 2024
27abc8a
dev: make pipeline gap 1 layer
oreomaker Nov 6, 2024
f021d9e
feat : optimize phonelm and shadow op threshold.
liang1232018 Nov 6, 2024
f446c60
Merge pull request #10 from liang1232018/develop-merge
liang1232018 Nov 6, 2024
ecde02e
Merge branch 'develop-xdl' into develop-phonelm-qnnirope
liang1232018 Nov 6, 2024
deea2d1
Merge pull request #11 from liang1232018/develop-phonelm-qnnirope
liang1232018 Nov 6, 2024
88f7290
fix : IRoPE implementation bug and scale bug.
liang1232018 Nov 6, 2024
fef3657
fix : merge model error.
liang1232018 Nov 6, 2024
db5f65c
Merge branch 'main' into develop-phonelm-merge
oreomaker Nov 7, 2024
937e22a
fix: demo phonelm npu decoding, main branch merge
oreomaker Nov 7, 2024
5eebae0
Merge branch 'develop-xdl' into develop-merge
oreomaker Nov 7, 2024
c4e2a3d
fix: __fp16 -> mllm_fp16_t
yirongjie Nov 7, 2024
0816e06
Merge branch 'main' into main
yirongjie Nov 7, 2024
Files changed
5 changes: 3 additions & 2 deletions CMakeLists.txt
@@ -72,8 +72,9 @@ option(QNN_OLD_FRONTEND "Enable Old QNN" OFF)
if(QNN)
add_definitions(-DUSE_QNN) # the USE_QNN should come before cpu subdirectory
endif()
if(QNN_OLD_FRONTEND)
add_definitions(-DOLD_QNN)
option(QNN_VALIDATE_NODE "Enable QNN Validate Node When Building Graph" ON)
if(QNN_VALIDATE_NODE)
add_definitions(-DQNN_VALIDATE_NODE)
endif()

if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
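The hunk above only wires up a new compile definition: with QNN_VALIDATE_NODE ON (the default), backend sources are built with -DQNN_VALIDATE_NODE and can compile graph-build-time node validation in, while configuring with -DQNN_VALIDATE_NODE=OFF strips it out. A minimal sketch of that compile-time guard pattern, using assumed function names rather than the actual mllm QNN backend API:

#include <iostream>
#include <string>

// Placeholder check; the real backend would call the QNN op validator here.
static bool validateNode(const std::string &opType) {
    return !opType.empty();
}

static void addNodeToGraph(const std::string &opType) {
#ifdef QNN_VALIDATE_NODE
    // Compiled in only when the CMake option defines QNN_VALIDATE_NODE.
    if (!validateNode(opType)) {
        std::cerr << "QNN node validation failed: " << opType << std::endl;
        return;
    }
#endif
    std::cout << "node added: " << opType << std::endl;
}

int main() {
    addNodeToGraph("MatMul");
    return 0;
}

This matches the intent of the "add qnn node validation option" commit above: the check only costs build-graph time in configurations that opt in.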
4 changes: 3 additions & 1 deletion examples/CMakeLists.txt
@@ -90,8 +90,10 @@ func_vlm_add_executable(demo_imagebind_1mod)

# QNN demo
if(QNN)
func_llm_add_executable(demo_qnn)
func_llm_add_executable(demo_qwen_npu)
func_llm_add_executable(main_qwen_npu)
func_llm_add_executable(demo_phonelm_npu)
func_llm_add_executable(main_phonelm_npu)
endif()


2 changes: 1 addition & 1 deletion examples/demo_phonelm.cpp
@@ -11,7 +11,7 @@ int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/phonelm_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/phonelm_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/phonelm-1.5b-instruct-fp32.mllm");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/phonelm-1.5b-instruct-q4_0_4_4.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);
91 changes: 91 additions & 0 deletions examples/demo_phonelm_npu.cpp
@@ -0,0 +1,91 @@
#ifdef USE_QNN
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/phonelm/modeling_phonelm.hpp"
#include "models/phonelm/modeling_phonelm_npu.hpp"
#include "models/smollm/tokenization_smollm.hpp"

using namespace mllm;

int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/phonelm_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/phonelm_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/PhoneLM-1.5B-Instruct-128.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);

string vocab_path = cmdParser.get<string>("vocab");
string merge_path = cmdParser.get<string>("merge");
string model_path = cmdParser.get<string>("model");
int tokens_limit = cmdParser.get<int>("limits");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto tokenizer = SmolLMTokenizer(vocab_path, merge_path);
PhoneLMConfig config(tokens_limit, "1.5B");
auto model = PhoneLMForCausalLM_NPU(config);
model.load(model_path);
auto decoding_model = PhoneLMForCausalLM(config);
decoding_model.load("../models/phonelm-1.5b-instruct-q4_0_4_4.mllm");

vector<string> in_strs = {
"Give me a short introduction to large language model.",
};

for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
auto [real_seq_length, input_tensor] = tokenizer.tokenizeWithPadding(input_str, 64, config.vocab_size);
std::cout << real_seq_length << endl;
std::cout << input_str << std::endl;
std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;

LlmTextGeneratorOpts opt{
.max_new_tokens = 1,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
.top_p = 0.f,
.is_padding = true,
.seq_before_padding = real_seq_length,
};
model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
auto out_string = tokenizer.detokenize({out_token});
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
std::cout << output_string << std::flush;
return true;
});

static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();

LlmTextGeneratorOpts decoding_opt{
.max_new_tokens = 100,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
.top_p = 0.f,
.is_padding = false,
};
bool isSwitched = false;
decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool {
// call switchDecodeTag only once
if (!isSwitched) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
auto [isOk, print_string] = tokenizer.postprocess(out_string);
if (isOk) {
std::cout << print_string << std::flush;
} else {
return false;
}
return true;
});
std::cout << "\n---------------" << std::endl;
}
}
#endif
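In the demo above, the NPU prefill always consumes a fixed 64-token padded input, so the real prompt length is carried separately (seq_before_padding in the prefill options, setSequenceLength on the CPU backend) and decoding resumes from that position. A small self-contained illustration of the padding idea, assuming right-padding and a hypothetical padToLength helper; it mirrors the role of tokenizeWithPadding but is not its actual implementation:

#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Right-pad a token id sequence to a fixed prefill length and remember the real length.
static std::pair<int, std::vector<int64_t>> padToLength(std::vector<int64_t> ids,
                                                        int target_len, int64_t pad_id) {
    const int real_len = static_cast<int>(ids.size());
    if (real_len < target_len) ids.resize(target_len, pad_id);
    return {real_len, std::move(ids)};
}

int main() {
    auto [real_len, padded] = padToLength({12, 345, 678}, 8, 0);
    std::cout << "real length: " << real_len
              << ", padded length: " << padded.size() << std::endl;
    return 0;
}

The padded input keeps the shape fed to the QNN prefill graph fixed, while the real length is what the demo uses to pick the last meaningful logit and to set the CPU backend's sequence position before decoding.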
4 changes: 3 additions & 1 deletion examples/demo_qnn.cpp → examples/demo_qwen_npu.cpp
@@ -1,3 +1,4 @@
#ifdef USE_QNN
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/qwen/configuration_qwen.hpp"
@@ -88,4 +89,5 @@ int main(int argc, char **argv) {
});
std::cout << "\n---------------" << std::endl;
}
}
}
#endif
236 changes: 236 additions & 0 deletions examples/main_phonelm_npu.cpp
@@ -0,0 +1,236 @@
#ifdef USE_QNN
#include <iostream>
#include <csignal>
#include <memory>
#include <vector>
#include "Executor.hpp"
#include "Types.hpp"
#include "backends/qnn/QNNNet.hpp"
#include "cmdline.h"
#include "Net.hpp"
#include "backends/qnn/QNNExecutor.hpp"

#include "models/smollm/tokenization_smollm.hpp"
#include "main_phonelm_npu.hpp"

using namespace mllm;

unsigned int argmax(const std::vector<float> &scores) {
return std::max_element(scores.begin(), scores.end()) - scores.begin();
}

unsigned int postProcessing(shared_ptr<Tensor> result, shared_ptr<Tensor> &out_result) {
assert(result->batch() == 1);
assert(result->head() == 1);
out_result->reshape(1, 1, 1, 1);
out_result->alloc();
vector<float> scores;
for (int i = 0; i < result->dimension(); ++i) {
auto value = result->dataAt<float>(0, 0, result->sequence() - 1, i);
scores.push_back(value);
}
auto token_idx = argmax(scores);
out_result->setDataAt<float>(0, 0, 0, 0, token_idx);
return token_idx;
}

unsigned int postProcessing_prefill(shared_ptr<Tensor> result, shared_ptr<Tensor> &out_result, int seq) {
assert(result->batch() == 1);
assert(result->head() == 1);
out_result->reshape(1, 1, 1, 1);
out_result->alloc();
vector<float> scores;
for (int i = 0; i < result->dimension(); ++i) {
auto value = result->dataAt<float>(0, 0, seq - 1, i);
scores.push_back(value);
}
auto token_idx = argmax(scores);
out_result->setDataAt<float>(0, 0, 0, 0, token_idx);
return token_idx;
}

int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/phonelm_vocab.mllm");

cmdParser.add<int>("limits", 'l', "max KV cache size", false, 1124);

cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.add<int>("seq", 's', "sequence length", false, 64);
cmdParser.add<bool>("chunk", 'c', "use chunk execute", false, true);
cmdParser.add<int>("head", 'h', "num of heads", false, 16);

cmdParser.add<int>("ffn", 'f', "size of ffn hidden size", false, 6816);
cmdParser.add<int>("hds", 'd', "size of hidden size", false, 2560);

cmdParser.add<bool>("readfile", 'r', "read prompt from file", false, false);

cmdParser.parse_check(argc, argv);

const string npu_model_path = "../models/PhoneLM-1.5B-Instruct-128.mllm";
const string cpu_model_path = "../models/phonelm-with-head-q4k.mllm";
const string merge_file_path = "../vocab/phonelm_merges.txt";

string vocab_path = cmdParser.get<string>("vocab");
int tokens_limit = cmdParser.get<int>("limits");
int thread_num = cmdParser.get<int>("thread");
int seqLength = cmdParser.get<int>("seq");
bool isChunkExecute = cmdParser.get<bool>("chunk");
int head_num = cmdParser.get<int>("head");

bool read_file = cmdParser.get<bool>("readfile");

int chunk = 1;
if (isChunkExecute)
chunk = seqLength / 256;

int vocab_size = 49152;
int hidden_dim = cmdParser.get<int>("hds");
int ffn_hidden_dim = cmdParser.get<int>("ffn");

vector<string> in_strs = {
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nGive me a short introduction to large language model.<|im_end|>\n<|im_start|>assistant\n",
// " What can you do?",
// "Please introduce Beijing University of Posts and Telecommunications."};
};

string input_string;
if (read_file) {
std::ifstream file("./func_prompt.txt");
if (!file) {
std::cerr << "Failed to open the file!" << std::endl;
return 1;
}
std::stringstream buffer;
buffer << file.rdbuf();
input_string = buffer.str();
file.close(); // close the file
} else {
input_string = in_strs[0];
}

auto tokenizer = SmolLMTokenizer(vocab_path, merge_file_path);

std::unique_ptr<Context> npu_ctx_ptr(new Context());
auto *npu_ctx = npu_ctx_ptr.get();
std::unique_ptr<Context> cpu_ctx_ptr(new Context());
auto *cpu_ctx = cpu_ctx_ptr.get();
std::unique_ptr<Context> inter_ctx_ptr(new Context());
auto *inter_ctx = inter_ctx_ptr.get();

// cache_max should be longer than seqLength
modeling::phonelm_npu(npu_ctx, vocab_size, hidden_dim, ffn_hidden_dim, head_num, tokens_limit, seqLength, chunk);
modeling::phonelm_npu_cpu_inter(inter_ctx, vocab_size, hidden_dim, ffn_hidden_dim, head_num, tokens_limit, seqLength, chunk);
modeling::phonelm_cpu_q40(cpu_ctx, vocab_size, hidden_dim, ffn_hidden_dim, head_num, tokens_limit);

BackendConfig bn;
QNNNet npuNet(bn, npu_ctx);
npuNet.convert(npu_ctx, BackendType::MLLM_QNN, thread_num);
Net interNet(bn);
interNet.convert(inter_ctx->sub_param_, BackendType::MLLM_CPU, thread_num);
Net cpuNet(bn);
cpuNet.convert(cpu_ctx->sub_param_, BackendType::MLLM_CPU, thread_num);

ParamLoader npu_prefill_param_loader(npu_model_path);
ParamLoader cpu_decoding_param_loader(cpu_model_path);
ParamLoader inter_param_loader(npu_model_path);

QNNExecutor *npuExePtr;
if (isChunkExecute) {
npuExePtr = new QNNPipelineExecutor(&npu_prefill_param_loader);
} else {
npuExePtr = new QNNExecutor(&npu_prefill_param_loader);
}
auto &npuExe = *npuExePtr;
npuExe.setup(&npuNet);
Executor interExe(&inter_param_loader);
interExe.setup(&interNet);
Executor cpuExe(&cpu_decoding_param_loader);
cpuExe.setup(&cpuNet);

shared_ptr<Tensor> input = std::make_shared<Tensor>();

for (int str_i = 0; str_i < in_strs.size(); ++str_i) {
// auto in_str = in_strs[str_i];
auto [real_seq_length, input_tensor] = tokenizer.tokenizeWithPadding(input_string, seqLength, vocab_size);
auto input = std::make_shared<Tensor>(input_tensor);

if (chunk != 1)
npuExe.warmup(npu_ctx, &npuNet, {input});

std::cout << "real_seq_length: " << real_seq_length << std::endl;
std::cout << "[Q] " << input_string << std::endl;
std::cout << "[A] " << std::flush;

do {
// 1: Prefill stage using NPU chunk execute
if (chunk == 1)
npuExe.run(npu_ctx, &npuNet, {input});
else
npuExe.runExp(npu_ctx, &npuNet, {input});
auto result = npuExe.result();

// result[0]->printData<float>();
// exit(0);

// inter model for prefill-decode
interExe.run(&interNet, {result[0]});
result = interExe.result();

auto token_idx = postProcessing_prefill(result[0], input, real_seq_length);
if (token_idx == 2) { // "</s>"
break;
}
// exit(0);

auto out_token = tokenizer.detokenize({token_idx});
std::cout << out_token << std::flush;

auto prefill_cpu_backend = dynamic_cast<CPUBackend *>(npuNet.backends()[MLLM_CPU].get());
auto inter_cpu_backend = dynamic_cast<CPUBackend *>(interNet.backends()[MLLM_CPU].get());
auto decode_cpu_backend = dynamic_cast<CPUBackend *>(cpuNet.backends()[MLLM_CPU].get());
prefill_cpu_backend->setSequenceLength(real_seq_length);
prefill_cpu_backend->switchDecodeTag();
inter_cpu_backend->setSequenceLength(real_seq_length);
inter_cpu_backend->switchDecodeTag();
decode_cpu_backend->setSequenceLength(real_seq_length);
decode_cpu_backend->switchDecodeTag();

// // 2: Decoding stage using CPU execute
for (int step = real_seq_length; step < real_seq_length + 100; step++) {
cpuExe.run(&cpuNet, {input});
auto result = cpuExe.result();

auto token_idx = postProcessing(result[0], input);
if (token_idx == 2) { // "</s>"
break;
}

auto out_token = tokenizer.detokenize({token_idx});
std::cout << out_token << std::flush;

if (step == real_seq_length) {
prefill_cpu_backend->switchDecodeTag();
inter_cpu_backend->switchDecodeTag();
decode_cpu_backend->switchDecodeTag();
}
}
} while (false);
printf("\n");
}

std::cout << "====================" << std::endl;
npuExe.perf();
cpuExe.perf();

// free memory
// for (auto *op : npu_ctx->net_ops) {
// delete op;
// }
// for (auto *tensor : npu_ctx->net_tensors) {
// delete tensor;
// }

return 0;
}
#endif