feat: QNN Multi Chunk Execution in New Frontend #191

Merged: 6 commits, Nov 14, 2024
93 changes: 63 additions & 30 deletions examples/demo_phonelm_npu.cpp
@@ -1,4 +1,6 @@
#ifdef USE_QNN
#include "Module.hpp"
#include "Types.hpp"
#include <memory>
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/phonelm/modeling_phonelm.hpp"
@@ -12,16 +14,18 @@ int main(int argc, char **argv) {
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/phonelm_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/phonelm_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/PhoneLM-1.5B-Instruct-128.mllm");
cmdParser.add<string>("decoding", 'd', "specify mllm decoding model path", false, "../models/phonelm-1.5b-droidcall-q4_0_4_4.mllm");
cmdParser.add<string>("decoding", 'd', "specify mllm decoding model path", false, "../models/phonelm-1.5b-instruct-q4_0_4_4.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.add<int>("chunk", 'c', "chunk size", false, 64);
cmdParser.parse_check(argc, argv);

string vocab_path = cmdParser.get<string>("vocab");
string merge_path = cmdParser.get<string>("merge");
string model_path = cmdParser.get<string>("model");
string decoding_path = cmdParser.get<string>("decoding");
int tokens_limit = cmdParser.get<int>("limits");
int chunk_size = cmdParser.get<int>("chunk");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto tokenizer = SmolLMTokenizer(vocab_path, merge_path);
@@ -33,35 +37,64 @@ int main(int argc, char **argv) {

vector<string> in_strs = {
"Give me a short introduction to large language model.",
"What is the Beijing University of Posts and Telecommunications.",
"What is the meaning of life?",
"Hello, who are you?",
"What can you do?",
"Please introduce Beijing University of Posts and Telecommunications.",
};

// turn on the multi-chunk prefilling
Module::isMultiChunkPrefilling = true;

for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
auto [real_seq_length, input_tensor] = tokenizer.tokenizeWithPadding(input_str, 64, config.vocab_size);
std::cout << real_seq_length << endl;
std::cout << input_str << std::endl;
auto [real_seq_length, input_tensor] = tokenizer.tokenizePaddingByChunk(input_str, chunk_size, config.vocab_size);
const int seq_length_padding = (chunk_size - real_seq_length % chunk_size) + real_seq_length;
const int chunk_num = seq_length_padding / chunk_size;
bool isSwitched = false;
// std::cout << "real seq length: " << real_seq_length << " padding to: " << seq_length_padding << " chunk num: " << chunk_num << std::endl;
std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;

// tensor vectors to save the chunked tensors of the QNN prefilling input
vector<Tensor> chunked_tensors(chunk_num);
LlmTextGeneratorOpts opt{
.max_new_tokens = 1,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
.top_p = 0.f,
.is_padding = true,
.seq_before_padding = real_seq_length,
.chunk_size = chunk_size,
};
model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
auto out_string = tokenizer.detokenize({out_token});
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
std::cout << output_string << std::flush;
return true;
});

for (int chunk_id = 0; chunk_id < chunk_num; ++chunk_id) {
chunked_tensors[chunk_id].setBackend(Backend::global_backends[MLLM_CPU]);
chunked_tensors[chunk_id].setTtype(INPUT_TENSOR);
chunked_tensors[chunk_id].reshape(1, 1, chunk_size, 1);
chunked_tensors[chunk_id].setName("input-chunk-" + to_string(chunk_id));
chunked_tensors[chunk_id].deepCopyFrom(&input_tensor, false, {0, 0, chunk_id * chunk_size, 0});

model.generate(chunked_tensors[chunk_id], opt, [&](unsigned int out_token) -> bool {
if (i != 0 && !isSwitched && chunk_id == 0) {
// turn off switching at the first chunk of following inputs
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
if (chunk_id == chunk_num - 1) { // print the output of the last chunk
std::cout << output_string << std::flush;
}
return true;
});
Module::isFirstChunk = false;
}

// turn on switching, set sequence length and execution type
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(AUTOREGRESSIVE);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();

LlmTextGeneratorOpts decoding_opt{
.max_new_tokens = 100,
@@ -71,23 +104,23 @@ int main(int argc, char **argv) {
.top_p = 0.f,
.is_padding = false,
};
bool isSwitched = false;
decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool {
// call only once of switchDecodeTag
if (!isSwitched) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
isSwitched = false;
decoding_model.generate(chunked_tensors.back(), decoding_opt, [&](unsigned int out_token) -> bool {
if (!isSwitched) { // turn off switching
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
auto [isOk, print_string] = tokenizer.postprocess(out_string);
if (isOk) {
std::cout << print_string << std::flush;
} else {
return false;
}
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
std::cout << output_string << std::flush;
return true;
});
std::cout << "\n---------------" << std::endl;

// turn on switching, set sequence length and execution type
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(0);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
std::cout << "\n";
}
}
#endif
}
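
Note (not part of the diff): the padding arithmetic driving the chunk loop above reduces to the standalone C++ sketch below. The sequence length is hypothetical and the mllm types are omitted; also note that a prompt whose length is already a multiple of chunk_size still receives one extra full chunk of padding under this formula.

#include <iostream>

int main() {
    const int chunk_size = 64;      // same default as the demo's --chunk flag
    const int real_seq_length = 97; // hypothetical tokenized prompt length

    // Pad up to the next multiple of chunk_size, then split into chunks.
    const int seq_length_padding = (chunk_size - real_seq_length % chunk_size) + real_seq_length;
    const int chunk_num = seq_length_padding / chunk_size;

    std::cout << "padded length: " << seq_length_padding // 128
              << ", chunks: " << chunk_num << std::endl; // 2
    return 0;
}

Each chunk is then a chunk_size-long view into the padded input tensor (the deepCopyFrom call with offset {0, 0, chunk_id * chunk_size, 0}), and only the last chunk's output is printed.
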
8 changes: 3 additions & 5 deletions examples/demo_qwen2.5_npu.cpp
@@ -1,4 +1,3 @@
#ifdef USE_QNN
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/qwen/configuration_qwen.hpp"
@@ -61,7 +60,7 @@ int main(int argc, char **argv) {
});

static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();

LlmTextGeneratorOpts decoding_opt{
.max_new_tokens = 100,
@@ -75,7 +74,7 @@ int main(int argc, char **argv) {
decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool {
// call only once of switchDecodeTag
if (!isSwitched) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
@@ -89,5 +88,4 @@ int main(int argc, char **argv) {
});
std::cout << "\n---------------" << std::endl;
}
}
#endif
}
8 changes: 3 additions & 5 deletions examples/demo_qwen_npu.cpp
@@ -1,4 +1,3 @@
#ifdef USE_QNN
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/qwen/configuration_qwen.hpp"
@@ -61,7 +60,7 @@ int main(int argc, char **argv) {
});

static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();

LlmTextGeneratorOpts decoding_opt{
.max_new_tokens = 100,
@@ -75,7 +74,7 @@ int main(int argc, char **argv) {
decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool {
// call only once of switchDecodeTag
if (!isSwitched) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
@@ -89,5 +88,4 @@ int main(int argc, char **argv) {
});
std::cout << "\n---------------" << std::endl;
}
}
#endif
}
12 changes: 6 additions & 6 deletions examples/main_phonelm_npu.cpp
@@ -190,11 +190,11 @@ int main(int argc, char **argv) {
auto inter_cpu_backend = dynamic_cast<CPUBackend *>(interNet.backends()[MLLM_CPU].get());
auto decode_cpu_backend = dynamic_cast<CPUBackend *>(cpuNet.backends()[MLLM_CPU].get());
prefill_cpu_backend->setSequenceLength(real_seq_length);
prefill_cpu_backend->switchDecodeTag();
prefill_cpu_backend->toggleSwitching();
inter_cpu_backend->setSequenceLength(real_seq_length);
inter_cpu_backend->switchDecodeTag();
inter_cpu_backend->toggleSwitching();
decode_cpu_backend->setSequenceLength(real_seq_length);
decode_cpu_backend->switchDecodeTag();
decode_cpu_backend->toggleSwitching();

// // 2: Decoding stage using CPU execute
for (int step = real_seq_length; step < real_seq_length + 100; step++) {
@@ -210,9 +210,9 @@ int main(int argc, char **argv) {
std::cout << out_token << std::flush;

if (step == real_seq_length) {
prefill_cpu_backend->switchDecodeTag();
inter_cpu_backend->switchDecodeTag();
decode_cpu_backend->switchDecodeTag();
prefill_cpu_backend->toggleSwitching();
inter_cpu_backend->toggleSwitching();
decode_cpu_backend->toggleSwitching();
}
}
} while (false);
12 changes: 6 additions & 6 deletions examples/main_qwen_npu.cpp
@@ -187,11 +187,11 @@ int main(int argc, char **argv) {
auto inter_cpu_backend = dynamic_cast<CPUBackend *>(interNet.backends()[MLLM_CPU].get());
auto decode_cpu_backend = dynamic_cast<CPUBackend *>(cpuNet.backends()[MLLM_CPU].get());
prefill_cpu_backend->setSequenceLength(real_seq_length);
prefill_cpu_backend->switchDecodeTag();
prefill_cpu_backend->toggleSwitching();
inter_cpu_backend->setSequenceLength(real_seq_length);
inter_cpu_backend->switchDecodeTag();
inter_cpu_backend->toggleSwitching();
decode_cpu_backend->setSequenceLength(real_seq_length);
decode_cpu_backend->switchDecodeTag();
decode_cpu_backend->toggleSwitching();

// // 2: Decoding stage using CPU execute
for (int step = real_seq_length; step < real_seq_length + 100; step++) {
@@ -207,9 +207,9 @@ int main(int argc, char **argv) {
std::cout << out_token << std::flush;

if (step == real_seq_length) {
prefill_cpu_backend->switchDecodeTag();
inter_cpu_backend->switchDecodeTag();
decode_cpu_backend->switchDecodeTag();
prefill_cpu_backend->toggleSwitching();
inter_cpu_backend->toggleSwitching();
decode_cpu_backend->toggleSwitching();
}
}
} while (false);
3 changes: 2 additions & 1 deletion include/Types.hpp
@@ -36,6 +36,7 @@ enum TensorStatus {
// TENSOR_DYNAMIC,
TENSOR_STATIC_INIT,
TENSOR_STATIC_READY,
TENSOR_UNDEFINED,
};

enum ErrorCode {
@@ -141,7 +142,7 @@ enum RoPEType {
MLAROPE = 5,
};

enum QNNExecutionType {
enum ExecutionType {
PROMPT = 0,
AUTOREGRESSIVE = 1,
};
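
Note (illustrative, not part of the diff): the renamed ExecutionType enum is what the demos pass to CPUBackend::setExecutionType to flip between the chunked prefill phase (PROMPT) and token-by-token decoding (AUTOREGRESSIVE). A minimal sketch of that kind of dispatch, with a made-up run_step function standing in for the backend:

#include <iostream>

enum ExecutionType { PROMPT = 0, AUTOREGRESSIVE = 1 };

// run_step is hypothetical; it only illustrates how the flag gates the two paths.
void run_step(ExecutionType type) {
    if (type == PROMPT) {
        std::cout << "prefill path: process the padded prompt chunk by chunk\n";
    } else {
        std::cout << "decode path: generate one token per forward pass\n";
    }
}

int main() {
    run_step(PROMPT);
    run_step(AUTOREGRESSIVE);
    return 0;
}
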
2 changes: 1 addition & 1 deletion scripts/build_qnn_android.sh
@@ -16,4 +16,4 @@ cmake .. \
-DQNN_VALIDATE_NODE=ON \
-DMLLM_BUILD_XNNPACK_BACKEND=OFF

make -j16
make -j40
44 changes: 44 additions & 0 deletions scripts/run_phonelm_qnn.sh
@@ -0,0 +1,44 @@
#!/bin/bash

adb shell mkdir -p /data/local/tmp/mllm/vocab
adb shell mkdir -p /data/local/tmp/mllm/qnn-lib

adb push ../vocab/phonelm_vocab.mllm /data/local/tmp/mllm/vocab/


if ! adb shell [ -f "/data/local/tmp/mllm/models/PhoneLM-1.5B-Instruct-128.mllm" ]; then
adb push ../models/PhoneLM-1.5B-Instruct-128.mllm "/data/local/tmp/mllm/models/PhoneLM-1.5B-Instruct-128.mllm"
else
echo "PhoneLM-1.5B-Instruct-128 file already exists"
fi


if ! adb shell [ -f "/data/local/tmp/mllm/models//phonelm-1.5b-instruct-q4_0_4_4.mllm" ]; then
adb push ../models//phonelm-1.5b-instruct-q4_0_4_4.mllm "/data/local/tmp/mllm/models//phonelm-1.5b-instruct-q4_0_4_4.mllm"
else
echo "/phonelm-1.5b-instruct-q4_0_4_4.mllm file already exists"
fi

LIBPATH=../src/backends/qnn/qualcomm_ai_engine_direct_220/
ANDR_LIB=$LIBPATH/lib/aarch64-android
OP_PATH=../src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/build
DEST=/data/local/tmp/mllm/qnn-lib

adb push $ANDR_LIB/libQnnHtp.so $DEST
adb push $ANDR_LIB/libQnnHtpV75Stub.so $DEST
adb push $ANDR_LIB/libQnnHtpPrepare.so $DEST
adb push $ANDR_LIB/libQnnHtpProfilingReader.so $DEST
adb push $ANDR_LIB/libQnnHtpOptraceProfilingReader.so $DEST
adb push $ANDR_LIB/libQnnHtpV75CalculatorStub.so $DEST
adb push $LIBPATH/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so $DEST
adb push $OP_PATH/aarch64-android/libQnnLLaMAPackage.so $DEST/libQnnLLaMAPackage_CPU.so
adb push $OP_PATH/hexagon-v75/libQnnLLaMAPackage.so $DEST/libQnnLLaMAPackage_HTP.so


if [ $? -ne 0 ]; then
echo "adb push failed"
exit 1
fi

adb push ../bin-arm/demo_phonelm_npu /data/local/tmp/mllm/bin/
adb shell "cd /data/local/tmp/mllm/bin && export LD_LIBRARY_PATH=/data/local/tmp/mllm/qnn-lib && export ADSP_LIBRARY_PATH=/data/local/tmp/mllm/qnn-lib && ./demo_phonelm_npu"
21 changes: 18 additions & 3 deletions src/Generate.hpp
@@ -29,6 +29,7 @@ struct LlmTextGeneratorOpts {
float top_p = 0.92;
bool is_padding = false;
int seq_before_padding = 0;
int chunk_size = -1;
};

template <typename T>
@@ -51,12 +52,15 @@ enum class LLmTextGeneratorType : int32_t {
class _LlmTextGenerateMethod {
bool is_padding = false;
int seq_before_padding = 0;
int chunk_size = -1;

public:
virtual ~_LlmTextGenerateMethod() = default;
virtual unsigned int generate(Tensor &t) = 0;
inline void setPadding(bool is_padding, int seq_before_padding) {
inline void setPadding(bool is_padding, int seq_before_padding, int chunk_size) {
this->is_padding = is_padding;
this->seq_before_padding = seq_before_padding;
this->chunk_size = chunk_size;
}
inline void _tensor_to_vec(Tensor &t, std::vector<float> &scores) {
assert(t.batch() == 1 && "Batch size of result is not 1. Which is not supported for now.");
Expand All @@ -65,7 +69,11 @@ class _LlmTextGenerateMethod {
int _seq = t.sequence() - 1;
// padding prefill for QNN
if (is_padding) {
_seq = seq_before_padding - 1;
if (chunk_size > 0) {
_seq = (seq_before_padding - 1) % chunk_size;
} else {
_seq = seq_before_padding - 1;
}
}
for (int i = 0; i < _dims; ++i) {
auto value = t.dataAt<float>(0, 0, _seq, i);
@@ -159,14 +167,21 @@ class LlmTextGenerator {

// padding prefill for QNN
if (opt.is_padding) {
m_method_class->setPadding(opt.is_padding, opt.seq_before_padding);
m_method_class->setPadding(opt.is_padding, opt.seq_before_padding, opt.chunk_size);
}
}

inline unsigned int generate(Tensor &t) {
return m_method_class->generate(t);
}

inline unsigned int generate(Tensor &t, const LlmTextGeneratorOpts &opt) {
if (opt.is_padding) {
m_method_class->setPadding(opt.is_padding, opt.seq_before_padding, opt.chunk_size);
}
return m_method_class->generate(t);
}

inline LLmTextGeneratorType type() {
return m_type;
}
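
Note (not part of the diff): the new chunk_size handling in _tensor_to_vec picks which row of the logits tensor to sample from. With chunked prefill the logits tensor spans a single chunk, so the position of the last real token is taken modulo chunk_size. A standalone sketch with hypothetical numbers:

#include <iostream>

int logit_row(int seq_before_padding, int chunk_size) {
    // Mirrors the branch added above: chunked padded prefill vs. plain padded prefill.
    if (chunk_size > 0) return (seq_before_padding - 1) % chunk_size;
    return seq_before_padding - 1;
}

int main() {
    // 97 real tokens padded to 128 with chunk_size = 64: the last real token is
    // row 96 of the full sequence, i.e. row 32 within the final 64-token chunk.
    std::cout << logit_row(97, 64) << std::endl; // 32
    std::cout << logit_row(97, -1) << std::endl; // 96
    return 0;
}
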