From e428bb5a9a4964785a078342e81de0d6546db815 Mon Sep 17 00:00:00 2001 From: oreomaker Date: Fri, 8 Nov 2024 16:03:42 +0800 Subject: [PATCH 1/4] dev: qnn multi input inference developing --- examples/demo_phonelm_npu.cpp | 23 +++++++++++++++++---- examples/demo_qwen_npu.cpp | 4 ++-- examples/main_phonelm_npu.cpp | 12 +++++------ examples/main_qwen_npu.cpp | 12 +++++------ include/Types.hpp | 2 +- src/backends/cpu/CPUBackend.hpp | 15 ++++++++++---- src/backends/cpu/CPUIRoPE.cpp | 8 +++++++ src/backends/cpu/CPUKVCacheNPU.cpp | 12 ++++++++--- src/backends/cpu/CPURoPE.cpp | 7 +++++++ src/backends/cpu/CPUSoftMax.cpp | 2 +- src/backends/qnn/QNNExecutor.hpp | 2 +- src/models/phonelm/modeling_phonelm_npu.hpp | 6 +++++- 12 files changed, 76 insertions(+), 29 deletions(-) diff --git a/examples/demo_phonelm_npu.cpp b/examples/demo_phonelm_npu.cpp index 6d774230..d4ce1484 100644 --- a/examples/demo_phonelm_npu.cpp +++ b/examples/demo_phonelm_npu.cpp @@ -1,3 +1,4 @@ +#include "Types.hpp" #ifdef USE_QNN #include "backends/cpu/CPUBackend.hpp" #include "cmdline.h" @@ -31,6 +32,8 @@ int main(int argc, char **argv) { vector in_strs = { "Give me a short introduction to large language model.", + "What is the Beijing University of Posts and Telecommunications.", + "What is the meaning of life?", }; for (int i = 0; i < in_strs.size(); ++i) { @@ -50,7 +53,13 @@ int main(int argc, char **argv) { .is_padding = true, .seq_before_padding = real_seq_length, }; + bool isSwitched = false; model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool { + if (i != 0 && !isSwitched) { + // turn off switching + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); + isSwitched = true; + } auto out_string = tokenizer.detokenize({out_token}); auto [not_end, output_string] = tokenizer.postprocess(out_string); if (!not_end) { return false; } @@ -58,8 +67,10 @@ int main(int argc, char **argv) { return true; }); + // turn on switching, set sequence length and execution type static_cast(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length); - static_cast(Backend::global_backends[MLLM_CPU])->switchDecodeTag(); + static_cast(Backend::global_backends[MLLM_CPU])->setExecutionType(AUTOREGRESSIVE); + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); LlmTextGeneratorOpts decoding_opt{ .max_new_tokens = 100, @@ -69,11 +80,11 @@ int main(int argc, char **argv) { .top_p = 0.f, .is_padding = false, }; - bool isSwitched = false; + isSwitched = false; decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool { - // call only once of switchDecodeTag if (!isSwitched) { - static_cast(Backend::global_backends[MLLM_CPU])->switchDecodeTag(); + // turn off switching + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); isSwitched = true; } auto out_string = tokenizer.detokenize({out_token}); @@ -86,6 +97,10 @@ int main(int argc, char **argv) { return true; }); std::cout << "\n---------------" << std::endl; + // turn on switching, set sequence length and execution type + static_cast(Backend::global_backends[MLLM_CPU])->setSequenceLength(0); + static_cast(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT); + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); } } #endif \ No newline at end of file diff --git a/examples/demo_qwen_npu.cpp b/examples/demo_qwen_npu.cpp index 421e7d0e..2ab916c7 100644 --- a/examples/demo_qwen_npu.cpp +++ b/examples/demo_qwen_npu.cpp @@ -61,7 +61,7 @@ int main(int argc, char **argv) { }); 
static_cast(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length); - static_cast(Backend::global_backends[MLLM_CPU])->switchDecodeTag(); + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); LlmTextGeneratorOpts decoding_opt{ .max_new_tokens = 100, @@ -75,7 +75,7 @@ int main(int argc, char **argv) { decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool { // call only once of switchDecodeTag if (!isSwitched) { - static_cast(Backend::global_backends[MLLM_CPU])->switchDecodeTag(); + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); isSwitched = true; } auto out_string = tokenizer.detokenize({out_token}); diff --git a/examples/main_phonelm_npu.cpp b/examples/main_phonelm_npu.cpp index 0cff3898..af0b6186 100644 --- a/examples/main_phonelm_npu.cpp +++ b/examples/main_phonelm_npu.cpp @@ -190,11 +190,11 @@ int main(int argc, char **argv) { auto inter_cpu_backend = dynamic_cast(interNet.backends()[MLLM_CPU].get()); auto decode_cpu_backend = dynamic_cast(cpuNet.backends()[MLLM_CPU].get()); prefill_cpu_backend->setSequenceLength(real_seq_length); - prefill_cpu_backend->switchDecodeTag(); + prefill_cpu_backend->toggleSwitching(); inter_cpu_backend->setSequenceLength(real_seq_length); - inter_cpu_backend->switchDecodeTag(); + inter_cpu_backend->toggleSwitching(); decode_cpu_backend->setSequenceLength(real_seq_length); - decode_cpu_backend->switchDecodeTag(); + decode_cpu_backend->toggleSwitching(); // // 2: Decoding stage using CPU execute for (int step = real_seq_length; step < real_seq_length + 100; step++) { @@ -210,9 +210,9 @@ int main(int argc, char **argv) { std::cout << out_token << std::flush; if (step == real_seq_length) { - prefill_cpu_backend->switchDecodeTag(); - inter_cpu_backend->switchDecodeTag(); - decode_cpu_backend->switchDecodeTag(); + prefill_cpu_backend->toggleSwitching(); + inter_cpu_backend->toggleSwitching(); + decode_cpu_backend->toggleSwitching(); } } } while (false); diff --git a/examples/main_qwen_npu.cpp b/examples/main_qwen_npu.cpp index b096100b..4adc7fa7 100644 --- a/examples/main_qwen_npu.cpp +++ b/examples/main_qwen_npu.cpp @@ -187,11 +187,11 @@ int main(int argc, char **argv) { auto inter_cpu_backend = dynamic_cast(interNet.backends()[MLLM_CPU].get()); auto decode_cpu_backend = dynamic_cast(cpuNet.backends()[MLLM_CPU].get()); prefill_cpu_backend->setSequenceLength(real_seq_length); - prefill_cpu_backend->switchDecodeTag(); + prefill_cpu_backend->toggleSwitching(); inter_cpu_backend->setSequenceLength(real_seq_length); - inter_cpu_backend->switchDecodeTag(); + inter_cpu_backend->toggleSwitching(); decode_cpu_backend->setSequenceLength(real_seq_length); - decode_cpu_backend->switchDecodeTag(); + decode_cpu_backend->toggleSwitching(); // // 2: Decoding stage using CPU execute for (int step = real_seq_length; step < real_seq_length + 100; step++) { @@ -207,9 +207,9 @@ int main(int argc, char **argv) { std::cout << out_token << std::flush; if (step == real_seq_length) { - prefill_cpu_backend->switchDecodeTag(); - inter_cpu_backend->switchDecodeTag(); - decode_cpu_backend->switchDecodeTag(); + prefill_cpu_backend->toggleSwitching(); + inter_cpu_backend->toggleSwitching(); + decode_cpu_backend->toggleSwitching(); } } } while (false); diff --git a/include/Types.hpp b/include/Types.hpp index 70cd9ae4..3ce92ff4 100644 --- a/include/Types.hpp +++ b/include/Types.hpp @@ -140,7 +140,7 @@ enum RoPEType { MLAROPE = 5, }; -enum QNNExecutionType { +enum ExecutionType { PROMPT = 0, AUTOREGRESSIVE = 1, 
}; diff --git a/src/backends/cpu/CPUBackend.hpp b/src/backends/cpu/CPUBackend.hpp index 816935d9..75f96412 100644 --- a/src/backends/cpu/CPUBackend.hpp +++ b/src/backends/cpu/CPUBackend.hpp @@ -39,11 +39,17 @@ class CPUBackend final : public Backend { int getSequenceLength() { return sequence_length_; } - void switchDecodeTag() { - isPrefillToDecode = !isPrefillToDecode; + void toggleSwitching() { + isSwitchingStage = !isSwitchingStage; } bool isStageSwitching() { - return isPrefillToDecode; + return isSwitchingStage; + } + void setExecutionType(ExecutionType type) { + execution_type = type; + } + ExecutionType getExecutionType() { + return execution_type; } #endif private: @@ -51,7 +57,8 @@ class CPUBackend final : public Backend { std::map map_function_; #ifdef USE_QNN int sequence_length_ = 0; - bool isPrefillToDecode = false; + bool isSwitchingStage = false; + ExecutionType execution_type = PROMPT; #endif }; diff --git a/src/backends/cpu/CPUIRoPE.cpp b/src/backends/cpu/CPUIRoPE.cpp index 46562df2..12b4ba39 100644 --- a/src/backends/cpu/CPUIRoPE.cpp +++ b/src/backends/cpu/CPUIRoPE.cpp @@ -362,6 +362,14 @@ void CPUIRoPE::rope_mla(shared_ptr input, shared_ptr output){ } ErrorCode CPUIRoPE::execute(vector> inputs, vector> outputs) { + // if use QNN, when a new prompt input, the seq should be reset to 0 here as the setUp is not called +#ifdef USE_QNN + auto cpuBackend = dynamic_cast(backend_); + if (cpuBackend->isStageSwitching() && cpuBackend->getExecutionType() == PROMPT) { + h_cnt_ = 0; + } +#endif + auto &input = inputs[0]; auto &output = outputs[0]; auto out_dtype = output->dtype(); diff --git a/src/backends/cpu/CPUKVCacheNPU.cpp b/src/backends/cpu/CPUKVCacheNPU.cpp index 582a52e8..ae2c24bd 100644 --- a/src/backends/cpu/CPUKVCacheNPU.cpp +++ b/src/backends/cpu/CPUKVCacheNPU.cpp @@ -34,7 +34,7 @@ ErrorCode CPUKVCacheNPU::reshape(vector> inputs, vector(backend_); - if (cpuBackend->isStageSwitching()) { + if (cpuBackend->isStageSwitching() && cpuBackend->getExecutionType() == AUTOREGRESSIVE) { cache_seq_len_ = cpuBackend->getSequenceLength(); isDecoding = true; } @@ -57,9 +57,16 @@ ErrorCode CPUKVCacheNPU::load(AbstructLoader &loader) { } ErrorCode CPUKVCacheNPU::execute(vector> inputs, vector> outputs) { + // if a new prompt is given, the cache should be updated + auto cpuBackend = dynamic_cast(backend_); + if (cpuBackend->isStageSwitching() && cpuBackend->getExecutionType() == PROMPT) { + isDecoding = false; + cache_seq_len_ = 0; + outputs[0]->deepCopyFrom(cache_, false, {0, 0, cache_seq_len_ / cache_limit_, 0}); + } + // when decoding, the input will deepCopy from cache, no need to execute if (isDecoding) { - int cache_seq_len_old = cache_seq_len_; cache_seq_len_ += inputs[0]->sequence(); return MLLM_NO_ERROR; } @@ -125,7 +132,6 @@ ErrorCode CPUKVCacheNPU::execute(vector> inputs, vectorsequence(); return Op::execute(inputs, outputs); diff --git a/src/backends/cpu/CPURoPE.cpp b/src/backends/cpu/CPURoPE.cpp index 493b2847..b00ab8aa 100644 --- a/src/backends/cpu/CPURoPE.cpp +++ b/src/backends/cpu/CPURoPE.cpp @@ -331,6 +331,13 @@ void CPURoPE::rope_mla(shared_ptr input, shared_ptr output) { } ErrorCode CPURoPE::execute(vector> inputs, vector> outputs) { + // if use QNN, when a new prompt input, the seq should be reset to 0 here as the setUp is not called +#ifdef USE_QNN + auto cpuBackend = dynamic_cast(backend_); + if (cpuBackend->isStageSwitching() && cpuBackend->getExecutionType() == PROMPT) { + h_cnt_ = 0; + } +#endif auto &input = inputs[0]; auto &output = outputs[0]; auto out_dtype = 
output->dtype(); diff --git a/src/backends/cpu/CPUSoftMax.cpp b/src/backends/cpu/CPUSoftMax.cpp index 45545eaa..d9c5c372 100644 --- a/src/backends/cpu/CPUSoftMax.cpp +++ b/src/backends/cpu/CPUSoftMax.cpp @@ -37,7 +37,7 @@ ErrorCode CPUSoftMax::execute(vector> inputs, vectordimension() - input->sequence(); -#elif defined(USE_QNN) // TODO: 就前端才用到,一定要记得删除 +#elif defined(USE_QNN) // TODO: 旧前端才用到,一定要记得删除 old_dim = input->dimension() - input->sequence(); #endif } diff --git a/src/backends/qnn/QNNExecutor.hpp b/src/backends/qnn/QNNExecutor.hpp index 6d335c08..5f29d15e 100644 --- a/src/backends/qnn/QNNExecutor.hpp +++ b/src/backends/qnn/QNNExecutor.hpp @@ -63,7 +63,7 @@ class QNNExecutor : public Executor { protected: bool isSetup_ = false; - QNNExecutionType executionType_ = PROMPT; + ExecutionType executionType_ = PROMPT; }; class QNNPipelineExecutor : public QNNExecutor { diff --git a/src/models/phonelm/modeling_phonelm_npu.hpp b/src/models/phonelm/modeling_phonelm_npu.hpp index 29554c32..0b03cb1d 100644 --- a/src/models/phonelm/modeling_phonelm_npu.hpp +++ b/src/models/phonelm/modeling_phonelm_npu.hpp @@ -124,7 +124,11 @@ class PhoneLMQKVmm final : public Module { v = v_cache(v); auto qk = Tensor::mm(q, k.transpose(Chl::SEQUENCE, Chl::DIMENSION)); - qk = softmax(qk, k_cache.getCacheSeqLen()); + if (k_cache.ready() && v_cache.ready()) { + qk = softmax(qk, k_cache.getCacheSeqLen()); + } else { + qk = softmax(qk); + } auto o = Tensor::mm(qk, v); o = o_quantize(o); From f94937d6cdfe6256292bc0d44fab7389c99915d0 Mon Sep 17 00:00:00 2001 From: oreomaker Date: Wed, 13 Nov 2024 17:03:30 +0800 Subject: [PATCH 2/4] feat: qnn multi chunk prefilling in new frontend refactor: qnn module setup skip in following chunks todo: multi input --- examples/demo_phonelm_npu.cpp | 66 ++++++++++++++++------- include/Types.hpp | 1 + src/Generate.hpp | 13 +++-- src/Module.cpp | 3 ++ src/Module.hpp | 15 ++++++ src/backends/cpu/CPUKVCacheNPU.cpp | 12 ++--- src/backends/cpu/CPURoPE.cpp | 7 --- src/backends/xnnpack/third_party/fmt | 1 + src/models/smollm/tokenization_smollm.hpp | 52 ++++++++++++++++++ 9 files changed, 133 insertions(+), 37 deletions(-) create mode 160000 src/backends/xnnpack/third_party/fmt diff --git a/examples/demo_phonelm_npu.cpp b/examples/demo_phonelm_npu.cpp index d4ce1484..7f568f58 100644 --- a/examples/demo_phonelm_npu.cpp +++ b/examples/demo_phonelm_npu.cpp @@ -1,4 +1,6 @@ +#include "Module.hpp" #include "Types.hpp" +#include #ifdef USE_QNN #include "backends/cpu/CPUBackend.hpp" #include "cmdline.h" @@ -15,12 +17,14 @@ int main(int argc, char **argv) { cmdParser.add("model", 'm', "specify mllm model path", false, "../models/PhoneLM-1.5B-Instruct-128.mllm"); cmdParser.add("limits", 'l', "max KV cache size", false, 400); cmdParser.add("thread", 't', "num of threads", false, 4); + cmdParser.add("chunk", 'c', "chunk size", false, 64); cmdParser.parse_check(argc, argv); string vocab_path = cmdParser.get("vocab"); string merge_path = cmdParser.get("merge"); string model_path = cmdParser.get("model"); int tokens_limit = cmdParser.get("limits"); + int chunk_size = cmdParser.get("chunk"); CPUBackend::cpu_threads = cmdParser.get("thread"); auto tokenizer = SmolLMTokenizer(vocab_path, merge_path); @@ -36,36 +40,57 @@ int main(int argc, char **argv) { "What is the meaning of life?", }; + // turn on the multi-chunk prefilling + Module::isMultiChunkPrefilling = true; + for (int i = 0; i < in_strs.size(); ++i) { auto input_str = tokenizer.apply_chat_template(in_strs[i]); - auto [real_seq_length, input_tensor] = 
tokenizer.tokenizeWithPadding(input_str, 64, config.vocab_size); - std::cout << real_seq_length << endl; - std::cout << input_str << std::endl; + auto [real_seq_length, input_tensor] = tokenizer.tokenizePaddingByChunk(input_str, chunk_size, config.vocab_size); + + const int seq_length_padding = (chunk_size - real_seq_length % chunk_size) + real_seq_length; + const int chunk_num = seq_length_padding / chunk_size; + bool isSwitched = false; + + std::cout << "real seq length: " << real_seq_length << " padding to: " << seq_length_padding << " chunk num: " << chunk_num << std::endl; + std::cout << "[Q] " << in_strs[i] << std::endl; std::cout << "[A] " << std::flush; + // tensor vectors to save the chunked tensors of the QNN prefilling input + vector chunked_tensors(chunk_num); LlmTextGeneratorOpts opt{ .max_new_tokens = 1, .do_sample = false, - .temperature = 0.3f, - .top_k = 50, - .top_p = 0.f, .is_padding = true, .seq_before_padding = real_seq_length, + .chunk_size = chunk_size, }; - bool isSwitched = false; - model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool { - if (i != 0 && !isSwitched) { - // turn off switching - static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); - isSwitched = true; - } - auto out_string = tokenizer.detokenize({out_token}); - auto [not_end, output_string] = tokenizer.postprocess(out_string); - if (!not_end) { return false; } - std::cout << output_string << std::flush; - return true; - }); + + for (int chunk_id = 0; chunk_id < chunk_num; ++chunk_id) { + chunked_tensors[chunk_id].setBackend(Backend::global_backends[MLLM_CPU]); + chunked_tensors[chunk_id].setTtype(INPUT_TENSOR); + chunked_tensors[chunk_id].reshape(1, 1, chunk_size, 1); + chunked_tensors[chunk_id].setName("input-chunk-" + to_string(chunk_id)); + chunked_tensors[chunk_id].deepCopyFrom(&input_tensor, false, {0, 0, chunk_id * chunk_size, 0}); + + model.generate(chunked_tensors[chunk_id], opt, [&](unsigned int out_token) -> bool { + if (i != 0 && !isSwitched && chunk_id == 0) { + // turn off switching at the first chunk of following inputs + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); + isSwitched = true; + } + auto out_string = tokenizer.detokenize({out_token}); + auto [not_end, output_string] = tokenizer.postprocess(out_string); + if (!not_end) { return false; } + + if (chunk_id == chunk_num - 1) { // print the output of the last chunk + std::cout << output_string << std::flush; + } + return true; + }); + + Module::isFirstChunk = false; + } // turn on switching, set sequence length and execution type static_cast(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length); @@ -80,8 +105,9 @@ int main(int argc, char **argv) { .top_p = 0.f, .is_padding = false, }; + isSwitched = false; - decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool { + decoding_model.generate(chunked_tensors.back(), decoding_opt, [&](unsigned int out_token) -> bool { if (!isSwitched) { // turn off switching static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); diff --git a/include/Types.hpp b/include/Types.hpp index 9b634493..040e6712 100644 --- a/include/Types.hpp +++ b/include/Types.hpp @@ -36,6 +36,7 @@ enum TensorStatus { // TENSOR_DYNAMIC, TENSOR_STATIC_INIT, TENSOR_STATIC_READY, + TENSOR_UNDEFINED, }; enum ErrorCode { diff --git a/src/Generate.hpp b/src/Generate.hpp index 7c5b11b9..4ae9a863 100644 --- a/src/Generate.hpp +++ b/src/Generate.hpp @@ -29,6 +29,7 @@ struct LlmTextGeneratorOpts { float top_p = 0.92; 
bool is_padding = false; int seq_before_padding = 0; + int chunk_size = -1; }; template @@ -51,12 +52,14 @@ enum class LLmTextGeneratorType : int32_t { class _LlmTextGenerateMethod { bool is_padding = false; int seq_before_padding = 0; + int chunk_size = -1; public: virtual ~_LlmTextGenerateMethod() = default; virtual unsigned int generate(Tensor &t) = 0; - inline void setPadding(bool is_padding, int seq_before_padding) { + inline void setPadding(bool is_padding, int seq_before_padding, int chunk_size) { this->is_padding = is_padding; this->seq_before_padding = seq_before_padding; + this->chunk_size = chunk_size; } inline void _tensor_to_vec(Tensor &t, std::vector &scores) { assert(t.batch() == 1 && "Batch size of result is not 1. Which is not supported for now."); @@ -65,7 +68,11 @@ class _LlmTextGenerateMethod { int _seq = t.sequence() - 1; // padding prefill for QNN if (is_padding) { - _seq = seq_before_padding - 1; + if (chunk_size > 0) { + _seq = (seq_before_padding - 1) % chunk_size; + } else { + _seq = seq_before_padding - 1; + } } for (int i = 0; i < _dims; ++i) { auto value = t.dataAt(0, 0, _seq, i); @@ -159,7 +166,7 @@ class LlmTextGenerator { // padding prefill for QNN if (opt.is_padding) { - m_method_class->setPadding(opt.is_padding, opt.seq_before_padding); + m_method_class->setPadding(opt.is_padding, opt.seq_before_padding, opt.chunk_size); } } diff --git a/src/Module.cpp b/src/Module.cpp index 942e9941..95479179 100644 --- a/src/Module.cpp +++ b/src/Module.cpp @@ -13,6 +13,9 @@ namespace mllm { // The llm_model_ptr is a pointer to the outmost module Module *Module::llm_model_ptr; +bool Module::isMultiChunkPrefilling = false; +bool Module::isFirstChunk = true; + int Module::listIdx; int Module::runlistIdx; // TensorStatus Tensor::tensor_status; diff --git a/src/Module.hpp b/src/Module.hpp index 92f5e48e..69256561 100644 --- a/src/Module.hpp +++ b/src/Module.hpp @@ -39,6 +39,10 @@ class Module { bool doLoad = false; static Module *llm_model_ptr; + // tag to indicate the multi-chunk prefilling + static bool isMultiChunkPrefilling; + // tag to indicate the first chunk + static bool isFirstChunk; static int listIdx; static int runlistIdx; @@ -188,6 +192,11 @@ class Module { if (inputs[0].sequence() != 1 && !last_shape_bshd_.empty()) { // if LLM/VLLM model, the `need_setup` should be `true` if (input.batch() == last_shape_bshd_[i][0] & input.sequence() == last_shape_bshd_[i][1] & input.head() == last_shape_bshd_[i][2] & input.dimension() == last_shape_bshd_[i][3]) { + // if it is the QNN multi-chunk prefilling, the `need_setup` should be `true` to reshape & setUp CPU Ops + if (Module::isMultiChunkPrefilling) { + need_setup = true; + break; + } need_setup = false; } } @@ -215,6 +224,12 @@ class Module { } else { // inner Modules // offload according to the backends' info inited during loading if (Tensor::tensor_status == TENSOR_STATIC_INIT && device_ != MLLM_CPU) { // backend specific module reshape & setup + if (Module::isMultiChunkPrefilling && !Module::isFirstChunk) { // set to TENSOR_UNDEFINED and SKIP executing qnn layers + Tensor::tensor_status = TENSOR_UNDEFINED; + auto outputs = Forward(inputs, anyArgs); + Tensor::tensor_status = TENSOR_STATIC_INIT; + return outputs; + } auto inputs_vec = vector>(); auto outputs_vec = vector>(); for (auto &i : inputs) { diff --git a/src/backends/cpu/CPUKVCacheNPU.cpp b/src/backends/cpu/CPUKVCacheNPU.cpp index 06d32ef8..c7b24236 100644 --- a/src/backends/cpu/CPUKVCacheNPU.cpp +++ b/src/backends/cpu/CPUKVCacheNPU.cpp @@ -38,6 +38,11 @@ 
ErrorCode CPUKVCacheNPU::reshape(vector> inputs, vectorgetSequenceLength(); isDecoding = true; } + // if a new prompt is given, the cache should be updated + if (cpuBackend->isStageSwitching() && cpuBackend->getExecutionType() == PROMPT) { + cache_seq_len_ = cpuBackend->getSequenceLength(); + isDecoding = false; + } #endif outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence() + cache_seq_len_, inputs[0]->dimension()); @@ -56,13 +61,6 @@ ErrorCode CPUKVCacheNPU::load(AbstructLoader &loader) { } ErrorCode CPUKVCacheNPU::execute(vector> inputs, vector> outputs) { - // if a new prompt is given, the cache should be updated - auto cpuBackend = dynamic_cast(backend_); - if (cpuBackend->isStageSwitching() && cpuBackend->getExecutionType() == PROMPT) { - isDecoding = false; - cache_seq_len_ = 0; - outputs[0]->deepCopyFrom(cache_, false, {0, 0, cache_seq_len_ / cache_limit_, 0}); - } // when decoding, the input will deepCopy from cache, no need to execute if (isDecoding) { diff --git a/src/backends/cpu/CPURoPE.cpp b/src/backends/cpu/CPURoPE.cpp index 981c484f..2235dad6 100644 --- a/src/backends/cpu/CPURoPE.cpp +++ b/src/backends/cpu/CPURoPE.cpp @@ -331,13 +331,6 @@ void CPURoPE::rope_mla(shared_ptr input, shared_ptr output) { } ErrorCode CPURoPE::execute(vector> inputs, vector> outputs) { - // if use QNN, when a new prompt input, the seq should be reset to 0 here as the setUp is not called -#ifdef USE_QNN - auto cpuBackend = dynamic_cast(backend_); - if (cpuBackend->isStageSwitching() && cpuBackend->getExecutionType() == PROMPT) { - h_cnt_ = 0; - } -#endif auto &input = inputs[0]; auto &output = outputs[0]; auto out_dtype = output->dtype(); diff --git a/src/backends/xnnpack/third_party/fmt b/src/backends/xnnpack/third_party/fmt new file mode 160000 index 00000000..0c9fce2f --- /dev/null +++ b/src/backends/xnnpack/third_party/fmt @@ -0,0 +1 @@ +Subproject commit 0c9fce2ffefecfdce794e1859584e25877b7b592 diff --git a/src/models/smollm/tokenization_smollm.hpp b/src/models/smollm/tokenization_smollm.hpp index 23a7af98..44ed7251 100644 --- a/src/models/smollm/tokenization_smollm.hpp +++ b/src/models/smollm/tokenization_smollm.hpp @@ -229,6 +229,58 @@ class SmolLMTokenizer final : public BPETokenizer { return std::make_pair(realLength, Tokenizer::tokens2Input(ret)); } + // padding the input by neareast multiplication of chunk_size + std::pair tokenizePaddingByChunk(std::string &text, int chunk_size, int vocab_size) { + std::vector ret; + + if (split_special_tokens_) { + const auto word_collection = unicode_regex_split(text, regex_exprs); + for (auto &piece : word_collection) { + // look up table + // std::string token; + // for (auto b : UTF8(piece)) token += byte_encoder_[b]; + + // using bpe + std::vector tmp; + BPETokenizer::tokenize(piece, tmp, false, true, ""); + ret.insert(ret.end(), tmp.begin(), tmp.end() - 1); + } + } else { + auto parts = _splitWithDelimiters(text, special_tokens); + // for (auto p : parts) { + // std::cout << "\"" << p << "\"" << std::endl; + // } + for (auto &p : parts) { + if (std::find(special_tokens.begin(), special_tokens.end(), p) != special_tokens.end()) { + std::string token; + for (auto b : UTF8(p)) token += byte_encoder_[b]; + + std::vector tmp; + BPETokenizer::tokenize(token, tmp, false, special_tokens, true); + ret.insert(ret.end(), tmp.begin(), tmp.end() - 1); + } else { + const auto word_collection = unicode_regex_split(p, regex_exprs); + for (auto &piece : word_collection) { + // look up table + // std::string token; + // for (auto b : 
UTF8(piece)) token += byte_encoder_[b]; + + // using bpe + std::vector tmp; + BPETokenizer::tokenize(piece, tmp, false, true, ""); + assert(!tmp.empty()); + ret.insert(ret.end(), tmp.begin(), tmp.end() - 1); + } + } + } + } + + auto realLength = ret.size(); + int paddingLength = (chunk_size - realLength % chunk_size) % chunk_size; + ret.resize(realLength + paddingLength, vocab_size); + return std::make_pair(realLength, Tokenizer::tokens2Input(ret)); + } + std::string _byte_decode_(const std::string &text) { std::string ret; auto _ = ORD(text); From 5ff02cdb6a4c6eae939da9b0fd1b724282b56119 Mon Sep 17 00:00:00 2001 From: yirongjie Date: Thu, 14 Nov 2024 02:27:50 +0000 Subject: [PATCH 3/4] fix: clearCache in RoPE --- examples/demo_phonelm_npu.cpp | 2 + examples/demo_qwen2.5_npu.cpp | 4 +- scripts/build_android_app.sh | 10 +-- scripts/build_qnn_android.sh | 2 +- src/Layer.hpp | 8 +++ src/Op.hpp | 2 +- src/backends/cpu/CPUIRoPE.cpp | 72 +++++++++---------- src/backends/cpu/CPUIRoPE.hpp | 4 +- src/backends/cpu/CPURoPE.cpp | 2 +- src/backends/cpu/CPURoPE.hpp | 4 +- src/backends/xnnpack/third_party/fmt | 1 - src/models/dclm/modeling_dclm.hpp | 4 +- src/models/fuyu/modeling_fuyu.hpp | 6 +- src/models/llama/modeling_elastic_llama.hpp | 15 ++-- src/models/llama/modeling_llama.hpp | 8 +-- src/models/minicpm/modeling_minicpm.hpp | 8 +-- src/models/openelm/modeling_openelm.hpp | 4 +- src/models/opt/modeling_opt_qnn.hpp | 8 +-- src/models/opt/tokenization_opt.hpp | 2 +- src/models/phi3/modeling_phi3.hpp | 8 +-- src/models/phonelm/modeling_phonelm.hpp | 16 +++-- src/models/phonelm/modeling_phonelm_npu.hpp | 31 +++++++- src/models/qwen/modeling_qwen.hpp | 13 ++-- src/models/qwen/modeling_qwen_npu.hpp | 4 +- src/models/qwen/modeling_qwen_xp_sdpa.hpp | 4 +- src/models/smollm/modeling_smollm.hpp | 8 +-- src/models/stablelm/modeling_stablelm.hpp | 11 ++- .../transformer/modeling_transformer.hpp | 7 +- test/xnnpack/XpLlamaMHATest.cpp | 4 +- tools/jni/LibHelper.cpp | 8 +-- 30 files changed, 167 insertions(+), 113 deletions(-) delete mode 160000 src/backends/xnnpack/third_party/fmt diff --git a/examples/demo_phonelm_npu.cpp b/examples/demo_phonelm_npu.cpp index 42324cb4..6607585f 100644 --- a/examples/demo_phonelm_npu.cpp +++ b/examples/demo_phonelm_npu.cpp @@ -129,6 +129,8 @@ int main(int argc, char **argv) { static_cast(Backend::global_backends[MLLM_CPU])->setSequenceLength(0); static_cast(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT); static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); + model.clear_kvcache(); + decoding_model.clear_kvcache(); } } #endif \ No newline at end of file diff --git a/examples/demo_qwen2.5_npu.cpp b/examples/demo_qwen2.5_npu.cpp index f49fb33e..77c0356a 100644 --- a/examples/demo_qwen2.5_npu.cpp +++ b/examples/demo_qwen2.5_npu.cpp @@ -61,7 +61,7 @@ int main(int argc, char **argv) { }); static_cast(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length); - static_cast(Backend::global_backends[MLLM_CPU])->switchDecodeTag(); + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); LlmTextGeneratorOpts decoding_opt{ .max_new_tokens = 100, @@ -75,7 +75,7 @@ int main(int argc, char **argv) { decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool { // call only once of switchDecodeTag if (!isSwitched) { - static_cast(Backend::global_backends[MLLM_CPU])->switchDecodeTag(); + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); isSwitched = true; } auto out_string = 
tokenizer.detokenize({out_token}); diff --git a/scripts/build_android_app.sh b/scripts/build_android_app.sh index f11bab92..f2211c6f 100755 --- a/scripts/build_android_app.sh +++ b/scripts/build_android_app.sh @@ -21,9 +21,9 @@ cmake .. \ make mllm_lib -j16 -# 2. copy libs -cp ./libmllm_lib.a ../android/app/src/main/cpp/libs/ +# # 2. copy libs +# cp ./libmllm_lib.a ../android/app/src/main/cpp/libs/ -# 3. build android apk -cd ../android || exit -./gradlew assembleDebug \ No newline at end of file +# # 3. build android apk +# cd ../android || exit +# ./gradlew assembleDebug \ No newline at end of file diff --git a/scripts/build_qnn_android.sh b/scripts/build_qnn_android.sh index f772de80..4daff80d 100755 --- a/scripts/build_qnn_android.sh +++ b/scripts/build_qnn_android.sh @@ -16,4 +16,4 @@ cmake .. \ -DQNN_VALIDATE_NODE=ON \ -DMLLM_BUILD_XNNPACK_BACKEND=OFF -make -j16 +make -j40 diff --git a/src/Layer.hpp b/src/Layer.hpp index 7d729b84..5babf22f 100644 --- a/src/Layer.hpp +++ b/src/Layer.hpp @@ -484,6 +484,7 @@ class SlidingWindowMask final : public Layer { class RoPE final : public Layer { public: + RoPE() = default; explicit RoPE(int pose_type, std::string name) { param_["pose_type"] = pose_type; init(std::move(name), OpType::ROPE); @@ -505,10 +506,14 @@ class RoPE final : public Layer { auto ts = run({input}, 1); return ts[0].get(); } + void clearCache() { + return op_->clearCache(); + } }; class IRoPE final : public Layer { public: + IRoPE() = default; explicit IRoPE(int pose_type, std::string name) { param_["pose_type"] = pose_type; init(std::move(name), OpType::IROPE); @@ -530,6 +535,9 @@ class IRoPE final : public Layer { auto ts = run({input}, 1); return ts[0].get(); } + void clearCache() { + return op_->clearCache(); + } }; class KVCache final : public Layer { diff --git a/src/Op.hpp b/src/Op.hpp index 43787971..be04de18 100644 --- a/src/Op.hpp +++ b/src/Op.hpp @@ -121,7 +121,7 @@ class Op { return -1; } virtual void clearCache() { - assert(type_ == OpType::KVCACHE); + assert(type_ == OpType::KVCACHE || type_ == OpType::IROPE || type_ == OpType::ROPE); std::cout << "only for KVCache" << std::endl; } diff --git a/src/backends/cpu/CPUIRoPE.cpp b/src/backends/cpu/CPUIRoPE.cpp index 330b9a6f..fe2a5798 100644 --- a/src/backends/cpu/CPUIRoPE.cpp +++ b/src/backends/cpu/CPUIRoPE.cpp @@ -124,7 +124,7 @@ ErrorCode CPUIRoPE::reshape(vector> inputs, vectorreshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence(), inputs[0]->dimension()); ishape = inputs[0]->dimension() * partial_rotary_factor_; // pos_max_ = 16384; - if (sin_.empty() || ishape_old < ishape || global_pose_type_ != pose_type_ ) { + if (sin_.empty() || ishape_old < ishape || global_pose_type_ != pose_type_) { global_pose_type_ = pose_type_; ishape_old = ishape; if (pose_type_ == LLAMAROPE) { @@ -145,39 +145,38 @@ ErrorCode CPUIRoPE::reshape(vector> inputs, vector input, shared_ptr output){ +void CPUIRoPE::rope_llama(shared_ptr input, shared_ptr output) { auto out_dtype = output->dtype(); int partial_dimension = (input->dimension()) * partial_rotary_factor_; #pragma omp parallel for collapse(4) num_threads(thread_count) for (int n = 0; n < input->batch(); ++n) { for (int h = 0; h < input->head(); ++h) { for (int s = 0; s < input->sequence(); ++s) { // sequance - for (int d = 0; d < partial_dimension; d+=2) { + for (int d = 0; d < partial_dimension; d += 2) { float in_value = input->dataAt(n, h, s, d); float in_value_2 = input->dataAt(n, h, s, d + 1); - float sin_value = static_cast(sin_[s + h_cnt_][d])/127 * sin_max; - 
float cos_value = static_cast(cos_[s + h_cnt_][d])/127 * cos_max; + float sin_value = static_cast(sin_[s + h_cnt_][d]) / 127 * sin_max; + float cos_value = static_cast(cos_[s + h_cnt_][d]) / 127 * cos_max; auto value = in_value * cos_value - in_value_2 * sin_value; auto value2 = in_value * sin_value + in_value_2 * cos_value; if (out_dtype == MLLM_TYPE_F32) { output->setDataAt(n, h, s, d, value); - output->setDataAt(n, h, s, d+1, value2); + output->setDataAt(n, h, s, d + 1, value2); } else if (out_dtype == MLLM_TYPE_F16) { output->setDataAt(n, h, s, d, MLLM_FP32_TO_FP16(value)); - output->setDataAt(n, h, s, d+1, MLLM_FP32_TO_FP16(value2)); + output->setDataAt(n, h, s, d + 1, MLLM_FP32_TO_FP16(value2)); } } } } } } -void CPUIRoPE::rope_hf(shared_ptr input, shared_ptr output){ +void CPUIRoPE::rope_hf(shared_ptr input, shared_ptr output) { auto out_dtype = output->dtype(); int partial_dimension = (input->dimension()) * partial_rotary_factor_; int half = (int)(partial_dimension / 2); - assert(partial_dimension%2==0); - if(output->ctype() == BSHD){ + assert(partial_dimension % 2 == 0); + if (output->ctype() == BSHD) { if (input->dtype() == MLLM_TYPE_F16) { #pragma omp parallel for collapse(4) num_threads(thread_count) for (int n = 0; n < input->batch(); ++n) { @@ -188,9 +187,9 @@ void CPUIRoPE::rope_hf(shared_ptr input, shared_ptr output){ auto o = output->ptrAt(n, h, s, d); float in_value = static_cast(v[0]); float in_value_2 = static_cast(v[half]); - float sin_value = static_cast(sin_[s + h_cnt_][d])/127 * sin_max; + float sin_value = static_cast(sin_[s + h_cnt_][d]) / 127 * sin_max; auto c = static_cast(cos_[s + h_cnt_][d]); - float cos_value = c/127 * cos_max; + float cos_value = c / 127 * cos_max; auto value = in_value * cos_value - in_value_2 * sin_value; auto value2 = in_value * sin_value + in_value_2 * cos_value; o[0] = MLLM_FP32_TO_FP16(value); @@ -200,20 +199,19 @@ void CPUIRoPE::rope_hf(shared_ptr input, shared_ptr output){ } } - } else - if (out_dtype == MLLM_TYPE_F32){ + } else if (out_dtype == MLLM_TYPE_F32) { #pragma omp parallel for collapse(4) num_threads(thread_count) for (int n = 0; n < input->batch(); ++n) { for (int h = 0; h < input->head(); ++h) { for (int s = 0; s < input->sequence(); ++s) { // sequance - for (int d = 0; d < partial_dimension/2; ++d) { + for (int d = 0; d < partial_dimension / 2; ++d) { auto v = input->ptrAt(n, h, s, d); auto o = output->ptrAt(n, h, s, d); float in_value = v[0]; float in_value_2 = v[half]; - float sin_value = static_cast(sin_[s + h_cnt_][d])/127 * sin_max; + float sin_value = static_cast(sin_[s + h_cnt_][d]) / 127 * sin_max; auto c = static_cast(cos_[s + h_cnt_][d]); - float cos_value = c/127 * cos_max; + float cos_value = c / 127 * cos_max; auto value = in_value * cos_value - in_value_2 * sin_value; auto value2 = in_value * sin_value + in_value_2 * cos_value; o[0] = value; @@ -222,18 +220,18 @@ void CPUIRoPE::rope_hf(shared_ptr input, shared_ptr output){ } } } - }else if(out_dtype == MLLM_TYPE_F16){ + } else if (out_dtype == MLLM_TYPE_F16) { #pragma omp parallel for collapse(4) num_threads(thread_count) for (int n = 0; n < input->batch(); ++n) { for (int h = 0; h < input->head(); ++h) { for (int s = 0; s < input->sequence(); ++s) { // sequance - for (int d = 0; d < partial_dimension/2; ++d) { + for (int d = 0; d < partial_dimension / 2; ++d) { auto v = input->ptrAt(n, h, s, d); auto o = output->ptrAt(n, h, s, d); float in_value = v[0]; float in_value_2 = v[half]; float sin_value = static_cast(sin_[s + h_cnt_][d]) / 127 * sin_max; - float 
cos_value = static_cast(cos_[s + h_cnt_][d])/127 * cos_max; + float cos_value = static_cast(cos_[s + h_cnt_][d]) / 127 * cos_max; auto value = in_value * cos_value - in_value_2 * sin_value; auto value2 = in_value * sin_value + in_value_2 * cos_value; o[0] = MLLM_FP32_TO_FP16(value); @@ -249,7 +247,7 @@ void CPUIRoPE::rope_hf(shared_ptr input, shared_ptr output){ for (int n = 0; n < input->batch(); ++n) { for (int h = 0; h < input->head(); ++h) { for (int s = 0; s < input->sequence(); ++s) { // sequance - for (int d = 0; d < partial_dimension/2; ++d) { + for (int d = 0; d < partial_dimension / 2; ++d) { if (input->dtype() == MLLM_TYPE_F16) { float in_value = static_cast(input->dataAt(n, h, s, d)); float in_value_2 = static_cast(input->dataAt(n, h, s, d + partial_dimension / 2)); @@ -268,16 +266,16 @@ void CPUIRoPE::rope_hf(shared_ptr input, shared_ptr output){ } else { float in_value = input->dataAt(n, h, s, d); float in_value_2 = input->dataAt(n, h, s, d + partial_dimension / 2); - float sin_value = static_cast(sin_[s + h_cnt_][d])/127 * sin_max; - float cos_value = static_cast(cos_[s + h_cnt_][d])/127 * cos_max; + float sin_value = static_cast(sin_[s + h_cnt_][d]) / 127 * sin_max; + float cos_value = static_cast(cos_[s + h_cnt_][d]) / 127 * cos_max; auto value = in_value * cos_value - in_value_2 * sin_value; auto value2 = in_value * sin_value + in_value_2 * cos_value; if (out_dtype == MLLM_TYPE_F32) { output->setDataAt(n, h, s, d, value); - output->setDataAt(n, h, s, d+ partial_dimension / 2, value2); + output->setDataAt(n, h, s, d + partial_dimension / 2, value2); } else if (out_dtype == MLLM_TYPE_F16) { output->setDataAt(n, h, s, d, MLLM_FP32_TO_FP16(value)); - output->setDataAt(n, h, s, d+ partial_dimension / 2, MLLM_FP32_TO_FP16(value2)); + output->setDataAt(n, h, s, d + partial_dimension / 2, MLLM_FP32_TO_FP16(value2)); } } } @@ -285,7 +283,7 @@ void CPUIRoPE::rope_hf(shared_ptr input, shared_ptr output){ } } } -void CPUIRoPE::rope_permission(shared_ptr input, shared_ptr output){ +void CPUIRoPE::rope_permission(shared_ptr input, shared_ptr output) { auto out_dtype = output->dtype(); int partial_dimension = (input->dimension()) * partial_rotary_factor_; #pragma omp parallel for collapse(4) num_threads(thread_count) @@ -293,10 +291,10 @@ void CPUIRoPE::rope_permission(shared_ptr input, shared_ptr outp for (int h = 0; h < input->head(); ++h) { for (int s = 0; s < input->sequence(); ++s) { // sequance for (int d = 0; d < partial_dimension; ++d) { - float in_value = input->dataAt(n, h, s, d); + float in_value = input->dataAt(n, h, s, d); float in_value_2; - float sin_value = static_cast(sin_[s + h_cnt_][d])/127 * sin_max; - float cos_value = static_cast(cos_[s + h_cnt_][d])/127 * cos_max; + float sin_value = static_cast(sin_[s + h_cnt_][d]) / 127 * sin_max; + float cos_value = static_cast(cos_[s + h_cnt_][d]) / 127 * cos_max; if (d < partial_dimension / 4) { in_value_2 = -input->dataAt(n, h, s, d + partial_dimension / 4); auto value = in_value * cos_value + in_value_2 * sin_value; @@ -325,7 +323,7 @@ void CPUIRoPE::rope_permission(shared_ptr input, shared_ptr outp } } } -void CPUIRoPE::rope_mla(shared_ptr input, shared_ptr output){ +void CPUIRoPE::rope_mla(shared_ptr input, shared_ptr output) { auto out_dtype = output->dtype(); int partial_dimension = (input->dimension()) * partial_rotary_factor_; #pragma omp parallel for collapse(4) num_threads(thread_count) @@ -338,17 +336,17 @@ void CPUIRoPE::rope_mla(shared_ptr input, shared_ptr output){ if (d < half_dim) { in_value = input->dataAt(n, 
h, s, d * 2); } else { - in_value = input->dataAt(n, h, s, 2 *(d - half_dim)+1); + in_value = input->dataAt(n, h, s, 2 * (d - half_dim) + 1); } float in_value_2; if (d < half_dim) { - in_value_2 = -input->dataAt(n, h, s, 2 *d+1); + in_value_2 = -input->dataAt(n, h, s, 2 * d + 1); } else { - in_value_2 = input->dataAt(n, h, s, 2 *(d - half_dim)); + in_value_2 = input->dataAt(n, h, s, 2 * (d - half_dim)); } // no change - float sin_value = static_cast(sin_[s + h_cnt_][d])/127 * sin_max; - float cos_value = static_cast(cos_[s + h_cnt_][d])/127 * cos_max; + float sin_value = static_cast(sin_[s + h_cnt_][d]) / 127 * sin_max; + float cos_value = static_cast(cos_[s + h_cnt_][d]) / 127 * cos_max; auto value = in_value * cos_value + in_value_2 * sin_value; if (out_dtype == MLLM_TYPE_F32) { output->setDataAt(n, h, s, d, value); @@ -486,7 +484,7 @@ ErrorCode CPUIRoPE::execute(vector> inputs, vectorsequence(); - if (h_cnt_ > pos_max_) { + if (h_cnt_ >= pos_max_) { h_cnt_ = 0; } diff --git a/src/backends/cpu/CPUIRoPE.hpp b/src/backends/cpu/CPUIRoPE.hpp index 2e8aee56..bd872fec 100644 --- a/src/backends/cpu/CPUIRoPE.hpp +++ b/src/backends/cpu/CPUIRoPE.hpp @@ -35,11 +35,13 @@ class CPUIRoPE final : public Op { int thread_count = 4; float partial_rotary_factor_ = 1; - void rope_llama(shared_ptr input, shared_ptr output); void rope_hf(shared_ptr input, shared_ptr output); void rope_permission(shared_ptr input, shared_ptr output); void rope_mla(shared_ptr input, shared_ptr output); + void clearCache() override { + h_cnt_ = 0; + } }; class CPUIRoPECreator : public CPUBackend::Creator { diff --git a/src/backends/cpu/CPURoPE.cpp b/src/backends/cpu/CPURoPE.cpp index 2235dad6..919df5a7 100644 --- a/src/backends/cpu/CPURoPE.cpp +++ b/src/backends/cpu/CPURoPE.cpp @@ -447,7 +447,7 @@ ErrorCode CPURoPE::execute(vector> inputs, vectorsequence(); - if (h_cnt_ > pos_max_) { + if (h_cnt_ >= pos_max_) { h_cnt_ = 0; } diff --git a/src/backends/cpu/CPURoPE.hpp b/src/backends/cpu/CPURoPE.hpp index 058a2746..57ee958d 100644 --- a/src/backends/cpu/CPURoPE.hpp +++ b/src/backends/cpu/CPURoPE.hpp @@ -33,11 +33,13 @@ class CPURoPE final : public Op { int thread_count = 4; float partial_rotary_factor_ = 1; - void rope_llama(shared_ptr input, shared_ptr output); void rope_hf(shared_ptr input, shared_ptr output); void rope_permission(shared_ptr input, shared_ptr output); void rope_mla(shared_ptr input, shared_ptr output); + void clearCache() override { + h_cnt_ = 0; + } }; class CPURoPECreator : public CPUBackend::Creator { diff --git a/src/backends/xnnpack/third_party/fmt b/src/backends/xnnpack/third_party/fmt deleted file mode 160000 index 0c9fce2f..00000000 --- a/src/backends/xnnpack/third_party/fmt +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 0c9fce2ffefecfdce794e1859584e25877b7b592 diff --git a/src/models/dclm/modeling_dclm.hpp b/src/models/dclm/modeling_dclm.hpp index 82508199..266e3871 100644 --- a/src/models/dclm/modeling_dclm.hpp +++ b/src/models/dclm/modeling_dclm.hpp @@ -53,8 +53,8 @@ class DCLMAttention final : public Module { Layer out_proj; Layer q_norm; Layer k_norm; - Layer q_rope; - Layer k_rope; + RoPE q_rope; + RoPE k_rope; KVCache k_cache; KVCache v_cache; Layer softmax; diff --git a/src/models/fuyu/modeling_fuyu.hpp b/src/models/fuyu/modeling_fuyu.hpp index 5bfe6575..14065bde 100644 --- a/src/models/fuyu/modeling_fuyu.hpp +++ b/src/models/fuyu/modeling_fuyu.hpp @@ -67,8 +67,10 @@ class Persimmon final : public Module { } void clear_kvcache() override { for (auto &block : blocks) { - auto kvcahce = 
block.get_attention().get_cache(); - for (auto &cache : kvcahce) { cache->clearCache(); } + auto kvcache = block.get_attention().get_cache(); + for (auto &cache : kvcache) { cache->clearCache(); } + auto ropes = block.get_attention().get_rope(); + for (auto &rope : ropes) { rope->clearCache(); } } } }; diff --git a/src/models/llama/modeling_elastic_llama.hpp b/src/models/llama/modeling_elastic_llama.hpp index 84abda6f..a43af045 100644 --- a/src/models/llama/modeling_elastic_llama.hpp +++ b/src/models/llama/modeling_elastic_llama.hpp @@ -17,8 +17,8 @@ class ElasticMultiHeadAttention final : public Module { ElasticLinear q_proj; ElasticLinear k_proj; ElasticLinear v_proj; - Layer q_rope; - Layer k_rope; + RoPE q_rope; + RoPE k_rope; KVCache k_cache; KVCache v_cache; Softmax softmax; @@ -86,6 +86,9 @@ class ElasticMultiHeadAttention final : public Module { vector get_cache() { return {&k_cache, &v_cache}; } + vector get_rope() { + return {&q_rope, &k_rope}; + } }; class ElasticLLaMAMLP final : public Module { @@ -180,10 +183,10 @@ class ElasticLLaMAModel final : public Module { void clear_kvcache() override { for (auto &block : blocks) { - auto kvcahce = block.get_attention().get_cache(); - for (auto &cache : kvcahce) { - cache->clearCache(); - } + auto kvcache = block.get_attention().get_cache(); + for (auto &cache : kvcache) { cache->clearCache(); } + auto ropes = block.get_attention().get_rope(); + for (auto &rope : ropes) { rope->clearCache(); } } } }; diff --git a/src/models/llama/modeling_llama.hpp b/src/models/llama/modeling_llama.hpp index 31d917e0..5fccd18e 100644 --- a/src/models/llama/modeling_llama.hpp +++ b/src/models/llama/modeling_llama.hpp @@ -97,10 +97,10 @@ class LLaMAModel final : public Module { void clear_kvcache() override { for (auto &block : blocks) { - auto kvcahce = block.get_attention().get_cache(); - for (auto &cache : kvcahce) { - cache->clearCache(); - } + auto kvcache = block.get_attention().get_cache(); + for (auto &cache : kvcache) { cache->clearCache(); } + auto ropes = block.get_attention().get_rope(); + for (auto &rope : ropes) { rope->clearCache(); } } } }; diff --git a/src/models/minicpm/modeling_minicpm.hpp b/src/models/minicpm/modeling_minicpm.hpp index 42145856..163bc0b4 100644 --- a/src/models/minicpm/modeling_minicpm.hpp +++ b/src/models/minicpm/modeling_minicpm.hpp @@ -94,10 +94,10 @@ class MiniCPMModel final : public Module { void clear_kvcache() override { for (auto &block : blocks) { - auto kvcahce = block.get_attention().get_cache(); - for (auto &cache : kvcahce) { - cache->clearCache(); - } + auto kvcache = block.get_attention().get_cache(); + for (auto &cache : kvcache) { cache->clearCache(); } + auto ropes = block.get_attention().get_rope(); + for (auto &rope : ropes) { rope->clearCache(); } } } diff --git a/src/models/openelm/modeling_openelm.hpp b/src/models/openelm/modeling_openelm.hpp index 1d798b8e..2cc1a476 100644 --- a/src/models/openelm/modeling_openelm.hpp +++ b/src/models/openelm/modeling_openelm.hpp @@ -39,8 +39,8 @@ class OpenELMMultiHeadCausalAttention final : public Module { int v_heads_; Layer qkv_proj; - Layer q_rope; - Layer k_rope; + RoPE q_rope; + RoPE k_rope; Layer q_norm; Layer k_norm; Layer out_proj; diff --git a/src/models/opt/modeling_opt_qnn.hpp b/src/models/opt/modeling_opt_qnn.hpp index 101ddac7..e35e3093 100644 --- a/src/models/opt/modeling_opt_qnn.hpp +++ b/src/models/opt/modeling_opt_qnn.hpp @@ -16,8 +16,8 @@ class OPTEncoderBlockPart1 final : public Module { Layer q_proj; Layer k_proj; Layer v_proj; - Layer 
q_rope; - Layer k_rope; + RoPE q_rope; + RoPE k_rope; Layer norm1; int head_size_{}; int kv_head_size_{}; @@ -74,11 +74,11 @@ class OPTQKVmm final : public Module { const OPTNameConfig &names, const string &base_name) { attn_hidden_dim_ = attn_hidden_dim; head_size_ = head_size; - if(cache_limit>0){ + if (cache_limit > 0) { k_cache = KVCache(head_size / kv_head_size, cache_limit, false, base_name + names._attn_base_name + "k_cache"); v_cache = KVCache(head_size / kv_head_size, cache_limit, false, base_name + names._attn_base_name + "v_cache"); } - + if (do_mask) { mask = Causalmask(base_name + "mask"); } diff --git a/src/models/opt/tokenization_opt.hpp b/src/models/opt/tokenization_opt.hpp index 363906e2..1900d899 100644 --- a/src/models/opt/tokenization_opt.hpp +++ b/src/models/opt/tokenization_opt.hpp @@ -26,7 +26,7 @@ class OPTTokenizer final : public BPETokenizer { } Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) override { - string new_text; + string new_text = text; if (text[0] != ' ') { new_text = ' ' + text; } diff --git a/src/models/phi3/modeling_phi3.hpp b/src/models/phi3/modeling_phi3.hpp index 3016c084..f5f9ea9b 100644 --- a/src/models/phi3/modeling_phi3.hpp +++ b/src/models/phi3/modeling_phi3.hpp @@ -97,10 +97,10 @@ class Phi3Model final : public Module { void clear_kvcache() override { for (auto &block : blocks) { - auto kvcahce = block.get_attention().get_cache(); - for (auto &cache : kvcahce) { - cache->clearCache(); - } + auto kvcache = block.get_attention().get_cache(); + for (auto &cache : kvcache) { cache->clearCache(); } + auto ropes = block.get_attention().get_rope(); + for (auto &rope : ropes) { rope->clearCache(); } } } }; diff --git a/src/models/phonelm/modeling_phonelm.hpp b/src/models/phonelm/modeling_phonelm.hpp index 69bbecda..151ba193 100644 --- a/src/models/phonelm/modeling_phonelm.hpp +++ b/src/models/phonelm/modeling_phonelm.hpp @@ -98,10 +98,12 @@ class PhoneLMAttention final : public Module { return {o}; } - vector - get_cache() { + vector get_cache() { return {&k_cache, &v_cache}; } + vector get_rope() { + return {&q_rope, &k_rope}; + } private: int hidden_size; @@ -113,8 +115,8 @@ class PhoneLMAttention final : public Module { Layer k_proj; Layer v_proj; Layer o_proj; - Layer q_rope; - Layer k_rope; + IRoPE q_rope; + IRoPE k_rope; KVCache k_cache; KVCache v_cache; Softmax softmax; @@ -170,8 +172,10 @@ class PhoneLMModel final : public Module { void clear_kvcache() override { for (auto &block : blocks) { - auto kvcahce = block.get_attention().get_cache(); - for (auto &cache : kvcahce) { cache->clearCache(); } + auto kvcache = block.get_attention().get_cache(); + for (auto &cache : kvcache) { cache->clearCache(); } + auto ropes = block.get_attention().get_rope(); + for (auto &rope : ropes) { rope->clearCache(); } } } diff --git a/src/models/phonelm/modeling_phonelm_npu.hpp b/src/models/phonelm/modeling_phonelm_npu.hpp index 0b03cb1d..27f44fb6 100644 --- a/src/models/phonelm/modeling_phonelm_npu.hpp +++ b/src/models/phonelm/modeling_phonelm_npu.hpp @@ -82,8 +82,8 @@ class PhoneLMDecoderNPUPart1 final : public Module { // CPU QKV MM part class PhoneLMQKVmm final : public Module { - Layer q_rope; - Layer k_rope; + IRoPE q_rope; + IRoPE k_rope; KVCache k_cache; KVCache v_cache; Softmax softmax; @@ -135,6 +135,12 @@ class PhoneLMQKVmm final : public Module { return {o}; } + vector get_cache() { + return {&k_cache, &v_cache}; + } + vector get_rope() { + return {&q_rope, &k_rope}; + } }; // QNN mlp part @@ -415,6 
+421,9 @@ class PhoneLMNPU_CPUDecoder final : public Module { return {x}; } + PhoneLMQKVmm &get_mm() { + return qkv_mm; + } }; class PhoneLMNPU_CPUDecoderWithShadow final : public Module { @@ -483,6 +492,9 @@ class PhoneLMNPU_CPUDecoderWithShadow final : public Module { return {x}; } + PhoneLMQKVmm &get_mm() { + return qkv_mm; + } }; // Copied from GemmaModel with Gemma->PhoneLM and set RmsNorm(without add_unit_offset) @@ -525,6 +537,18 @@ class PhoneLMModel_NPU final : public Module { return {x}; } + void clear_kvcache() override { + for (auto &block : blocks) { + auto decoder = dynamic_cast(block.get()); + if (decoder) { + auto kvcache = decoder->get_mm().get_cache(); + for (auto &cache : kvcache) { cache->clearCache(); } + auto ropes = decoder->get_mm().get_rope(); + for (auto &rope : ropes) { rope->clearCache(); } + } + } + } + private: std::vector> blocks; Layer norm; @@ -588,6 +612,9 @@ class PhoneLMForCausalLM_NPU final : public Module { return; } } + void clear_kvcache() override { + model.clear_kvcache(); + } private: int hidden_size; diff --git a/src/models/qwen/modeling_qwen.hpp b/src/models/qwen/modeling_qwen.hpp index f2b61aaa..f6ca72e9 100644 --- a/src/models/qwen/modeling_qwen.hpp +++ b/src/models/qwen/modeling_qwen.hpp @@ -115,6 +115,9 @@ class QWenAttention final : public Module { vector get_cache() { return {&k_cache, &v_cache}; } + vector get_rope() { + return {&q_rope, &k_rope}; + } private: int hidden_size; @@ -126,8 +129,8 @@ class QWenAttention final : public Module { Layer k_proj; Layer v_proj; Layer o_proj; - Layer q_rope; - Layer k_rope; + RoPE q_rope; + RoPE k_rope; KVCache k_cache; KVCache v_cache; Causalmask mask; @@ -187,8 +190,10 @@ class QWenModel final : public Module { void clear_kvcache() override { for (auto &block : blocks) { - auto kvcahce = block.get_attention().get_cache(); - for (auto &cache : kvcahce) { cache->clearCache(); } + auto kvcache = block.get_attention().get_cache(); + for (auto &cache : kvcache) { cache->clearCache(); } + auto ropes = block.get_attention().get_rope(); + for (auto &rope : ropes) { rope->clearCache(); } } } diff --git a/src/models/qwen/modeling_qwen_npu.hpp b/src/models/qwen/modeling_qwen_npu.hpp index c96a765b..c051170a 100644 --- a/src/models/qwen/modeling_qwen_npu.hpp +++ b/src/models/qwen/modeling_qwen_npu.hpp @@ -82,8 +82,8 @@ class QwenDecoderNPUPart1 final : public Module { // CPU QKV MM part class QwenQKVmm final : public Module { - Layer q_rope; - Layer k_rope; + RoPE q_rope; + RoPE k_rope; KVCache k_cache; KVCache v_cache; Softmax softmax; diff --git a/src/models/qwen/modeling_qwen_xp_sdpa.hpp b/src/models/qwen/modeling_qwen_xp_sdpa.hpp index e9225d3a..dfb28151 100644 --- a/src/models/qwen/modeling_qwen_xp_sdpa.hpp +++ b/src/models/qwen/modeling_qwen_xp_sdpa.hpp @@ -21,8 +21,8 @@ class XpDecoderSeperatedPart_1_Xnn : public Module { Layer q_proj; Layer k_proj; Layer v_proj; - Layer q_rope; - Layer k_rope; + RoPE q_rope; + RoPE k_rope; Layer input_layernorm; int hidden_size = 0; diff --git a/src/models/smollm/modeling_smollm.hpp b/src/models/smollm/modeling_smollm.hpp index 99febfe5..86d8d6ca 100644 --- a/src/models/smollm/modeling_smollm.hpp +++ b/src/models/smollm/modeling_smollm.hpp @@ -104,10 +104,10 @@ class SmolLMModel final : public Module { void clear_kvcache() override { for (auto &block : blocks) { - auto kvcahce = block.get_attention().get_cache(); - for (auto &cache : kvcahce) { - cache->clearCache(); - } + auto kvcache = block.get_attention().get_cache(); + for (auto &cache : kvcache) { 
cache->clearCache(); } + auto ropes = block.get_attention().get_rope(); + for (auto &rope : ropes) { rope->clearCache(); } } } }; diff --git a/src/models/stablelm/modeling_stablelm.hpp b/src/models/stablelm/modeling_stablelm.hpp index 50737a2a..9e7c0703 100644 --- a/src/models/stablelm/modeling_stablelm.hpp +++ b/src/models/stablelm/modeling_stablelm.hpp @@ -12,8 +12,8 @@ class StableLMMultiHeadAttention final : public Module { Layer q_proj; Layer k_proj; Layer v_proj; - Layer q_rope; - Layer k_rope; + RoPE q_rope; + RoPE k_rope; KVCache k_cache; KVCache v_cache; Softmax softmax; @@ -26,8 +26,8 @@ class StableLMMultiHeadAttention final : public Module { public: StableLMMultiHeadAttention() = default; StableLMMultiHeadAttention(int hidden_dim, int head_size, int kv_head_size, int attn_hidden_dim, - RoPEType RoPE_type, int cache_limit, bool do_mask, bool bias, - const TransformerNameConfig &names, const string &base_name) { + RoPEType RoPE_type, int cache_limit, bool do_mask, bool bias, + const TransformerNameConfig &names, const string &base_name) { attn_hidden_dim_ = attn_hidden_dim; head_size_ = head_size; kv_head_size_ = kv_head_size; @@ -44,7 +44,6 @@ class StableLMMultiHeadAttention final : public Module { } softmax = Softmax(DIMENSION, do_mask, base_name + "softmax"); o_proj = Linear(head_size * attn_hidden_dim, hidden_dim, false, base_name + names._o_proj_name); - } vector Forward(vector inputs, vector args) override { Tensor q, k, v; @@ -107,7 +106,7 @@ class StableLMBlock final : public Module { StableLMBlock() = default; StableLMBlock(int hidden_dim, int head_size, int ffn_hidden, RoPEType RoPE_type, int cache_limit, const stablelmNameConfig &names, const string &base_name) { attention = StableLMMultiHeadAttention(hidden_dim, head_size, head_size, hidden_dim / head_size, - RoPE_type, cache_limit, true, true, names, base_name + names._attn_base_name); + RoPE_type, cache_limit, true, true, names, base_name + names._attn_base_name); mlp = StableLMMLP(hidden_dim, ffn_hidden, names, base_name + names._ffn_base_name); norm1 = LayerNorm(hidden_dim, true, 1e-5, base_name + names._attn_norm_name); norm2 = LayerNorm(hidden_dim, true, 1e-5, base_name + names._ffn_norm_name); diff --git a/src/models/transformer/modeling_transformer.hpp b/src/models/transformer/modeling_transformer.hpp index a4a05040..61244611 100644 --- a/src/models/transformer/modeling_transformer.hpp +++ b/src/models/transformer/modeling_transformer.hpp @@ -17,8 +17,8 @@ class MultiHeadAttention final : public Module { Layer q_proj; Layer k_proj; Layer v_proj; - Layer q_rope; - Layer k_rope; + RoPE q_rope; + RoPE k_rope; Layer q_norm; Layer k_norm; KVCache k_cache; @@ -117,6 +117,9 @@ class MultiHeadAttention final : public Module { vector get_cache() { return {&k_cache, &v_cache}; } + vector get_rope() { + return {&q_rope, &k_rope}; + } }; class FeedForward final : public Module { diff --git a/test/xnnpack/XpLlamaMHATest.cpp b/test/xnnpack/XpLlamaMHATest.cpp index e056203e..540f441a 100644 --- a/test/xnnpack/XpLlamaMHATest.cpp +++ b/test/xnnpack/XpLlamaMHATest.cpp @@ -19,8 +19,8 @@ class XpLLaMAMHA final : public Module { Layer q_proj; Layer k_proj; Layer v_proj; - Layer q_rope; - Layer k_rope; + RoPE q_rope; + RoPE k_rope; Layer k_cache; Layer v_cache; Layer o_proj; diff --git a/tools/jni/LibHelper.cpp b/tools/jni/LibHelper.cpp index 16d0d67e..0964ed50 100644 --- a/tools/jni/LibHelper.cpp +++ b/tools/jni/LibHelper.cpp @@ -137,7 +137,7 @@ void LibHelper::run(std::string &input_str, uint8_t *image, unsigned max_step, u 
return true; }); static_cast(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length); - static_cast(Backend::global_backends[MLLM_CPU])->switchDecodeTag(); + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); opt = LlmTextGeneratorOpts{ .max_new_tokens = max_new_tokens, .do_sample = false, @@ -150,7 +150,7 @@ void LibHelper::run(std::string &input_str, uint8_t *image, unsigned max_step, u bool isSwitched = false; module_->generate(input_tensor, opt, [&](unsigned int out_token) -> bool { if (!isSwitched && backend_ == MLLMBackendType::QNN) { - static_cast(Backend::global_backends[MLLM_CPU])->switchDecodeTag(); + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); isSwitched = true; } auto out_token_string = tokenizer_->detokenize({out_token}); @@ -206,7 +206,7 @@ void LibHelper::run(std::string &input_str, uint8_t *image, unsigned max_step, u return true; }); static_cast(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length); - static_cast(Backend::global_backends[MLLM_CPU])->switchDecodeTag(); + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); opt = LlmTextGeneratorOpts{ .max_new_tokens = max_new_tokens, @@ -220,7 +220,7 @@ void LibHelper::run(std::string &input_str, uint8_t *image, unsigned max_step, u bool isSwitched = false; module_->generate(input_tensor, opt, [&](unsigned int out_token) -> bool { if (!isSwitched && backend_ == MLLMBackendType::QNN) { - static_cast(Backend::global_backends[MLLM_CPU])->switchDecodeTag(); + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); isSwitched = true; } auto out_token_string = tokenizer_->detokenize({out_token}); From 68ddb997d4a6e7893302ea14702631e31260d772 Mon Sep 17 00:00:00 2001 From: yirongjie Date: Thu, 14 Nov 2024 10:57:26 +0000 Subject: [PATCH 4/4] fix: genarate with Padding --- examples/demo_phonelm_npu.cpp | 34 ++++++---------- examples/demo_qwen2.5_npu.cpp | 4 +- examples/demo_qwen_npu.cpp | 4 +- scripts/build_android_app.sh | 10 ++--- scripts/run_phonelm_qnn.sh | 44 +++++++++++++++++++++ src/Generate.hpp | 8 ++++ src/Op.hpp | 4 +- src/backends/cpu/CPUBackend.hpp | 8 ++-- src/models/phonelm/modeling_phonelm_npu.hpp | 29 +------------- src/models/qwen/modeling_qwen_npu.hpp | 2 +- 10 files changed, 79 insertions(+), 68 deletions(-) create mode 100755 scripts/run_phonelm_qnn.sh diff --git a/examples/demo_phonelm_npu.cpp b/examples/demo_phonelm_npu.cpp index 6607585f..e4112bec 100644 --- a/examples/demo_phonelm_npu.cpp +++ b/examples/demo_phonelm_npu.cpp @@ -1,7 +1,6 @@ #include "Module.hpp" #include "Types.hpp" #include -#ifdef USE_QNN #include "backends/cpu/CPUBackend.hpp" #include "cmdline.h" #include "models/phonelm/modeling_phonelm.hpp" @@ -15,7 +14,7 @@ int main(int argc, char **argv) { cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/phonelm_vocab.mllm"); cmdParser.add("merge", 'e', "specify mllm merge file path", false, "../vocab/phonelm_merges.txt"); cmdParser.add("model", 'm', "specify mllm model path", false, "../models/PhoneLM-1.5B-Instruct-128.mllm"); - cmdParser.add("decoding", 'd', "specify mllm decoding model path", false, "../models/phonelm-1.5b-droidcall-q4_0_4_4.mllm"); + cmdParser.add("decoding", 'd', "specify mllm decoding model path", false, "../models/phonelm-1.5b-instruct-q4_0_4_4.mllm"); cmdParser.add("limits", 'l', "max KV cache size", false, 400); cmdParser.add("thread", 't', "num of threads", false, 4); cmdParser.add("chunk", 'c', "chunk size", false, 64); @@ -40,6 +39,9 @@ int 
main(int argc, char **argv) { "Give me a short introduction to large language model.", "What is the Beijing University of Posts and Telecommunications.", "What is the meaning of life?", + "Hello, who are you?", + "What can you do?", + "Please introduce Beijing University of Posts and Telecommunications.", }; // turn on the multi-chunk prefilling @@ -48,13 +50,10 @@ int main(int argc, char **argv) { for (int i = 0; i < in_strs.size(); ++i) { auto input_str = tokenizer.apply_chat_template(in_strs[i]); auto [real_seq_length, input_tensor] = tokenizer.tokenizePaddingByChunk(input_str, chunk_size, config.vocab_size); - const int seq_length_padding = (chunk_size - real_seq_length % chunk_size) + real_seq_length; const int chunk_num = seq_length_padding / chunk_size; bool isSwitched = false; - - std::cout << "real seq length: " << real_seq_length << " padding to: " << seq_length_padding << " chunk num: " << chunk_num << std::endl; - + // std::cout << "real seq length: " << real_seq_length << " padding to: " << seq_length_padding << " chunk num: " << chunk_num << std::endl; std::cout << "[Q] " << in_strs[i] << std::endl; std::cout << "[A] " << std::flush; @@ -84,13 +83,11 @@ int main(int argc, char **argv) { auto out_string = tokenizer.detokenize({out_token}); auto [not_end, output_string] = tokenizer.postprocess(out_string); if (!not_end) { return false; } - if (chunk_id == chunk_num - 1) { // print the output of the last chunk std::cout << output_string << std::flush; } return true; }); - Module::isFirstChunk = false; } @@ -107,30 +104,23 @@ int main(int argc, char **argv) { .top_p = 0.f, .is_padding = false, }; - isSwitched = false; decoding_model.generate(chunked_tensors.back(), decoding_opt, [&](unsigned int out_token) -> bool { - if (!isSwitched) { - // turn off switching + if (!isSwitched) { // turn off switching static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); isSwitched = true; } auto out_string = tokenizer.detokenize({out_token}); - auto [isOk, print_string] = tokenizer.postprocess(out_string); - if (isOk) { - std::cout << print_string << std::flush; - } else { - return false; - } + auto [not_end, output_string] = tokenizer.postprocess(out_string); + if (!not_end) { return false; } + std::cout << output_string << std::flush; return true; }); - std::cout << "\n---------------" << std::endl; + // turn on switching, set sequence length and execution type static_cast(Backend::global_backends[MLLM_CPU])->setSequenceLength(0); static_cast(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT); static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); - model.clear_kvcache(); - decoding_model.clear_kvcache(); + std::cout << "\n"; } -} -#endif \ No newline at end of file +} \ No newline at end of file diff --git a/examples/demo_qwen2.5_npu.cpp b/examples/demo_qwen2.5_npu.cpp index 77c0356a..af6b2e34 100644 --- a/examples/demo_qwen2.5_npu.cpp +++ b/examples/demo_qwen2.5_npu.cpp @@ -1,4 +1,3 @@ -#ifdef USE_QNN #include "backends/cpu/CPUBackend.hpp" #include "cmdline.h" #include "models/qwen/configuration_qwen.hpp" @@ -89,5 +88,4 @@ int main(int argc, char **argv) { }); std::cout << "\n---------------" << std::endl; } -} -#endif \ No newline at end of file +} \ No newline at end of file diff --git a/examples/demo_qwen_npu.cpp b/examples/demo_qwen_npu.cpp index 2ab916c7..04875c30 100644 --- a/examples/demo_qwen_npu.cpp +++ b/examples/demo_qwen_npu.cpp @@ -1,4 +1,3 @@ -#ifdef USE_QNN #include "backends/cpu/CPUBackend.hpp" #include "cmdline.h" #include 
"models/qwen/configuration_qwen.hpp" @@ -89,5 +88,4 @@ int main(int argc, char **argv) { }); std::cout << "\n---------------" << std::endl; } -} -#endif \ No newline at end of file +} \ No newline at end of file diff --git a/scripts/build_android_app.sh b/scripts/build_android_app.sh index f2211c6f..f11bab92 100755 --- a/scripts/build_android_app.sh +++ b/scripts/build_android_app.sh @@ -21,9 +21,9 @@ cmake .. \ make mllm_lib -j16 -# # 2. copy libs -# cp ./libmllm_lib.a ../android/app/src/main/cpp/libs/ +# 2. copy libs +cp ./libmllm_lib.a ../android/app/src/main/cpp/libs/ -# # 3. build android apk -# cd ../android || exit -# ./gradlew assembleDebug \ No newline at end of file +# 3. build android apk +cd ../android || exit +./gradlew assembleDebug \ No newline at end of file diff --git a/scripts/run_phonelm_qnn.sh b/scripts/run_phonelm_qnn.sh new file mode 100755 index 00000000..a3f814de --- /dev/null +++ b/scripts/run_phonelm_qnn.sh @@ -0,0 +1,44 @@ +!/bin/bash + +adb shell mkdir -p /data/local/tmp/mllm/vocab +adb shell mkdir -p /data/local/tmp/mllm/qnn-lib + +adb push ../vocab/phonelm_vocab.mllm /data/local/tmp/mllm/vocab/ + + +if ! adb shell [ -f "/data/local/tmp/mllm/models/PhoneLM-1.5B-Instruct-128.mllm" ]; then + adb push ../models/PhoneLM-1.5B-Instruct-128.mllm "/data/local/tmp/mllm/models/PhoneLM-1.5B-Instruct-128.mllm" +else + echo "PhoneLM-1.5B-Instruct-128 file already exists" +fi + + +if ! adb shell [ -f "/data/local/tmp/mllm/models//phonelm-1.5b-instruct-q4_0_4_4.mllm" ]; then + adb push ../models//phonelm-1.5b-instruct-q4_0_4_4.mllm "/data/local/tmp/mllm/models//phonelm-1.5b-instruct-q4_0_4_4.mllm" +else + echo "/phonelm-1.5b-instruct-q4_0_4_4.mllm file already exists" +fi + +LIBPATH=../src/backends/qnn/qualcomm_ai_engine_direct_220/ +ANDR_LIB=$LIBPATH/lib/aarch64-android +OP_PATH=../src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/build +DEST=/data/local/tmp/mllm/qnn-lib + +adb push $ANDR_LIB/libQnnHtp.so $DEST +adb push $ANDR_LIB/libQnnHtpV75Stub.so $DEST +adb push $ANDR_LIB/libQnnHtpPrepare.so $DEST +adb push $ANDR_LIB/libQnnHtpProfilingReader.so $DEST +adb push $ANDR_LIB/libQnnHtpOptraceProfilingReader.so $DEST +adb push $ANDR_LIB/libQnnHtpV75CalculatorStub.so $DEST +adb push $LIBPATH/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so $DEST +adb push $OP_PATH/aarch64-android/libQnnLLaMAPackage.so $DEST/libQnnLLaMAPackage_CPU.so +adb push $OP_PATH/hexagon-v75/libQnnLLaMAPackage.so $DEST/libQnnLLaMAPackage_HTP.so + + +if [ $? 
-ne 0 ]; then + echo "adb push failed" + exit 1 +fi + +adb push ../bin-arm/demo_phonelm_npu /data/local/tmp/mllm/bin/ +adb shell "cd /data/local/tmp/mllm/bin && export LD_LIBRARY_PATH=/data/local/tmp/mllm/qnn-lib && export ADSP_LIBRARY_PATH=/data/local/tmp/mllm/qnn-lib && ./demo_phonelm_npu" \ No newline at end of file diff --git a/src/Generate.hpp b/src/Generate.hpp index 4ae9a863..4eb669d8 100644 --- a/src/Generate.hpp +++ b/src/Generate.hpp @@ -53,6 +53,7 @@ class _LlmTextGenerateMethod { bool is_padding = false; int seq_before_padding = 0; int chunk_size = -1; + public: virtual ~_LlmTextGenerateMethod() = default; virtual unsigned int generate(Tensor &t) = 0; @@ -174,6 +175,13 @@ class LlmTextGenerator { return m_method_class->generate(t); } + inline unsigned int generate(Tensor &t, const LlmTextGeneratorOpts &opt) { + if (opt.is_padding) { + m_method_class->setPadding(opt.is_padding, opt.seq_before_padding, opt.chunk_size); + } + return m_method_class->generate(t); + } + inline LLmTextGeneratorType type() { return m_type; } diff --git a/src/Op.hpp b/src/Op.hpp index be04de18..47a4b721 100644 --- a/src/Op.hpp +++ b/src/Op.hpp @@ -116,12 +116,12 @@ class Op { } virtual int getCacheSeqLen() { - assert(type_ == OpType::KVCACHE); + assert(type_ == OpType::KVCACHE || type_ == OpType::KVCACHENPU); std::cout << "only for KVCache" << std::endl; return -1; } virtual void clearCache() { - assert(type_ == OpType::KVCACHE || type_ == OpType::IROPE || type_ == OpType::ROPE); + assert(type_ == OpType::KVCACHE || type_ == OpType::KVCACHENPU || type_ == OpType::IROPE || type_ == OpType::ROPE); std::cout << "only for KVCache" << std::endl; } diff --git a/src/backends/cpu/CPUBackend.hpp b/src/backends/cpu/CPUBackend.hpp index 75f96412..c808db80 100644 --- a/src/backends/cpu/CPUBackend.hpp +++ b/src/backends/cpu/CPUBackend.hpp @@ -32,7 +32,7 @@ class CPUBackend final : public Backend { static int cpu_threads; -#ifdef USE_QNN + // #ifdef USE_QNN void setSequenceLength(int sequence_length) { sequence_length_ = sequence_length; } @@ -51,15 +51,15 @@ class CPUBackend final : public Backend { ExecutionType getExecutionType() { return execution_type; } -#endif + // #endif private: std::map map_creator_; std::map map_function_; -#ifdef USE_QNN + // #ifdef USE_QNN int sequence_length_ = 0; bool isSwitchingStage = false; ExecutionType execution_type = PROMPT; -#endif + // #endif }; } // namespace mllm diff --git a/src/models/phonelm/modeling_phonelm_npu.hpp b/src/models/phonelm/modeling_phonelm_npu.hpp index 27f44fb6..295bdab7 100644 --- a/src/models/phonelm/modeling_phonelm_npu.hpp +++ b/src/models/phonelm/modeling_phonelm_npu.hpp @@ -135,12 +135,6 @@ class PhoneLMQKVmm final : public Module { return {o}; } - vector get_cache() { - return {&k_cache, &v_cache}; - } - vector get_rope() { - return {&q_rope, &k_rope}; - } }; // QNN mlp part @@ -421,9 +415,6 @@ class PhoneLMNPU_CPUDecoder final : public Module { return {x}; } - PhoneLMQKVmm &get_mm() { - return qkv_mm; - } }; class PhoneLMNPU_CPUDecoderWithShadow final : public Module { @@ -492,9 +483,6 @@ class PhoneLMNPU_CPUDecoderWithShadow final : public Module { return {x}; } - PhoneLMQKVmm &get_mm() { - return qkv_mm; - } }; // Copied from GemmaModel with Gemma->PhoneLM and set RmsNorm(without add_unit_offset) @@ -537,18 +525,6 @@ class PhoneLMModel_NPU final : public Module { return {x}; } - void clear_kvcache() override { - for (auto &block : blocks) { - auto decoder = dynamic_cast(block.get()); - if (decoder) { - auto kvcache = decoder->get_mm().get_cache(); 
- for (auto &cache : kvcache) { cache->clearCache(); } - auto ropes = decoder->get_mm().get_rope(); - for (auto &rope : ropes) { rope->clearCache(); } - } - } - } - private: std::vector> blocks; Layer norm; @@ -606,15 +582,12 @@ class PhoneLMForCausalLM_NPU final : public Module { for (int step = 0; step < opt.max_new_tokens; ++step) { auto _out = (*this)({input_ids}); - auto out_token = text_generator_->generate(_out[0]); + auto out_token = text_generator_->generate(_out[0], opt); if (!call_back(out_token)) break; chatPostProcessing(out_token, input_ids, {}); return; } } - void clear_kvcache() override { - model.clear_kvcache(); - } private: int hidden_size; diff --git a/src/models/qwen/modeling_qwen_npu.hpp b/src/models/qwen/modeling_qwen_npu.hpp index c051170a..acc4d7e3 100644 --- a/src/models/qwen/modeling_qwen_npu.hpp +++ b/src/models/qwen/modeling_qwen_npu.hpp @@ -586,7 +586,7 @@ class QWenForCausalLM_NPU final : public Module { for (int step = 0; step < opt.max_new_tokens; ++step) { auto _out = (*this)({input_ids}); - auto out_token = text_generator_->generate(_out[0]); + auto out_token = text_generator_->generate(_out[0], opt); if (!call_back(out_token)) break; chatPostProcessing(out_token, input_ids, {}); return;
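Editor's note on the padded-generation API introduced above: src/Generate.hpp gains a LlmTextGenerator::generate(Tensor &, const LlmTextGeneratorOpts &) overload that forwards is_padding, seq_before_padding and chunk_size to the active generation method via setPadding() before sampling, and the PhoneLM/QWen NPU models now call it from their generate() loops. The sketch below illustrates the intended call pattern only; the prefill_step wrapper, the concrete option values and the include path are illustrative assumptions, not part of the patch.

#include "Generate.hpp" // LlmTextGenerator / LlmTextGeneratorOpts (overload added by this patch)

using namespace mllm; // the demo programs use the mllm types unqualified

// Hypothetical helper: sample the first token from prefill logits computed on a
// chunk-padded prompt. real_seq_length and chunk_size are the values returned by
// tokenizePaddingByChunk() in examples/demo_phonelm_npu.cpp.
unsigned int prefill_step(LlmTextGenerator &generator, Tensor &logits,
                          int real_seq_length, int chunk_size) {
    LlmTextGeneratorOpts opt{};               // value-initialize all options
    opt.max_new_tokens = 1;                   // prefill only needs the next token
    opt.do_sample = false;                    // greedy decoding, as in the demos
    opt.is_padding = true;                    // tell the sampler the prompt was padded
    opt.seq_before_padding = real_seq_length; // where the real prompt ends
    opt.chunk_size = chunk_size;
    // The new overload calls setPadding(is_padding, seq_before_padding, chunk_size)
    // on the generation method and then samples as before.
    return generator.generate(logits, opt);
}

This mirrors what PhoneLMForCausalLM_NPU::generate and QWenForCausalLM_NPU::generate do after this patch: the whole opt is passed through, presumably so the sampler reads the logits at the pre-padding position rather than at the padded tail.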