From e428bb5a9a4964785a078342e81de0d6546db815 Mon Sep 17 00:00:00 2001 From: oreomaker Date: Fri, 8 Nov 2024 16:03:42 +0800 Subject: [PATCH 1/4] dev: qnn multi input inference developing --- examples/demo_phonelm_npu.cpp | 23 +++++++++++++++++---- examples/demo_qwen_npu.cpp | 4 ++-- examples/main_phonelm_npu.cpp | 12 +++++------ examples/main_qwen_npu.cpp | 12 +++++------ include/Types.hpp | 2 +- src/backends/cpu/CPUBackend.hpp | 15 ++++++++++---- src/backends/cpu/CPUIRoPE.cpp | 8 +++++++ src/backends/cpu/CPUKVCacheNPU.cpp | 12 ++++++++--- src/backends/cpu/CPURoPE.cpp | 7 +++++++ src/backends/cpu/CPUSoftMax.cpp | 2 +- src/backends/qnn/QNNExecutor.hpp | 2 +- src/models/phonelm/modeling_phonelm_npu.hpp | 6 +++++- 12 files changed, 76 insertions(+), 29 deletions(-) diff --git a/examples/demo_phonelm_npu.cpp b/examples/demo_phonelm_npu.cpp index 6d774230..d4ce1484 100644 --- a/examples/demo_phonelm_npu.cpp +++ b/examples/demo_phonelm_npu.cpp @@ -1,3 +1,4 @@ +#include "Types.hpp" #ifdef USE_QNN #include "backends/cpu/CPUBackend.hpp" #include "cmdline.h" @@ -31,6 +32,8 @@ int main(int argc, char **argv) { vector in_strs = { "Give me a short introduction to large language model.", + "What is the Beijing University of Posts and Telecommunications.", + "What is the meaning of life?", }; for (int i = 0; i < in_strs.size(); ++i) { @@ -50,7 +53,13 @@ int main(int argc, char **argv) { .is_padding = true, .seq_before_padding = real_seq_length, }; + bool isSwitched = false; model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool { + if (i != 0 && !isSwitched) { + // turn off switching + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); + isSwitched = true; + } auto out_string = tokenizer.detokenize({out_token}); auto [not_end, output_string] = tokenizer.postprocess(out_string); if (!not_end) { return false; } @@ -58,8 +67,10 @@ int main(int argc, char **argv) { return true; }); + // turn on switching, set sequence length and execution type static_cast(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length); - static_cast(Backend::global_backends[MLLM_CPU])->switchDecodeTag(); + static_cast(Backend::global_backends[MLLM_CPU])->setExecutionType(AUTOREGRESSIVE); + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); LlmTextGeneratorOpts decoding_opt{ .max_new_tokens = 100, @@ -69,11 +80,11 @@ int main(int argc, char **argv) { .top_p = 0.f, .is_padding = false, }; - bool isSwitched = false; + isSwitched = false; decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool { - // call only once of switchDecodeTag if (!isSwitched) { - static_cast(Backend::global_backends[MLLM_CPU])->switchDecodeTag(); + // turn off switching + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); isSwitched = true; } auto out_string = tokenizer.detokenize({out_token}); @@ -86,6 +97,10 @@ int main(int argc, char **argv) { return true; }); std::cout << "\n---------------" << std::endl; + // turn on switching, set sequence length and execution type + static_cast(Backend::global_backends[MLLM_CPU])->setSequenceLength(0); + static_cast(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT); + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); } } #endif \ No newline at end of file diff --git a/examples/demo_qwen_npu.cpp b/examples/demo_qwen_npu.cpp index 421e7d0e..2ab916c7 100644 --- a/examples/demo_qwen_npu.cpp +++ b/examples/demo_qwen_npu.cpp @@ -61,7 +61,7 @@ int main(int argc, char **argv) { }); 
static_cast(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length); - static_cast(Backend::global_backends[MLLM_CPU])->switchDecodeTag(); + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); LlmTextGeneratorOpts decoding_opt{ .max_new_tokens = 100, @@ -75,7 +75,7 @@ int main(int argc, char **argv) { decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool { // call only once of switchDecodeTag if (!isSwitched) { - static_cast(Backend::global_backends[MLLM_CPU])->switchDecodeTag(); + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); isSwitched = true; } auto out_string = tokenizer.detokenize({out_token}); diff --git a/examples/main_phonelm_npu.cpp b/examples/main_phonelm_npu.cpp index 0cff3898..af0b6186 100644 --- a/examples/main_phonelm_npu.cpp +++ b/examples/main_phonelm_npu.cpp @@ -190,11 +190,11 @@ int main(int argc, char **argv) { auto inter_cpu_backend = dynamic_cast(interNet.backends()[MLLM_CPU].get()); auto decode_cpu_backend = dynamic_cast(cpuNet.backends()[MLLM_CPU].get()); prefill_cpu_backend->setSequenceLength(real_seq_length); - prefill_cpu_backend->switchDecodeTag(); + prefill_cpu_backend->toggleSwitching(); inter_cpu_backend->setSequenceLength(real_seq_length); - inter_cpu_backend->switchDecodeTag(); + inter_cpu_backend->toggleSwitching(); decode_cpu_backend->setSequenceLength(real_seq_length); - decode_cpu_backend->switchDecodeTag(); + decode_cpu_backend->toggleSwitching(); // // 2: Decoding stage using CPU execute for (int step = real_seq_length; step < real_seq_length + 100; step++) { @@ -210,9 +210,9 @@ int main(int argc, char **argv) { std::cout << out_token << std::flush; if (step == real_seq_length) { - prefill_cpu_backend->switchDecodeTag(); - inter_cpu_backend->switchDecodeTag(); - decode_cpu_backend->switchDecodeTag(); + prefill_cpu_backend->toggleSwitching(); + inter_cpu_backend->toggleSwitching(); + decode_cpu_backend->toggleSwitching(); } } } while (false); diff --git a/examples/main_qwen_npu.cpp b/examples/main_qwen_npu.cpp index b096100b..4adc7fa7 100644 --- a/examples/main_qwen_npu.cpp +++ b/examples/main_qwen_npu.cpp @@ -187,11 +187,11 @@ int main(int argc, char **argv) { auto inter_cpu_backend = dynamic_cast(interNet.backends()[MLLM_CPU].get()); auto decode_cpu_backend = dynamic_cast(cpuNet.backends()[MLLM_CPU].get()); prefill_cpu_backend->setSequenceLength(real_seq_length); - prefill_cpu_backend->switchDecodeTag(); + prefill_cpu_backend->toggleSwitching(); inter_cpu_backend->setSequenceLength(real_seq_length); - inter_cpu_backend->switchDecodeTag(); + inter_cpu_backend->toggleSwitching(); decode_cpu_backend->setSequenceLength(real_seq_length); - decode_cpu_backend->switchDecodeTag(); + decode_cpu_backend->toggleSwitching(); // // 2: Decoding stage using CPU execute for (int step = real_seq_length; step < real_seq_length + 100; step++) { @@ -207,9 +207,9 @@ int main(int argc, char **argv) { std::cout << out_token << std::flush; if (step == real_seq_length) { - prefill_cpu_backend->switchDecodeTag(); - inter_cpu_backend->switchDecodeTag(); - decode_cpu_backend->switchDecodeTag(); + prefill_cpu_backend->toggleSwitching(); + inter_cpu_backend->toggleSwitching(); + decode_cpu_backend->toggleSwitching(); } } } while (false); diff --git a/include/Types.hpp b/include/Types.hpp index 70cd9ae4..3ce92ff4 100644 --- a/include/Types.hpp +++ b/include/Types.hpp @@ -140,7 +140,7 @@ enum RoPEType { MLAROPE = 5, }; -enum QNNExecutionType { +enum ExecutionType { PROMPT = 0, AUTOREGRESSIVE = 1, 
}; diff --git a/src/backends/cpu/CPUBackend.hpp b/src/backends/cpu/CPUBackend.hpp index 816935d9..75f96412 100644 --- a/src/backends/cpu/CPUBackend.hpp +++ b/src/backends/cpu/CPUBackend.hpp @@ -39,11 +39,17 @@ class CPUBackend final : public Backend { int getSequenceLength() { return sequence_length_; } - void switchDecodeTag() { - isPrefillToDecode = !isPrefillToDecode; + void toggleSwitching() { + isSwitchingStage = !isSwitchingStage; } bool isStageSwitching() { - return isPrefillToDecode; + return isSwitchingStage; + } + void setExecutionType(ExecutionType type) { + execution_type = type; + } + ExecutionType getExecutionType() { + return execution_type; } #endif private: @@ -51,7 +57,8 @@ class CPUBackend final : public Backend { std::map map_function_; #ifdef USE_QNN int sequence_length_ = 0; - bool isPrefillToDecode = false; + bool isSwitchingStage = false; + ExecutionType execution_type = PROMPT; #endif }; diff --git a/src/backends/cpu/CPUIRoPE.cpp b/src/backends/cpu/CPUIRoPE.cpp index 46562df2..12b4ba39 100644 --- a/src/backends/cpu/CPUIRoPE.cpp +++ b/src/backends/cpu/CPUIRoPE.cpp @@ -362,6 +362,14 @@ void CPUIRoPE::rope_mla(shared_ptr input, shared_ptr output){ } ErrorCode CPUIRoPE::execute(vector> inputs, vector> outputs) { + // if use QNN, when a new prompt input, the seq should be reset to 0 here as the setUp is not called +#ifdef USE_QNN + auto cpuBackend = dynamic_cast(backend_); + if (cpuBackend->isStageSwitching() && cpuBackend->getExecutionType() == PROMPT) { + h_cnt_ = 0; + } +#endif + auto &input = inputs[0]; auto &output = outputs[0]; auto out_dtype = output->dtype(); diff --git a/src/backends/cpu/CPUKVCacheNPU.cpp b/src/backends/cpu/CPUKVCacheNPU.cpp index 582a52e8..ae2c24bd 100644 --- a/src/backends/cpu/CPUKVCacheNPU.cpp +++ b/src/backends/cpu/CPUKVCacheNPU.cpp @@ -34,7 +34,7 @@ ErrorCode CPUKVCacheNPU::reshape(vector> inputs, vector(backend_); - if (cpuBackend->isStageSwitching()) { + if (cpuBackend->isStageSwitching() && cpuBackend->getExecutionType() == AUTOREGRESSIVE) { cache_seq_len_ = cpuBackend->getSequenceLength(); isDecoding = true; } @@ -57,9 +57,16 @@ ErrorCode CPUKVCacheNPU::load(AbstructLoader &loader) { } ErrorCode CPUKVCacheNPU::execute(vector> inputs, vector> outputs) { + // if a new prompt is given, the cache should be updated + auto cpuBackend = dynamic_cast(backend_); + if (cpuBackend->isStageSwitching() && cpuBackend->getExecutionType() == PROMPT) { + isDecoding = false; + cache_seq_len_ = 0; + outputs[0]->deepCopyFrom(cache_, false, {0, 0, cache_seq_len_ / cache_limit_, 0}); + } + // when decoding, the input will deepCopy from cache, no need to execute if (isDecoding) { - int cache_seq_len_old = cache_seq_len_; cache_seq_len_ += inputs[0]->sequence(); return MLLM_NO_ERROR; } @@ -125,7 +132,6 @@ ErrorCode CPUKVCacheNPU::execute(vector> inputs, vectorsequence(); return Op::execute(inputs, outputs); diff --git a/src/backends/cpu/CPURoPE.cpp b/src/backends/cpu/CPURoPE.cpp index 493b2847..b00ab8aa 100644 --- a/src/backends/cpu/CPURoPE.cpp +++ b/src/backends/cpu/CPURoPE.cpp @@ -331,6 +331,13 @@ void CPURoPE::rope_mla(shared_ptr input, shared_ptr output) { } ErrorCode CPURoPE::execute(vector> inputs, vector> outputs) { + // if use QNN, when a new prompt input, the seq should be reset to 0 here as the setUp is not called +#ifdef USE_QNN + auto cpuBackend = dynamic_cast(backend_); + if (cpuBackend->isStageSwitching() && cpuBackend->getExecutionType() == PROMPT) { + h_cnt_ = 0; + } +#endif auto &input = inputs[0]; auto &output = outputs[0]; auto out_dtype = 
output->dtype(); diff --git a/src/backends/cpu/CPUSoftMax.cpp b/src/backends/cpu/CPUSoftMax.cpp index 45545eaa..d9c5c372 100644 --- a/src/backends/cpu/CPUSoftMax.cpp +++ b/src/backends/cpu/CPUSoftMax.cpp @@ -37,7 +37,7 @@ ErrorCode CPUSoftMax::execute(vector> inputs, vectordimension() - input->sequence(); -#elif defined(USE_QNN) // TODO: 就前端才用到,一定要记得删除 +#elif defined(USE_QNN) // TODO: 旧前端才用到,一定要记得删除 old_dim = input->dimension() - input->sequence(); #endif } diff --git a/src/backends/qnn/QNNExecutor.hpp b/src/backends/qnn/QNNExecutor.hpp index 6d335c08..5f29d15e 100644 --- a/src/backends/qnn/QNNExecutor.hpp +++ b/src/backends/qnn/QNNExecutor.hpp @@ -63,7 +63,7 @@ class QNNExecutor : public Executor { protected: bool isSetup_ = false; - QNNExecutionType executionType_ = PROMPT; + ExecutionType executionType_ = PROMPT; }; class QNNPipelineExecutor : public QNNExecutor { diff --git a/src/models/phonelm/modeling_phonelm_npu.hpp b/src/models/phonelm/modeling_phonelm_npu.hpp index 29554c32..0b03cb1d 100644 --- a/src/models/phonelm/modeling_phonelm_npu.hpp +++ b/src/models/phonelm/modeling_phonelm_npu.hpp @@ -124,7 +124,11 @@ class PhoneLMQKVmm final : public Module { v = v_cache(v); auto qk = Tensor::mm(q, k.transpose(Chl::SEQUENCE, Chl::DIMENSION)); - qk = softmax(qk, k_cache.getCacheSeqLen()); + if (k_cache.ready() && v_cache.ready()) { + qk = softmax(qk, k_cache.getCacheSeqLen()); + } else { + qk = softmax(qk); + } auto o = Tensor::mm(qk, v); o = o_quantize(o); From f94937d6cdfe6256292bc0d44fab7389c99915d0 Mon Sep 17 00:00:00 2001 From: oreomaker Date: Wed, 13 Nov 2024 17:03:30 +0800 Subject: [PATCH 2/4] feat: qnn multi chunk prefilling in new frontend refactor: qnn module setup skip in following chunks todo: multi input --- examples/demo_phonelm_npu.cpp | 66 ++++++++++++++++------- include/Types.hpp | 1 + src/Generate.hpp | 13 +++-- src/Module.cpp | 3 ++ src/Module.hpp | 15 ++++++ src/backends/cpu/CPUKVCacheNPU.cpp | 12 ++--- src/backends/cpu/CPURoPE.cpp | 7 --- src/backends/xnnpack/third_party/fmt | 1 + src/models/smollm/tokenization_smollm.hpp | 52 ++++++++++++++++++ 9 files changed, 133 insertions(+), 37 deletions(-) create mode 160000 src/backends/xnnpack/third_party/fmt diff --git a/examples/demo_phonelm_npu.cpp b/examples/demo_phonelm_npu.cpp index d4ce1484..7f568f58 100644 --- a/examples/demo_phonelm_npu.cpp +++ b/examples/demo_phonelm_npu.cpp @@ -1,4 +1,6 @@ +#include "Module.hpp" #include "Types.hpp" +#include #ifdef USE_QNN #include "backends/cpu/CPUBackend.hpp" #include "cmdline.h" @@ -15,12 +17,14 @@ int main(int argc, char **argv) { cmdParser.add("model", 'm', "specify mllm model path", false, "../models/PhoneLM-1.5B-Instruct-128.mllm"); cmdParser.add("limits", 'l', "max KV cache size", false, 400); cmdParser.add("thread", 't', "num of threads", false, 4); + cmdParser.add("chunk", 'c', "chunk size", false, 64); cmdParser.parse_check(argc, argv); string vocab_path = cmdParser.get("vocab"); string merge_path = cmdParser.get("merge"); string model_path = cmdParser.get("model"); int tokens_limit = cmdParser.get("limits"); + int chunk_size = cmdParser.get("chunk"); CPUBackend::cpu_threads = cmdParser.get("thread"); auto tokenizer = SmolLMTokenizer(vocab_path, merge_path); @@ -36,36 +40,57 @@ int main(int argc, char **argv) { "What is the meaning of life?", }; + // turn on the multi-chunk prefilling + Module::isMultiChunkPrefilling = true; + for (int i = 0; i < in_strs.size(); ++i) { auto input_str = tokenizer.apply_chat_template(in_strs[i]); - auto [real_seq_length, input_tensor] = 
tokenizer.tokenizeWithPadding(input_str, 64, config.vocab_size); - std::cout << real_seq_length << endl; - std::cout << input_str << std::endl; + auto [real_seq_length, input_tensor] = tokenizer.tokenizePaddingByChunk(input_str, chunk_size, config.vocab_size); + + const int seq_length_padding = (chunk_size - real_seq_length % chunk_size) + real_seq_length; + const int chunk_num = seq_length_padding / chunk_size; + bool isSwitched = false; + + std::cout << "real seq length: " << real_seq_length << " padding to: " << seq_length_padding << " chunk num: " << chunk_num << std::endl; + std::cout << "[Q] " << in_strs[i] << std::endl; std::cout << "[A] " << std::flush; + // tensor vectors to save the chunked tensors of the QNN prefilling input + vector chunked_tensors(chunk_num); LlmTextGeneratorOpts opt{ .max_new_tokens = 1, .do_sample = false, - .temperature = 0.3f, - .top_k = 50, - .top_p = 0.f, .is_padding = true, .seq_before_padding = real_seq_length, + .chunk_size = chunk_size, }; - bool isSwitched = false; - model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool { - if (i != 0 && !isSwitched) { - // turn off switching - static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); - isSwitched = true; - } - auto out_string = tokenizer.detokenize({out_token}); - auto [not_end, output_string] = tokenizer.postprocess(out_string); - if (!not_end) { return false; } - std::cout << output_string << std::flush; - return true; - }); + + for (int chunk_id = 0; chunk_id < chunk_num; ++chunk_id) { + chunked_tensors[chunk_id].setBackend(Backend::global_backends[MLLM_CPU]); + chunked_tensors[chunk_id].setTtype(INPUT_TENSOR); + chunked_tensors[chunk_id].reshape(1, 1, chunk_size, 1); + chunked_tensors[chunk_id].setName("input-chunk-" + to_string(chunk_id)); + chunked_tensors[chunk_id].deepCopyFrom(&input_tensor, false, {0, 0, chunk_id * chunk_size, 0}); + + model.generate(chunked_tensors[chunk_id], opt, [&](unsigned int out_token) -> bool { + if (i != 0 && !isSwitched && chunk_id == 0) { + // turn off switching at the first chunk of following inputs + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); + isSwitched = true; + } + auto out_string = tokenizer.detokenize({out_token}); + auto [not_end, output_string] = tokenizer.postprocess(out_string); + if (!not_end) { return false; } + + if (chunk_id == chunk_num - 1) { // print the output of the last chunk + std::cout << output_string << std::flush; + } + return true; + }); + + Module::isFirstChunk = false; + } // turn on switching, set sequence length and execution type static_cast(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length); @@ -80,8 +105,9 @@ int main(int argc, char **argv) { .top_p = 0.f, .is_padding = false, }; + isSwitched = false; - decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool { + decoding_model.generate(chunked_tensors.back(), decoding_opt, [&](unsigned int out_token) -> bool { if (!isSwitched) { // turn off switching static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); diff --git a/include/Types.hpp b/include/Types.hpp index 9b634493..040e6712 100644 --- a/include/Types.hpp +++ b/include/Types.hpp @@ -36,6 +36,7 @@ enum TensorStatus { // TENSOR_DYNAMIC, TENSOR_STATIC_INIT, TENSOR_STATIC_READY, + TENSOR_UNDEFINED, }; enum ErrorCode { diff --git a/src/Generate.hpp b/src/Generate.hpp index 7c5b11b9..4ae9a863 100644 --- a/src/Generate.hpp +++ b/src/Generate.hpp @@ -29,6 +29,7 @@ struct LlmTextGeneratorOpts { float top_p = 0.92; 
bool is_padding = false; int seq_before_padding = 0; + int chunk_size = -1; }; template @@ -51,12 +52,14 @@ enum class LLmTextGeneratorType : int32_t { class _LlmTextGenerateMethod { bool is_padding = false; int seq_before_padding = 0; + int chunk_size = -1; public: virtual ~_LlmTextGenerateMethod() = default; virtual unsigned int generate(Tensor &t) = 0; - inline void setPadding(bool is_padding, int seq_before_padding) { + inline void setPadding(bool is_padding, int seq_before_padding, int chunk_size) { this->is_padding = is_padding; this->seq_before_padding = seq_before_padding; + this->chunk_size = chunk_size; } inline void _tensor_to_vec(Tensor &t, std::vector &scores) { assert(t.batch() == 1 && "Batch size of result is not 1. Which is not supported for now."); @@ -65,7 +68,11 @@ class _LlmTextGenerateMethod { int _seq = t.sequence() - 1; // padding prefill for QNN if (is_padding) { - _seq = seq_before_padding - 1; + if (chunk_size > 0) { + _seq = (seq_before_padding - 1) % chunk_size; + } else { + _seq = seq_before_padding - 1; + } } for (int i = 0; i < _dims; ++i) { auto value = t.dataAt(0, 0, _seq, i); @@ -159,7 +166,7 @@ class LlmTextGenerator { // padding prefill for QNN if (opt.is_padding) { - m_method_class->setPadding(opt.is_padding, opt.seq_before_padding); + m_method_class->setPadding(opt.is_padding, opt.seq_before_padding, opt.chunk_size); } } diff --git a/src/Module.cpp b/src/Module.cpp index 942e9941..95479179 100644 --- a/src/Module.cpp +++ b/src/Module.cpp @@ -13,6 +13,9 @@ namespace mllm { // The llm_model_ptr is a pointer to the outmost module Module *Module::llm_model_ptr; +bool Module::isMultiChunkPrefilling = false; +bool Module::isFirstChunk = true; + int Module::listIdx; int Module::runlistIdx; // TensorStatus Tensor::tensor_status; diff --git a/src/Module.hpp b/src/Module.hpp index 92f5e48e..69256561 100644 --- a/src/Module.hpp +++ b/src/Module.hpp @@ -39,6 +39,10 @@ class Module { bool doLoad = false; static Module *llm_model_ptr; + // tag to indicate the multi-chunk prefilling + static bool isMultiChunkPrefilling; + // tag to indicate the first chunk + static bool isFirstChunk; static int listIdx; static int runlistIdx; @@ -188,6 +192,11 @@ class Module { if (inputs[0].sequence() != 1 && !last_shape_bshd_.empty()) { // if LLM/VLLM model, the `need_setup` should be `true` if (input.batch() == last_shape_bshd_[i][0] & input.sequence() == last_shape_bshd_[i][1] & input.head() == last_shape_bshd_[i][2] & input.dimension() == last_shape_bshd_[i][3]) { + // if it is the QNN multi-chunk prefilling, the `need_setup` should be `true` to reshape & setUp CPU Ops + if (Module::isMultiChunkPrefilling) { + need_setup = true; + break; + } need_setup = false; } } @@ -215,6 +224,12 @@ class Module { } else { // inner Modules // offload according to the backends' info inited during loading if (Tensor::tensor_status == TENSOR_STATIC_INIT && device_ != MLLM_CPU) { // backend specific module reshape & setup + if (Module::isMultiChunkPrefilling && !Module::isFirstChunk) { // set to TENSOR_UNDEFINED and SKIP executing qnn layers + Tensor::tensor_status = TENSOR_UNDEFINED; + auto outputs = Forward(inputs, anyArgs); + Tensor::tensor_status = TENSOR_STATIC_INIT; + return outputs; + } auto inputs_vec = vector>(); auto outputs_vec = vector>(); for (auto &i : inputs) { diff --git a/src/backends/cpu/CPUKVCacheNPU.cpp b/src/backends/cpu/CPUKVCacheNPU.cpp index 06d32ef8..c7b24236 100644 --- a/src/backends/cpu/CPUKVCacheNPU.cpp +++ b/src/backends/cpu/CPUKVCacheNPU.cpp @@ -38,6 +38,11 @@ 
ErrorCode CPUKVCacheNPU::reshape(vector> inputs, vectorgetSequenceLength(); isDecoding = true; } + // if a new prompt is given, the cache should be updated + if (cpuBackend->isStageSwitching() && cpuBackend->getExecutionType() == PROMPT) { + cache_seq_len_ = cpuBackend->getSequenceLength(); + isDecoding = false; + } #endif outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence() + cache_seq_len_, inputs[0]->dimension()); @@ -56,13 +61,6 @@ ErrorCode CPUKVCacheNPU::load(AbstructLoader &loader) { } ErrorCode CPUKVCacheNPU::execute(vector> inputs, vector> outputs) { - // if a new prompt is given, the cache should be updated - auto cpuBackend = dynamic_cast(backend_); - if (cpuBackend->isStageSwitching() && cpuBackend->getExecutionType() == PROMPT) { - isDecoding = false; - cache_seq_len_ = 0; - outputs[0]->deepCopyFrom(cache_, false, {0, 0, cache_seq_len_ / cache_limit_, 0}); - } // when decoding, the input will deepCopy from cache, no need to execute if (isDecoding) { diff --git a/src/backends/cpu/CPURoPE.cpp b/src/backends/cpu/CPURoPE.cpp index 981c484f..2235dad6 100644 --- a/src/backends/cpu/CPURoPE.cpp +++ b/src/backends/cpu/CPURoPE.cpp @@ -331,13 +331,6 @@ void CPURoPE::rope_mla(shared_ptr input, shared_ptr output) { } ErrorCode CPURoPE::execute(vector> inputs, vector> outputs) { - // if use QNN, when a new prompt input, the seq should be reset to 0 here as the setUp is not called -#ifdef USE_QNN - auto cpuBackend = dynamic_cast(backend_); - if (cpuBackend->isStageSwitching() && cpuBackend->getExecutionType() == PROMPT) { - h_cnt_ = 0; - } -#endif auto &input = inputs[0]; auto &output = outputs[0]; auto out_dtype = output->dtype(); diff --git a/src/backends/xnnpack/third_party/fmt b/src/backends/xnnpack/third_party/fmt new file mode 160000 index 00000000..0c9fce2f --- /dev/null +++ b/src/backends/xnnpack/third_party/fmt @@ -0,0 +1 @@ +Subproject commit 0c9fce2ffefecfdce794e1859584e25877b7b592 diff --git a/src/models/smollm/tokenization_smollm.hpp b/src/models/smollm/tokenization_smollm.hpp index 23a7af98..44ed7251 100644 --- a/src/models/smollm/tokenization_smollm.hpp +++ b/src/models/smollm/tokenization_smollm.hpp @@ -229,6 +229,58 @@ class SmolLMTokenizer final : public BPETokenizer { return std::make_pair(realLength, Tokenizer::tokens2Input(ret)); } + // padding the input by neareast multiplication of chunk_size + std::pair tokenizePaddingByChunk(std::string &text, int chunk_size, int vocab_size) { + std::vector ret; + + if (split_special_tokens_) { + const auto word_collection = unicode_regex_split(text, regex_exprs); + for (auto &piece : word_collection) { + // look up table + // std::string token; + // for (auto b : UTF8(piece)) token += byte_encoder_[b]; + + // using bpe + std::vector tmp; + BPETokenizer::tokenize(piece, tmp, false, true, ""); + ret.insert(ret.end(), tmp.begin(), tmp.end() - 1); + } + } else { + auto parts = _splitWithDelimiters(text, special_tokens); + // for (auto p : parts) { + // std::cout << "\"" << p << "\"" << std::endl; + // } + for (auto &p : parts) { + if (std::find(special_tokens.begin(), special_tokens.end(), p) != special_tokens.end()) { + std::string token; + for (auto b : UTF8(p)) token += byte_encoder_[b]; + + std::vector tmp; + BPETokenizer::tokenize(token, tmp, false, special_tokens, true); + ret.insert(ret.end(), tmp.begin(), tmp.end() - 1); + } else { + const auto word_collection = unicode_regex_split(p, regex_exprs); + for (auto &piece : word_collection) { + // look up table + // std::string token; + // for (auto b : 
UTF8(piece)) token += byte_encoder_[b]; + + // using bpe + std::vector tmp; + BPETokenizer::tokenize(piece, tmp, false, true, ""); + assert(!tmp.empty()); + ret.insert(ret.end(), tmp.begin(), tmp.end() - 1); + } + } + } + } + + auto realLength = ret.size(); + int paddingLength = (chunk_size - realLength % chunk_size) % chunk_size; + ret.resize(realLength + paddingLength, vocab_size); + return std::make_pair(realLength, Tokenizer::tokens2Input(ret)); + } + std::string _byte_decode_(const std::string &text) { std::string ret; auto _ = ORD(text); From 5ff02cdb6a4c6eae939da9b0fd1b724282b56119 Mon Sep 17 00:00:00 2001 From: yirongjie Date: Thu, 14 Nov 2024 02:27:50 +0000 Subject: [PATCH 3/4] fix: clearCache in RoPE --- examples/demo_phonelm_npu.cpp | 2 + examples/demo_qwen2.5_npu.cpp | 4 +- scripts/build_android_app.sh | 10 +-- scripts/build_qnn_android.sh | 2 +- src/Layer.hpp | 8 +++ src/Op.hpp | 2 +- src/backends/cpu/CPUIRoPE.cpp | 72 +++++++++---------- src/backends/cpu/CPUIRoPE.hpp | 4 +- src/backends/cpu/CPURoPE.cpp | 2 +- src/backends/cpu/CPURoPE.hpp | 4 +- src/backends/xnnpack/third_party/fmt | 1 - src/models/dclm/modeling_dclm.hpp | 4 +- src/models/fuyu/modeling_fuyu.hpp | 6 +- src/models/llama/modeling_elastic_llama.hpp | 15 ++-- src/models/llama/modeling_llama.hpp | 8 +-- src/models/minicpm/modeling_minicpm.hpp | 8 +-- src/models/openelm/modeling_openelm.hpp | 4 +- src/models/opt/modeling_opt_qnn.hpp | 8 +-- src/models/opt/tokenization_opt.hpp | 2 +- src/models/phi3/modeling_phi3.hpp | 8 +-- src/models/phonelm/modeling_phonelm.hpp | 16 +++-- src/models/phonelm/modeling_phonelm_npu.hpp | 31 +++++++- src/models/qwen/modeling_qwen.hpp | 13 ++-- src/models/qwen/modeling_qwen_npu.hpp | 4 +- src/models/qwen/modeling_qwen_xp_sdpa.hpp | 4 +- src/models/smollm/modeling_smollm.hpp | 8 +-- src/models/stablelm/modeling_stablelm.hpp | 11 ++- .../transformer/modeling_transformer.hpp | 7 +- test/xnnpack/XpLlamaMHATest.cpp | 4 +- tools/jni/LibHelper.cpp | 8 +-- 30 files changed, 167 insertions(+), 113 deletions(-) delete mode 160000 src/backends/xnnpack/third_party/fmt diff --git a/examples/demo_phonelm_npu.cpp b/examples/demo_phonelm_npu.cpp index 42324cb4..6607585f 100644 --- a/examples/demo_phonelm_npu.cpp +++ b/examples/demo_phonelm_npu.cpp @@ -129,6 +129,8 @@ int main(int argc, char **argv) { static_cast(Backend::global_backends[MLLM_CPU])->setSequenceLength(0); static_cast(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT); static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); + model.clear_kvcache(); + decoding_model.clear_kvcache(); } } #endif \ No newline at end of file diff --git a/examples/demo_qwen2.5_npu.cpp b/examples/demo_qwen2.5_npu.cpp index f49fb33e..77c0356a 100644 --- a/examples/demo_qwen2.5_npu.cpp +++ b/examples/demo_qwen2.5_npu.cpp @@ -61,7 +61,7 @@ int main(int argc, char **argv) { }); static_cast(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length); - static_cast(Backend::global_backends[MLLM_CPU])->switchDecodeTag(); + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); LlmTextGeneratorOpts decoding_opt{ .max_new_tokens = 100, @@ -75,7 +75,7 @@ int main(int argc, char **argv) { decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool { // call only once of switchDecodeTag if (!isSwitched) { - static_cast(Backend::global_backends[MLLM_CPU])->switchDecodeTag(); + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); isSwitched = true; } auto out_string = 
tokenizer.detokenize({out_token}); diff --git a/scripts/build_android_app.sh b/scripts/build_android_app.sh index f11bab92..f2211c6f 100755 --- a/scripts/build_android_app.sh +++ b/scripts/build_android_app.sh @@ -21,9 +21,9 @@ cmake .. \ make mllm_lib -j16 -# 2. copy libs -cp ./libmllm_lib.a ../android/app/src/main/cpp/libs/ +# # 2. copy libs +# cp ./libmllm_lib.a ../android/app/src/main/cpp/libs/ -# 3. build android apk -cd ../android || exit -./gradlew assembleDebug \ No newline at end of file +# # 3. build android apk +# cd ../android || exit +# ./gradlew assembleDebug \ No newline at end of file diff --git a/scripts/build_qnn_android.sh b/scripts/build_qnn_android.sh index f772de80..4daff80d 100755 --- a/scripts/build_qnn_android.sh +++ b/scripts/build_qnn_android.sh @@ -16,4 +16,4 @@ cmake .. \ -DQNN_VALIDATE_NODE=ON \ -DMLLM_BUILD_XNNPACK_BACKEND=OFF -make -j16 +make -j40 diff --git a/src/Layer.hpp b/src/Layer.hpp index 7d729b84..5babf22f 100644 --- a/src/Layer.hpp +++ b/src/Layer.hpp @@ -484,6 +484,7 @@ class SlidingWindowMask final : public Layer { class RoPE final : public Layer { public: + RoPE() = default; explicit RoPE(int pose_type, std::string name) { param_["pose_type"] = pose_type; init(std::move(name), OpType::ROPE); @@ -505,10 +506,14 @@ class RoPE final : public Layer { auto ts = run({input}, 1); return ts[0].get(); } + void clearCache() { + return op_->clearCache(); + } }; class IRoPE final : public Layer { public: + IRoPE() = default; explicit IRoPE(int pose_type, std::string name) { param_["pose_type"] = pose_type; init(std::move(name), OpType::IROPE); @@ -530,6 +535,9 @@ class IRoPE final : public Layer { auto ts = run({input}, 1); return ts[0].get(); } + void clearCache() { + return op_->clearCache(); + } }; class KVCache final : public Layer { diff --git a/src/Op.hpp b/src/Op.hpp index 43787971..be04de18 100644 --- a/src/Op.hpp +++ b/src/Op.hpp @@ -121,7 +121,7 @@ class Op { return -1; } virtual void clearCache() { - assert(type_ == OpType::KVCACHE); + assert(type_ == OpType::KVCACHE || type_ == OpType::IROPE || type_ == OpType::ROPE); std::cout << "only for KVCache" << std::endl; } diff --git a/src/backends/cpu/CPUIRoPE.cpp b/src/backends/cpu/CPUIRoPE.cpp index 330b9a6f..fe2a5798 100644 --- a/src/backends/cpu/CPUIRoPE.cpp +++ b/src/backends/cpu/CPUIRoPE.cpp @@ -124,7 +124,7 @@ ErrorCode CPUIRoPE::reshape(vector> inputs, vectorreshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence(), inputs[0]->dimension()); ishape = inputs[0]->dimension() * partial_rotary_factor_; // pos_max_ = 16384; - if (sin_.empty() || ishape_old < ishape || global_pose_type_ != pose_type_ ) { + if (sin_.empty() || ishape_old < ishape || global_pose_type_ != pose_type_) { global_pose_type_ = pose_type_; ishape_old = ishape; if (pose_type_ == LLAMAROPE) { @@ -145,39 +145,38 @@ ErrorCode CPUIRoPE::reshape(vector> inputs, vector input, shared_ptr output){ +void CPUIRoPE::rope_llama(shared_ptr input, shared_ptr output) { auto out_dtype = output->dtype(); int partial_dimension = (input->dimension()) * partial_rotary_factor_; #pragma omp parallel for collapse(4) num_threads(thread_count) for (int n = 0; n < input->batch(); ++n) { for (int h = 0; h < input->head(); ++h) { for (int s = 0; s < input->sequence(); ++s) { // sequance - for (int d = 0; d < partial_dimension; d+=2) { + for (int d = 0; d < partial_dimension; d += 2) { float in_value = input->dataAt(n, h, s, d); float in_value_2 = input->dataAt(n, h, s, d + 1); - float sin_value = static_cast(sin_[s + h_cnt_][d])/127 * sin_max; - 
float cos_value = static_cast(cos_[s + h_cnt_][d])/127 * cos_max; + float sin_value = static_cast(sin_[s + h_cnt_][d]) / 127 * sin_max; + float cos_value = static_cast(cos_[s + h_cnt_][d]) / 127 * cos_max; auto value = in_value * cos_value - in_value_2 * sin_value; auto value2 = in_value * sin_value + in_value_2 * cos_value; if (out_dtype == MLLM_TYPE_F32) { output->setDataAt(n, h, s, d, value); - output->setDataAt(n, h, s, d+1, value2); + output->setDataAt(n, h, s, d + 1, value2); } else if (out_dtype == MLLM_TYPE_F16) { output->setDataAt(n, h, s, d, MLLM_FP32_TO_FP16(value)); - output->setDataAt(n, h, s, d+1, MLLM_FP32_TO_FP16(value2)); + output->setDataAt(n, h, s, d + 1, MLLM_FP32_TO_FP16(value2)); } } } } } } -void CPUIRoPE::rope_hf(shared_ptr input, shared_ptr output){ +void CPUIRoPE::rope_hf(shared_ptr input, shared_ptr output) { auto out_dtype = output->dtype(); int partial_dimension = (input->dimension()) * partial_rotary_factor_; int half = (int)(partial_dimension / 2); - assert(partial_dimension%2==0); - if(output->ctype() == BSHD){ + assert(partial_dimension % 2 == 0); + if (output->ctype() == BSHD) { if (input->dtype() == MLLM_TYPE_F16) { #pragma omp parallel for collapse(4) num_threads(thread_count) for (int n = 0; n < input->batch(); ++n) { @@ -188,9 +187,9 @@ void CPUIRoPE::rope_hf(shared_ptr input, shared_ptr output){ auto o = output->ptrAt(n, h, s, d); float in_value = static_cast(v[0]); float in_value_2 = static_cast(v[half]); - float sin_value = static_cast(sin_[s + h_cnt_][d])/127 * sin_max; + float sin_value = static_cast(sin_[s + h_cnt_][d]) / 127 * sin_max; auto c = static_cast(cos_[s + h_cnt_][d]); - float cos_value = c/127 * cos_max; + float cos_value = c / 127 * cos_max; auto value = in_value * cos_value - in_value_2 * sin_value; auto value2 = in_value * sin_value + in_value_2 * cos_value; o[0] = MLLM_FP32_TO_FP16(value); @@ -200,20 +199,19 @@ void CPUIRoPE::rope_hf(shared_ptr input, shared_ptr output){ } } - } else - if (out_dtype == MLLM_TYPE_F32){ + } else if (out_dtype == MLLM_TYPE_F32) { #pragma omp parallel for collapse(4) num_threads(thread_count) for (int n = 0; n < input->batch(); ++n) { for (int h = 0; h < input->head(); ++h) { for (int s = 0; s < input->sequence(); ++s) { // sequance - for (int d = 0; d < partial_dimension/2; ++d) { + for (int d = 0; d < partial_dimension / 2; ++d) { auto v = input->ptrAt(n, h, s, d); auto o = output->ptrAt(n, h, s, d); float in_value = v[0]; float in_value_2 = v[half]; - float sin_value = static_cast(sin_[s + h_cnt_][d])/127 * sin_max; + float sin_value = static_cast(sin_[s + h_cnt_][d]) / 127 * sin_max; auto c = static_cast(cos_[s + h_cnt_][d]); - float cos_value = c/127 * cos_max; + float cos_value = c / 127 * cos_max; auto value = in_value * cos_value - in_value_2 * sin_value; auto value2 = in_value * sin_value + in_value_2 * cos_value; o[0] = value; @@ -222,18 +220,18 @@ void CPUIRoPE::rope_hf(shared_ptr input, shared_ptr output){ } } } - }else if(out_dtype == MLLM_TYPE_F16){ + } else if (out_dtype == MLLM_TYPE_F16) { #pragma omp parallel for collapse(4) num_threads(thread_count) for (int n = 0; n < input->batch(); ++n) { for (int h = 0; h < input->head(); ++h) { for (int s = 0; s < input->sequence(); ++s) { // sequance - for (int d = 0; d < partial_dimension/2; ++d) { + for (int d = 0; d < partial_dimension / 2; ++d) { auto v = input->ptrAt(n, h, s, d); auto o = output->ptrAt(n, h, s, d); float in_value = v[0]; float in_value_2 = v[half]; float sin_value = static_cast(sin_[s + h_cnt_][d]) / 127 * sin_max; - float 
cos_value = static_cast(cos_[s + h_cnt_][d])/127 * cos_max; + float cos_value = static_cast(cos_[s + h_cnt_][d]) / 127 * cos_max; auto value = in_value * cos_value - in_value_2 * sin_value; auto value2 = in_value * sin_value + in_value_2 * cos_value; o[0] = MLLM_FP32_TO_FP16(value); @@ -249,7 +247,7 @@ void CPUIRoPE::rope_hf(shared_ptr input, shared_ptr output){ for (int n = 0; n < input->batch(); ++n) { for (int h = 0; h < input->head(); ++h) { for (int s = 0; s < input->sequence(); ++s) { // sequance - for (int d = 0; d < partial_dimension/2; ++d) { + for (int d = 0; d < partial_dimension / 2; ++d) { if (input->dtype() == MLLM_TYPE_F16) { float in_value = static_cast(input->dataAt(n, h, s, d)); float in_value_2 = static_cast(input->dataAt(n, h, s, d + partial_dimension / 2)); @@ -268,16 +266,16 @@ void CPUIRoPE::rope_hf(shared_ptr input, shared_ptr output){ } else { float in_value = input->dataAt(n, h, s, d); float in_value_2 = input->dataAt(n, h, s, d + partial_dimension / 2); - float sin_value = static_cast(sin_[s + h_cnt_][d])/127 * sin_max; - float cos_value = static_cast(cos_[s + h_cnt_][d])/127 * cos_max; + float sin_value = static_cast(sin_[s + h_cnt_][d]) / 127 * sin_max; + float cos_value = static_cast(cos_[s + h_cnt_][d]) / 127 * cos_max; auto value = in_value * cos_value - in_value_2 * sin_value; auto value2 = in_value * sin_value + in_value_2 * cos_value; if (out_dtype == MLLM_TYPE_F32) { output->setDataAt(n, h, s, d, value); - output->setDataAt(n, h, s, d+ partial_dimension / 2, value2); + output->setDataAt(n, h, s, d + partial_dimension / 2, value2); } else if (out_dtype == MLLM_TYPE_F16) { output->setDataAt(n, h, s, d, MLLM_FP32_TO_FP16(value)); - output->setDataAt(n, h, s, d+ partial_dimension / 2, MLLM_FP32_TO_FP16(value2)); + output->setDataAt(n, h, s, d + partial_dimension / 2, MLLM_FP32_TO_FP16(value2)); } } } @@ -285,7 +283,7 @@ void CPUIRoPE::rope_hf(shared_ptr input, shared_ptr output){ } } } -void CPUIRoPE::rope_permission(shared_ptr input, shared_ptr output){ +void CPUIRoPE::rope_permission(shared_ptr input, shared_ptr output) { auto out_dtype = output->dtype(); int partial_dimension = (input->dimension()) * partial_rotary_factor_; #pragma omp parallel for collapse(4) num_threads(thread_count) @@ -293,10 +291,10 @@ void CPUIRoPE::rope_permission(shared_ptr input, shared_ptr outp for (int h = 0; h < input->head(); ++h) { for (int s = 0; s < input->sequence(); ++s) { // sequance for (int d = 0; d < partial_dimension; ++d) { - float in_value = input->dataAt(n, h, s, d); + float in_value = input->dataAt(n, h, s, d); float in_value_2; - float sin_value = static_cast(sin_[s + h_cnt_][d])/127 * sin_max; - float cos_value = static_cast(cos_[s + h_cnt_][d])/127 * cos_max; + float sin_value = static_cast(sin_[s + h_cnt_][d]) / 127 * sin_max; + float cos_value = static_cast(cos_[s + h_cnt_][d]) / 127 * cos_max; if (d < partial_dimension / 4) { in_value_2 = -input->dataAt(n, h, s, d + partial_dimension / 4); auto value = in_value * cos_value + in_value_2 * sin_value; @@ -325,7 +323,7 @@ void CPUIRoPE::rope_permission(shared_ptr input, shared_ptr outp } } } -void CPUIRoPE::rope_mla(shared_ptr input, shared_ptr output){ +void CPUIRoPE::rope_mla(shared_ptr input, shared_ptr output) { auto out_dtype = output->dtype(); int partial_dimension = (input->dimension()) * partial_rotary_factor_; #pragma omp parallel for collapse(4) num_threads(thread_count) @@ -338,17 +336,17 @@ void CPUIRoPE::rope_mla(shared_ptr input, shared_ptr output){ if (d < half_dim) { in_value = input->dataAt(n, 
h, s, d * 2); } else { - in_value = input->dataAt(n, h, s, 2 *(d - half_dim)+1); + in_value = input->dataAt(n, h, s, 2 * (d - half_dim) + 1); } float in_value_2; if (d < half_dim) { - in_value_2 = -input->dataAt(n, h, s, 2 *d+1); + in_value_2 = -input->dataAt(n, h, s, 2 * d + 1); } else { - in_value_2 = input->dataAt(n, h, s, 2 *(d - half_dim)); + in_value_2 = input->dataAt(n, h, s, 2 * (d - half_dim)); } // no change - float sin_value = static_cast(sin_[s + h_cnt_][d])/127 * sin_max; - float cos_value = static_cast(cos_[s + h_cnt_][d])/127 * cos_max; + float sin_value = static_cast(sin_[s + h_cnt_][d]) / 127 * sin_max; + float cos_value = static_cast(cos_[s + h_cnt_][d]) / 127 * cos_max; auto value = in_value * cos_value + in_value_2 * sin_value; if (out_dtype == MLLM_TYPE_F32) { output->setDataAt(n, h, s, d, value); @@ -486,7 +484,7 @@ ErrorCode CPUIRoPE::execute(vector> inputs, vectorsequence(); - if (h_cnt_ > pos_max_) { + if (h_cnt_ >= pos_max_) { h_cnt_ = 0; } diff --git a/src/backends/cpu/CPUIRoPE.hpp b/src/backends/cpu/CPUIRoPE.hpp index 2e8aee56..bd872fec 100644 --- a/src/backends/cpu/CPUIRoPE.hpp +++ b/src/backends/cpu/CPUIRoPE.hpp @@ -35,11 +35,13 @@ class CPUIRoPE final : public Op { int thread_count = 4; float partial_rotary_factor_ = 1; - void rope_llama(shared_ptr input, shared_ptr output); void rope_hf(shared_ptr input, shared_ptr output); void rope_permission(shared_ptr input, shared_ptr output); void rope_mla(shared_ptr input, shared_ptr output); + void clearCache() override { + h_cnt_ = 0; + } }; class CPUIRoPECreator : public CPUBackend::Creator { diff --git a/src/backends/cpu/CPURoPE.cpp b/src/backends/cpu/CPURoPE.cpp index 2235dad6..919df5a7 100644 --- a/src/backends/cpu/CPURoPE.cpp +++ b/src/backends/cpu/CPURoPE.cpp @@ -447,7 +447,7 @@ ErrorCode CPURoPE::execute(vector> inputs, vectorsequence(); - if (h_cnt_ > pos_max_) { + if (h_cnt_ >= pos_max_) { h_cnt_ = 0; } diff --git a/src/backends/cpu/CPURoPE.hpp b/src/backends/cpu/CPURoPE.hpp index 058a2746..57ee958d 100644 --- a/src/backends/cpu/CPURoPE.hpp +++ b/src/backends/cpu/CPURoPE.hpp @@ -33,11 +33,13 @@ class CPURoPE final : public Op { int thread_count = 4; float partial_rotary_factor_ = 1; - void rope_llama(shared_ptr input, shared_ptr output); void rope_hf(shared_ptr input, shared_ptr output); void rope_permission(shared_ptr input, shared_ptr output); void rope_mla(shared_ptr input, shared_ptr output); + void clearCache() override { + h_cnt_ = 0; + } }; class CPURoPECreator : public CPUBackend::Creator { diff --git a/src/backends/xnnpack/third_party/fmt b/src/backends/xnnpack/third_party/fmt deleted file mode 160000 index 0c9fce2f..00000000 --- a/src/backends/xnnpack/third_party/fmt +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 0c9fce2ffefecfdce794e1859584e25877b7b592 diff --git a/src/models/dclm/modeling_dclm.hpp b/src/models/dclm/modeling_dclm.hpp index 82508199..266e3871 100644 --- a/src/models/dclm/modeling_dclm.hpp +++ b/src/models/dclm/modeling_dclm.hpp @@ -53,8 +53,8 @@ class DCLMAttention final : public Module { Layer out_proj; Layer q_norm; Layer k_norm; - Layer q_rope; - Layer k_rope; + RoPE q_rope; + RoPE k_rope; KVCache k_cache; KVCache v_cache; Layer softmax; diff --git a/src/models/fuyu/modeling_fuyu.hpp b/src/models/fuyu/modeling_fuyu.hpp index 5bfe6575..14065bde 100644 --- a/src/models/fuyu/modeling_fuyu.hpp +++ b/src/models/fuyu/modeling_fuyu.hpp @@ -67,8 +67,10 @@ class Persimmon final : public Module { } void clear_kvcache() override { for (auto &block : blocks) { - auto kvcahce = 
block.get_attention().get_cache(); - for (auto &cache : kvcahce) { cache->clearCache(); } + auto kvcache = block.get_attention().get_cache(); + for (auto &cache : kvcache) { cache->clearCache(); } + auto ropes = block.get_attention().get_rope(); + for (auto &rope : ropes) { rope->clearCache(); } } } }; diff --git a/src/models/llama/modeling_elastic_llama.hpp b/src/models/llama/modeling_elastic_llama.hpp index 84abda6f..a43af045 100644 --- a/src/models/llama/modeling_elastic_llama.hpp +++ b/src/models/llama/modeling_elastic_llama.hpp @@ -17,8 +17,8 @@ class ElasticMultiHeadAttention final : public Module { ElasticLinear q_proj; ElasticLinear k_proj; ElasticLinear v_proj; - Layer q_rope; - Layer k_rope; + RoPE q_rope; + RoPE k_rope; KVCache k_cache; KVCache v_cache; Softmax softmax; @@ -86,6 +86,9 @@ class ElasticMultiHeadAttention final : public Module { vector get_cache() { return {&k_cache, &v_cache}; } + vector get_rope() { + return {&q_rope, &k_rope}; + } }; class ElasticLLaMAMLP final : public Module { @@ -180,10 +183,10 @@ class ElasticLLaMAModel final : public Module { void clear_kvcache() override { for (auto &block : blocks) { - auto kvcahce = block.get_attention().get_cache(); - for (auto &cache : kvcahce) { - cache->clearCache(); - } + auto kvcache = block.get_attention().get_cache(); + for (auto &cache : kvcache) { cache->clearCache(); } + auto ropes = block.get_attention().get_rope(); + for (auto &rope : ropes) { rope->clearCache(); } } } }; diff --git a/src/models/llama/modeling_llama.hpp b/src/models/llama/modeling_llama.hpp index 31d917e0..5fccd18e 100644 --- a/src/models/llama/modeling_llama.hpp +++ b/src/models/llama/modeling_llama.hpp @@ -97,10 +97,10 @@ class LLaMAModel final : public Module { void clear_kvcache() override { for (auto &block : blocks) { - auto kvcahce = block.get_attention().get_cache(); - for (auto &cache : kvcahce) { - cache->clearCache(); - } + auto kvcache = block.get_attention().get_cache(); + for (auto &cache : kvcache) { cache->clearCache(); } + auto ropes = block.get_attention().get_rope(); + for (auto &rope : ropes) { rope->clearCache(); } } } }; diff --git a/src/models/minicpm/modeling_minicpm.hpp b/src/models/minicpm/modeling_minicpm.hpp index 42145856..163bc0b4 100644 --- a/src/models/minicpm/modeling_minicpm.hpp +++ b/src/models/minicpm/modeling_minicpm.hpp @@ -94,10 +94,10 @@ class MiniCPMModel final : public Module { void clear_kvcache() override { for (auto &block : blocks) { - auto kvcahce = block.get_attention().get_cache(); - for (auto &cache : kvcahce) { - cache->clearCache(); - } + auto kvcache = block.get_attention().get_cache(); + for (auto &cache : kvcache) { cache->clearCache(); } + auto ropes = block.get_attention().get_rope(); + for (auto &rope : ropes) { rope->clearCache(); } } } diff --git a/src/models/openelm/modeling_openelm.hpp b/src/models/openelm/modeling_openelm.hpp index 1d798b8e..2cc1a476 100644 --- a/src/models/openelm/modeling_openelm.hpp +++ b/src/models/openelm/modeling_openelm.hpp @@ -39,8 +39,8 @@ class OpenELMMultiHeadCausalAttention final : public Module { int v_heads_; Layer qkv_proj; - Layer q_rope; - Layer k_rope; + RoPE q_rope; + RoPE k_rope; Layer q_norm; Layer k_norm; Layer out_proj; diff --git a/src/models/opt/modeling_opt_qnn.hpp b/src/models/opt/modeling_opt_qnn.hpp index 101ddac7..e35e3093 100644 --- a/src/models/opt/modeling_opt_qnn.hpp +++ b/src/models/opt/modeling_opt_qnn.hpp @@ -16,8 +16,8 @@ class OPTEncoderBlockPart1 final : public Module { Layer q_proj; Layer k_proj; Layer v_proj; - Layer 
q_rope; - Layer k_rope; + RoPE q_rope; + RoPE k_rope; Layer norm1; int head_size_{}; int kv_head_size_{}; @@ -74,11 +74,11 @@ class OPTQKVmm final : public Module { const OPTNameConfig &names, const string &base_name) { attn_hidden_dim_ = attn_hidden_dim; head_size_ = head_size; - if(cache_limit>0){ + if (cache_limit > 0) { k_cache = KVCache(head_size / kv_head_size, cache_limit, false, base_name + names._attn_base_name + "k_cache"); v_cache = KVCache(head_size / kv_head_size, cache_limit, false, base_name + names._attn_base_name + "v_cache"); } - + if (do_mask) { mask = Causalmask(base_name + "mask"); } diff --git a/src/models/opt/tokenization_opt.hpp b/src/models/opt/tokenization_opt.hpp index 363906e2..1900d899 100644 --- a/src/models/opt/tokenization_opt.hpp +++ b/src/models/opt/tokenization_opt.hpp @@ -26,7 +26,7 @@ class OPTTokenizer final : public BPETokenizer { } Tensor tokenize(const std::string &text, string name = "input", BackendType type = MLLM_CPU) override { - string new_text; + string new_text = text; if (text[0] != ' ') { new_text = ' ' + text; } diff --git a/src/models/phi3/modeling_phi3.hpp b/src/models/phi3/modeling_phi3.hpp index 3016c084..f5f9ea9b 100644 --- a/src/models/phi3/modeling_phi3.hpp +++ b/src/models/phi3/modeling_phi3.hpp @@ -97,10 +97,10 @@ class Phi3Model final : public Module { void clear_kvcache() override { for (auto &block : blocks) { - auto kvcahce = block.get_attention().get_cache(); - for (auto &cache : kvcahce) { - cache->clearCache(); - } + auto kvcache = block.get_attention().get_cache(); + for (auto &cache : kvcache) { cache->clearCache(); } + auto ropes = block.get_attention().get_rope(); + for (auto &rope : ropes) { rope->clearCache(); } } } }; diff --git a/src/models/phonelm/modeling_phonelm.hpp b/src/models/phonelm/modeling_phonelm.hpp index 69bbecda..151ba193 100644 --- a/src/models/phonelm/modeling_phonelm.hpp +++ b/src/models/phonelm/modeling_phonelm.hpp @@ -98,10 +98,12 @@ class PhoneLMAttention final : public Module { return {o}; } - vector - get_cache() { + vector get_cache() { return {&k_cache, &v_cache}; } + vector get_rope() { + return {&q_rope, &k_rope}; + } private: int hidden_size; @@ -113,8 +115,8 @@ class PhoneLMAttention final : public Module { Layer k_proj; Layer v_proj; Layer o_proj; - Layer q_rope; - Layer k_rope; + IRoPE q_rope; + IRoPE k_rope; KVCache k_cache; KVCache v_cache; Softmax softmax; @@ -170,8 +172,10 @@ class PhoneLMModel final : public Module { void clear_kvcache() override { for (auto &block : blocks) { - auto kvcahce = block.get_attention().get_cache(); - for (auto &cache : kvcahce) { cache->clearCache(); } + auto kvcache = block.get_attention().get_cache(); + for (auto &cache : kvcache) { cache->clearCache(); } + auto ropes = block.get_attention().get_rope(); + for (auto &rope : ropes) { rope->clearCache(); } } } diff --git a/src/models/phonelm/modeling_phonelm_npu.hpp b/src/models/phonelm/modeling_phonelm_npu.hpp index 0b03cb1d..27f44fb6 100644 --- a/src/models/phonelm/modeling_phonelm_npu.hpp +++ b/src/models/phonelm/modeling_phonelm_npu.hpp @@ -82,8 +82,8 @@ class PhoneLMDecoderNPUPart1 final : public Module { // CPU QKV MM part class PhoneLMQKVmm final : public Module { - Layer q_rope; - Layer k_rope; + IRoPE q_rope; + IRoPE k_rope; KVCache k_cache; KVCache v_cache; Softmax softmax; @@ -135,6 +135,12 @@ class PhoneLMQKVmm final : public Module { return {o}; } + vector get_cache() { + return {&k_cache, &v_cache}; + } + vector get_rope() { + return {&q_rope, &k_rope}; + } }; // QNN mlp part @@ -415,6 
+421,9 @@ class PhoneLMNPU_CPUDecoder final : public Module { return {x}; } + PhoneLMQKVmm &get_mm() { + return qkv_mm; + } }; class PhoneLMNPU_CPUDecoderWithShadow final : public Module { @@ -483,6 +492,9 @@ class PhoneLMNPU_CPUDecoderWithShadow final : public Module { return {x}; } + PhoneLMQKVmm &get_mm() { + return qkv_mm; + } }; // Copied from GemmaModel with Gemma->PhoneLM and set RmsNorm(without add_unit_offset) @@ -525,6 +537,18 @@ class PhoneLMModel_NPU final : public Module { return {x}; } + void clear_kvcache() override { + for (auto &block : blocks) { + auto decoder = dynamic_cast(block.get()); + if (decoder) { + auto kvcache = decoder->get_mm().get_cache(); + for (auto &cache : kvcache) { cache->clearCache(); } + auto ropes = decoder->get_mm().get_rope(); + for (auto &rope : ropes) { rope->clearCache(); } + } + } + } + private: std::vector> blocks; Layer norm; @@ -588,6 +612,9 @@ class PhoneLMForCausalLM_NPU final : public Module { return; } } + void clear_kvcache() override { + model.clear_kvcache(); + } private: int hidden_size; diff --git a/src/models/qwen/modeling_qwen.hpp b/src/models/qwen/modeling_qwen.hpp index f2b61aaa..f6ca72e9 100644 --- a/src/models/qwen/modeling_qwen.hpp +++ b/src/models/qwen/modeling_qwen.hpp @@ -115,6 +115,9 @@ class QWenAttention final : public Module { vector get_cache() { return {&k_cache, &v_cache}; } + vector get_rope() { + return {&q_rope, &k_rope}; + } private: int hidden_size; @@ -126,8 +129,8 @@ class QWenAttention final : public Module { Layer k_proj; Layer v_proj; Layer o_proj; - Layer q_rope; - Layer k_rope; + RoPE q_rope; + RoPE k_rope; KVCache k_cache; KVCache v_cache; Causalmask mask; @@ -187,8 +190,10 @@ class QWenModel final : public Module { void clear_kvcache() override { for (auto &block : blocks) { - auto kvcahce = block.get_attention().get_cache(); - for (auto &cache : kvcahce) { cache->clearCache(); } + auto kvcache = block.get_attention().get_cache(); + for (auto &cache : kvcache) { cache->clearCache(); } + auto ropes = block.get_attention().get_rope(); + for (auto &rope : ropes) { rope->clearCache(); } } } diff --git a/src/models/qwen/modeling_qwen_npu.hpp b/src/models/qwen/modeling_qwen_npu.hpp index c96a765b..c051170a 100644 --- a/src/models/qwen/modeling_qwen_npu.hpp +++ b/src/models/qwen/modeling_qwen_npu.hpp @@ -82,8 +82,8 @@ class QwenDecoderNPUPart1 final : public Module { // CPU QKV MM part class QwenQKVmm final : public Module { - Layer q_rope; - Layer k_rope; + RoPE q_rope; + RoPE k_rope; KVCache k_cache; KVCache v_cache; Softmax softmax; diff --git a/src/models/qwen/modeling_qwen_xp_sdpa.hpp b/src/models/qwen/modeling_qwen_xp_sdpa.hpp index e9225d3a..dfb28151 100644 --- a/src/models/qwen/modeling_qwen_xp_sdpa.hpp +++ b/src/models/qwen/modeling_qwen_xp_sdpa.hpp @@ -21,8 +21,8 @@ class XpDecoderSeperatedPart_1_Xnn : public Module { Layer q_proj; Layer k_proj; Layer v_proj; - Layer q_rope; - Layer k_rope; + RoPE q_rope; + RoPE k_rope; Layer input_layernorm; int hidden_size = 0; diff --git a/src/models/smollm/modeling_smollm.hpp b/src/models/smollm/modeling_smollm.hpp index 99febfe5..86d8d6ca 100644 --- a/src/models/smollm/modeling_smollm.hpp +++ b/src/models/smollm/modeling_smollm.hpp @@ -104,10 +104,10 @@ class SmolLMModel final : public Module { void clear_kvcache() override { for (auto &block : blocks) { - auto kvcahce = block.get_attention().get_cache(); - for (auto &cache : kvcahce) { - cache->clearCache(); - } + auto kvcache = block.get_attention().get_cache(); + for (auto &cache : kvcache) { 
cache->clearCache(); } + auto ropes = block.get_attention().get_rope(); + for (auto &rope : ropes) { rope->clearCache(); } } } }; diff --git a/src/models/stablelm/modeling_stablelm.hpp b/src/models/stablelm/modeling_stablelm.hpp index 50737a2a..9e7c0703 100644 --- a/src/models/stablelm/modeling_stablelm.hpp +++ b/src/models/stablelm/modeling_stablelm.hpp @@ -12,8 +12,8 @@ class StableLMMultiHeadAttention final : public Module { Layer q_proj; Layer k_proj; Layer v_proj; - Layer q_rope; - Layer k_rope; + RoPE q_rope; + RoPE k_rope; KVCache k_cache; KVCache v_cache; Softmax softmax; @@ -26,8 +26,8 @@ class StableLMMultiHeadAttention final : public Module { public: StableLMMultiHeadAttention() = default; StableLMMultiHeadAttention(int hidden_dim, int head_size, int kv_head_size, int attn_hidden_dim, - RoPEType RoPE_type, int cache_limit, bool do_mask, bool bias, - const TransformerNameConfig &names, const string &base_name) { + RoPEType RoPE_type, int cache_limit, bool do_mask, bool bias, + const TransformerNameConfig &names, const string &base_name) { attn_hidden_dim_ = attn_hidden_dim; head_size_ = head_size; kv_head_size_ = kv_head_size; @@ -44,7 +44,6 @@ class StableLMMultiHeadAttention final : public Module { } softmax = Softmax(DIMENSION, do_mask, base_name + "softmax"); o_proj = Linear(head_size * attn_hidden_dim, hidden_dim, false, base_name + names._o_proj_name); - } vector Forward(vector inputs, vector args) override { Tensor q, k, v; @@ -107,7 +106,7 @@ class StableLMBlock final : public Module { StableLMBlock() = default; StableLMBlock(int hidden_dim, int head_size, int ffn_hidden, RoPEType RoPE_type, int cache_limit, const stablelmNameConfig &names, const string &base_name) { attention = StableLMMultiHeadAttention(hidden_dim, head_size, head_size, hidden_dim / head_size, - RoPE_type, cache_limit, true, true, names, base_name + names._attn_base_name); + RoPE_type, cache_limit, true, true, names, base_name + names._attn_base_name); mlp = StableLMMLP(hidden_dim, ffn_hidden, names, base_name + names._ffn_base_name); norm1 = LayerNorm(hidden_dim, true, 1e-5, base_name + names._attn_norm_name); norm2 = LayerNorm(hidden_dim, true, 1e-5, base_name + names._ffn_norm_name); diff --git a/src/models/transformer/modeling_transformer.hpp b/src/models/transformer/modeling_transformer.hpp index a4a05040..61244611 100644 --- a/src/models/transformer/modeling_transformer.hpp +++ b/src/models/transformer/modeling_transformer.hpp @@ -17,8 +17,8 @@ class MultiHeadAttention final : public Module { Layer q_proj; Layer k_proj; Layer v_proj; - Layer q_rope; - Layer k_rope; + RoPE q_rope; + RoPE k_rope; Layer q_norm; Layer k_norm; KVCache k_cache; @@ -117,6 +117,9 @@ class MultiHeadAttention final : public Module { vector get_cache() { return {&k_cache, &v_cache}; } + vector get_rope() { + return {&q_rope, &k_rope}; + } }; class FeedForward final : public Module { diff --git a/test/xnnpack/XpLlamaMHATest.cpp b/test/xnnpack/XpLlamaMHATest.cpp index e056203e..540f441a 100644 --- a/test/xnnpack/XpLlamaMHATest.cpp +++ b/test/xnnpack/XpLlamaMHATest.cpp @@ -19,8 +19,8 @@ class XpLLaMAMHA final : public Module { Layer q_proj; Layer k_proj; Layer v_proj; - Layer q_rope; - Layer k_rope; + RoPE q_rope; + RoPE k_rope; Layer k_cache; Layer v_cache; Layer o_proj; diff --git a/tools/jni/LibHelper.cpp b/tools/jni/LibHelper.cpp index 16d0d67e..0964ed50 100644 --- a/tools/jni/LibHelper.cpp +++ b/tools/jni/LibHelper.cpp @@ -137,7 +137,7 @@ void LibHelper::run(std::string &input_str, uint8_t *image, unsigned max_step, u 
return true; }); static_cast(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length); - static_cast(Backend::global_backends[MLLM_CPU])->switchDecodeTag(); + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); opt = LlmTextGeneratorOpts{ .max_new_tokens = max_new_tokens, .do_sample = false, @@ -150,7 +150,7 @@ void LibHelper::run(std::string &input_str, uint8_t *image, unsigned max_step, u bool isSwitched = false; module_->generate(input_tensor, opt, [&](unsigned int out_token) -> bool { if (!isSwitched && backend_ == MLLMBackendType::QNN) { - static_cast(Backend::global_backends[MLLM_CPU])->switchDecodeTag(); + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); isSwitched = true; } auto out_token_string = tokenizer_->detokenize({out_token}); @@ -206,7 +206,7 @@ void LibHelper::run(std::string &input_str, uint8_t *image, unsigned max_step, u return true; }); static_cast(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length); - static_cast(Backend::global_backends[MLLM_CPU])->switchDecodeTag(); + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); opt = LlmTextGeneratorOpts{ .max_new_tokens = max_new_tokens, @@ -220,7 +220,7 @@ void LibHelper::run(std::string &input_str, uint8_t *image, unsigned max_step, u bool isSwitched = false; module_->generate(input_tensor, opt, [&](unsigned int out_token) -> bool { if (!isSwitched && backend_ == MLLMBackendType::QNN) { - static_cast(Backend::global_backends[MLLM_CPU])->switchDecodeTag(); + static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); isSwitched = true; } auto out_token_string = tokenizer_->detokenize({out_token}); From 68ddb997d4a6e7893302ea14702631e31260d772 Mon Sep 17 00:00:00 2001 From: yirongjie Date: Thu, 14 Nov 2024 10:57:26 +0000 Subject: [PATCH 4/4] fix: genarate with Padding --- examples/demo_phonelm_npu.cpp | 34 ++++++---------- examples/demo_qwen2.5_npu.cpp | 4 +- examples/demo_qwen_npu.cpp | 4 +- scripts/build_android_app.sh | 10 ++--- scripts/run_phonelm_qnn.sh | 44 +++++++++++++++++++++ src/Generate.hpp | 8 ++++ src/Op.hpp | 4 +- src/backends/cpu/CPUBackend.hpp | 8 ++-- src/models/phonelm/modeling_phonelm_npu.hpp | 29 +------------- src/models/qwen/modeling_qwen_npu.hpp | 2 +- 10 files changed, 79 insertions(+), 68 deletions(-) create mode 100755 scripts/run_phonelm_qnn.sh diff --git a/examples/demo_phonelm_npu.cpp b/examples/demo_phonelm_npu.cpp index 6607585f..e4112bec 100644 --- a/examples/demo_phonelm_npu.cpp +++ b/examples/demo_phonelm_npu.cpp @@ -1,7 +1,6 @@ #include "Module.hpp" #include "Types.hpp" #include -#ifdef USE_QNN #include "backends/cpu/CPUBackend.hpp" #include "cmdline.h" #include "models/phonelm/modeling_phonelm.hpp" @@ -15,7 +14,7 @@ int main(int argc, char **argv) { cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/phonelm_vocab.mllm"); cmdParser.add("merge", 'e', "specify mllm merge file path", false, "../vocab/phonelm_merges.txt"); cmdParser.add("model", 'm', "specify mllm model path", false, "../models/PhoneLM-1.5B-Instruct-128.mllm"); - cmdParser.add("decoding", 'd', "specify mllm decoding model path", false, "../models/phonelm-1.5b-droidcall-q4_0_4_4.mllm"); + cmdParser.add("decoding", 'd', "specify mllm decoding model path", false, "../models/phonelm-1.5b-instruct-q4_0_4_4.mllm"); cmdParser.add("limits", 'l', "max KV cache size", false, 400); cmdParser.add("thread", 't', "num of threads", false, 4); cmdParser.add("chunk", 'c', "chunk size", false, 64); @@ -40,6 +39,9 @@ int 
main(int argc, char **argv) { "Give me a short introduction to large language model.", "What is the Beijing University of Posts and Telecommunications.", "What is the meaning of life?", + "Hello, who are you?", + "What can you do?", + "Please introduce Beijing University of Posts and Telecommunications.", }; // turn on the multi-chunk prefilling @@ -48,13 +50,10 @@ int main(int argc, char **argv) { for (int i = 0; i < in_strs.size(); ++i) { auto input_str = tokenizer.apply_chat_template(in_strs[i]); auto [real_seq_length, input_tensor] = tokenizer.tokenizePaddingByChunk(input_str, chunk_size, config.vocab_size); - const int seq_length_padding = (chunk_size - real_seq_length % chunk_size) + real_seq_length; const int chunk_num = seq_length_padding / chunk_size; bool isSwitched = false; - - std::cout << "real seq length: " << real_seq_length << " padding to: " << seq_length_padding << " chunk num: " << chunk_num << std::endl; - + // std::cout << "real seq length: " << real_seq_length << " padding to: " << seq_length_padding << " chunk num: " << chunk_num << std::endl; std::cout << "[Q] " << in_strs[i] << std::endl; std::cout << "[A] " << std::flush; @@ -84,13 +83,11 @@ int main(int argc, char **argv) { auto out_string = tokenizer.detokenize({out_token}); auto [not_end, output_string] = tokenizer.postprocess(out_string); if (!not_end) { return false; } - if (chunk_id == chunk_num - 1) { // print the output of the last chunk std::cout << output_string << std::flush; } return true; }); - Module::isFirstChunk = false; } @@ -107,30 +104,23 @@ int main(int argc, char **argv) { .top_p = 0.f, .is_padding = false, }; - isSwitched = false; decoding_model.generate(chunked_tensors.back(), decoding_opt, [&](unsigned int out_token) -> bool { - if (!isSwitched) { - // turn off switching + if (!isSwitched) { // turn off switching static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); isSwitched = true; } auto out_string = tokenizer.detokenize({out_token}); - auto [isOk, print_string] = tokenizer.postprocess(out_string); - if (isOk) { - std::cout << print_string << std::flush; - } else { - return false; - } + auto [not_end, output_string] = tokenizer.postprocess(out_string); + if (!not_end) { return false; } + std::cout << output_string << std::flush; return true; }); - std::cout << "\n---------------" << std::endl; + // turn on switching, set sequence length and execution type static_cast(Backend::global_backends[MLLM_CPU])->setSequenceLength(0); static_cast(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT); static_cast(Backend::global_backends[MLLM_CPU])->toggleSwitching(); - model.clear_kvcache(); - decoding_model.clear_kvcache(); + std::cout << "\n"; } -} -#endif \ No newline at end of file +} \ No newline at end of file diff --git a/examples/demo_qwen2.5_npu.cpp b/examples/demo_qwen2.5_npu.cpp index 77c0356a..af6b2e34 100644 --- a/examples/demo_qwen2.5_npu.cpp +++ b/examples/demo_qwen2.5_npu.cpp @@ -1,4 +1,3 @@ -#ifdef USE_QNN #include "backends/cpu/CPUBackend.hpp" #include "cmdline.h" #include "models/qwen/configuration_qwen.hpp" @@ -89,5 +88,4 @@ int main(int argc, char **argv) { }); std::cout << "\n---------------" << std::endl; } -} -#endif \ No newline at end of file +} \ No newline at end of file diff --git a/examples/demo_qwen_npu.cpp b/examples/demo_qwen_npu.cpp index 2ab916c7..04875c30 100644 --- a/examples/demo_qwen_npu.cpp +++ b/examples/demo_qwen_npu.cpp @@ -1,4 +1,3 @@ -#ifdef USE_QNN #include "backends/cpu/CPUBackend.hpp" #include "cmdline.h" #include 
"models/qwen/configuration_qwen.hpp" @@ -89,5 +88,4 @@ int main(int argc, char **argv) { }); std::cout << "\n---------------" << std::endl; } -} -#endif \ No newline at end of file +} \ No newline at end of file diff --git a/scripts/build_android_app.sh b/scripts/build_android_app.sh index f2211c6f..f11bab92 100755 --- a/scripts/build_android_app.sh +++ b/scripts/build_android_app.sh @@ -21,9 +21,9 @@ cmake .. \ make mllm_lib -j16 -# # 2. copy libs -# cp ./libmllm_lib.a ../android/app/src/main/cpp/libs/ +# 2. copy libs +cp ./libmllm_lib.a ../android/app/src/main/cpp/libs/ -# # 3. build android apk -# cd ../android || exit -# ./gradlew assembleDebug \ No newline at end of file +# 3. build android apk +cd ../android || exit +./gradlew assembleDebug \ No newline at end of file diff --git a/scripts/run_phonelm_qnn.sh b/scripts/run_phonelm_qnn.sh new file mode 100755 index 00000000..a3f814de --- /dev/null +++ b/scripts/run_phonelm_qnn.sh @@ -0,0 +1,44 @@ +!/bin/bash + +adb shell mkdir -p /data/local/tmp/mllm/vocab +adb shell mkdir -p /data/local/tmp/mllm/qnn-lib + +adb push ../vocab/phonelm_vocab.mllm /data/local/tmp/mllm/vocab/ + + +if ! adb shell [ -f "/data/local/tmp/mllm/models/PhoneLM-1.5B-Instruct-128.mllm" ]; then + adb push ../models/PhoneLM-1.5B-Instruct-128.mllm "/data/local/tmp/mllm/models/PhoneLM-1.5B-Instruct-128.mllm" +else + echo "PhoneLM-1.5B-Instruct-128 file already exists" +fi + + +if ! adb shell [ -f "/data/local/tmp/mllm/models//phonelm-1.5b-instruct-q4_0_4_4.mllm" ]; then + adb push ../models//phonelm-1.5b-instruct-q4_0_4_4.mllm "/data/local/tmp/mllm/models//phonelm-1.5b-instruct-q4_0_4_4.mllm" +else + echo "/phonelm-1.5b-instruct-q4_0_4_4.mllm file already exists" +fi + +LIBPATH=../src/backends/qnn/qualcomm_ai_engine_direct_220/ +ANDR_LIB=$LIBPATH/lib/aarch64-android +OP_PATH=../src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/build +DEST=/data/local/tmp/mllm/qnn-lib + +adb push $ANDR_LIB/libQnnHtp.so $DEST +adb push $ANDR_LIB/libQnnHtpV75Stub.so $DEST +adb push $ANDR_LIB/libQnnHtpPrepare.so $DEST +adb push $ANDR_LIB/libQnnHtpProfilingReader.so $DEST +adb push $ANDR_LIB/libQnnHtpOptraceProfilingReader.so $DEST +adb push $ANDR_LIB/libQnnHtpV75CalculatorStub.so $DEST +adb push $LIBPATH/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so $DEST +adb push $OP_PATH/aarch64-android/libQnnLLaMAPackage.so $DEST/libQnnLLaMAPackage_CPU.so +adb push $OP_PATH/hexagon-v75/libQnnLLaMAPackage.so $DEST/libQnnLLaMAPackage_HTP.so + + +if [ $? 
-ne 0 ]; then + echo "adb push failed" + exit 1 +fi + +adb push ../bin-arm/demo_phonelm_npu /data/local/tmp/mllm/bin/ +adb shell "cd /data/local/tmp/mllm/bin && export LD_LIBRARY_PATH=/data/local/tmp/mllm/qnn-lib && export ADSP_LIBRARY_PATH=/data/local/tmp/mllm/qnn-lib && ./demo_phonelm_npu" \ No newline at end of file diff --git a/src/Generate.hpp b/src/Generate.hpp index 4ae9a863..4eb669d8 100644 --- a/src/Generate.hpp +++ b/src/Generate.hpp @@ -53,6 +53,7 @@ class _LlmTextGenerateMethod { bool is_padding = false; int seq_before_padding = 0; int chunk_size = -1; + public: virtual ~_LlmTextGenerateMethod() = default; virtual unsigned int generate(Tensor &t) = 0; @@ -174,6 +175,13 @@ class LlmTextGenerator { return m_method_class->generate(t); } + inline unsigned int generate(Tensor &t, const LlmTextGeneratorOpts &opt) { + if (opt.is_padding) { + m_method_class->setPadding(opt.is_padding, opt.seq_before_padding, opt.chunk_size); + } + return m_method_class->generate(t); + } + inline LLmTextGeneratorType type() { return m_type; } diff --git a/src/Op.hpp b/src/Op.hpp index be04de18..47a4b721 100644 --- a/src/Op.hpp +++ b/src/Op.hpp @@ -116,12 +116,12 @@ class Op { } virtual int getCacheSeqLen() { - assert(type_ == OpType::KVCACHE); + assert(type_ == OpType::KVCACHE || type_ == OpType::KVCACHENPU); std::cout << "only for KVCache" << std::endl; return -1; } virtual void clearCache() { - assert(type_ == OpType::KVCACHE || type_ == OpType::IROPE || type_ == OpType::ROPE); + assert(type_ == OpType::KVCACHE || type_ == OpType::KVCACHENPU || type_ == OpType::IROPE || type_ == OpType::ROPE); std::cout << "only for KVCache" << std::endl; } diff --git a/src/backends/cpu/CPUBackend.hpp b/src/backends/cpu/CPUBackend.hpp index 75f96412..c808db80 100644 --- a/src/backends/cpu/CPUBackend.hpp +++ b/src/backends/cpu/CPUBackend.hpp @@ -32,7 +32,7 @@ class CPUBackend final : public Backend { static int cpu_threads; -#ifdef USE_QNN + // #ifdef USE_QNN void setSequenceLength(int sequence_length) { sequence_length_ = sequence_length; } @@ -51,15 +51,15 @@ class CPUBackend final : public Backend { ExecutionType getExecutionType() { return execution_type; } -#endif + // #endif private: std::map map_creator_; std::map map_function_; -#ifdef USE_QNN + // #ifdef USE_QNN int sequence_length_ = 0; bool isSwitchingStage = false; ExecutionType execution_type = PROMPT; -#endif + // #endif }; } // namespace mllm diff --git a/src/models/phonelm/modeling_phonelm_npu.hpp b/src/models/phonelm/modeling_phonelm_npu.hpp index 27f44fb6..295bdab7 100644 --- a/src/models/phonelm/modeling_phonelm_npu.hpp +++ b/src/models/phonelm/modeling_phonelm_npu.hpp @@ -135,12 +135,6 @@ class PhoneLMQKVmm final : public Module { return {o}; } - vector get_cache() { - return {&k_cache, &v_cache}; - } - vector get_rope() { - return {&q_rope, &k_rope}; - } }; // QNN mlp part @@ -421,9 +415,6 @@ class PhoneLMNPU_CPUDecoder final : public Module { return {x}; } - PhoneLMQKVmm &get_mm() { - return qkv_mm; - } }; class PhoneLMNPU_CPUDecoderWithShadow final : public Module { @@ -492,9 +483,6 @@ class PhoneLMNPU_CPUDecoderWithShadow final : public Module { return {x}; } - PhoneLMQKVmm &get_mm() { - return qkv_mm; - } }; // Copied from GemmaModel with Gemma->PhoneLM and set RmsNorm(without add_unit_offset) @@ -537,18 +525,6 @@ class PhoneLMModel_NPU final : public Module { return {x}; } - void clear_kvcache() override { - for (auto &block : blocks) { - auto decoder = dynamic_cast(block.get()); - if (decoder) { - auto kvcache = decoder->get_mm().get_cache(); 
- for (auto &cache : kvcache) { cache->clearCache(); } - auto ropes = decoder->get_mm().get_rope(); - for (auto &rope : ropes) { rope->clearCache(); } - } - } - } - private: std::vector> blocks; Layer norm; @@ -606,15 +582,12 @@ class PhoneLMForCausalLM_NPU final : public Module { for (int step = 0; step < opt.max_new_tokens; ++step) { auto _out = (*this)({input_ids}); - auto out_token = text_generator_->generate(_out[0]); + auto out_token = text_generator_->generate(_out[0], opt); if (!call_back(out_token)) break; chatPostProcessing(out_token, input_ids, {}); return; } } - void clear_kvcache() override { - model.clear_kvcache(); - } private: int hidden_size; diff --git a/src/models/qwen/modeling_qwen_npu.hpp b/src/models/qwen/modeling_qwen_npu.hpp index c051170a..acc4d7e3 100644 --- a/src/models/qwen/modeling_qwen_npu.hpp +++ b/src/models/qwen/modeling_qwen_npu.hpp @@ -586,7 +586,7 @@ class QWenForCausalLM_NPU final : public Module { for (int step = 0; step < opt.max_new_tokens; ++step) { auto _out = (*this)({input_ids}); - auto out_token = text_generator_->generate(_out[0]); + auto out_token = text_generator_->generate(_out[0], opt); if (!call_back(out_token)) break; chatPostProcessing(out_token, input_ids, {}); return;
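Editor's note on the padded-generation API introduced above: src/Generate.hpp gains a LlmTextGenerator::generate(Tensor &, const LlmTextGeneratorOpts &) overload that forwards is_padding, seq_before_padding and chunk_size to the active generation method via setPadding() before sampling, and the PhoneLM/QWen NPU models now call it from their generate() loops. The sketch below illustrates the intended call pattern only; the prefill_step wrapper, the concrete option values and the include path are illustrative assumptions, not part of the patch.

#include "Generate.hpp" // LlmTextGenerator / LlmTextGeneratorOpts (overload added by this patch)

using namespace mllm; // the demo programs use the mllm types unqualified

// Hypothetical helper: sample the first token from prefill logits computed on a
// chunk-padded prompt. real_seq_length and chunk_size are the values returned by
// tokenizePaddingByChunk() in examples/demo_phonelm_npu.cpp.
unsigned int prefill_step(LlmTextGenerator &generator, Tensor &logits,
                          int real_seq_length, int chunk_size) {
    LlmTextGeneratorOpts opt{};               // value-initialize all options
    opt.max_new_tokens = 1;                   // prefill only needs the next token
    opt.do_sample = false;                    // greedy decoding, as in the demos
    opt.is_padding = true;                    // tell the sampler the prompt was padded
    opt.seq_before_padding = real_seq_length; // where the real prompt ends
    opt.chunk_size = chunk_size;
    // The new overload calls setPadding(is_padding, seq_before_padding, chunk_size)
    // on the generation method and then samples as before.
    return generator.generate(logits, opt);
}

This mirrors what PhoneLMForCausalLM_NPU::generate and QWenForCausalLM_NPU::generate do after this patch: the whole opt is passed through, presumably so the sampler reads the logits at the pre-padding position rather than at the padded tail.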