feat: QNN Multi Chunk Execution in New Frontend (#191)
* dev: qnn multi input inference developing

* feat: qnn multi chunk prefilling in new frontend
refactor: qnn module setup skip in following chunks
todo: multi input

* fix: clearCache in RoPE

* fix: generate with Padding

---------

Co-authored-by: Rongjie Yi <[email protected]>
Co-authored-by: yirongjie <[email protected]>
3 people authored Nov 14, 2024
1 parent d279e01 commit 11a2fb2
Showing 41 changed files with 386 additions and 178 deletions.
93 changes: 63 additions & 30 deletions examples/demo_phonelm_npu.cpp
@@ -1,4 +1,6 @@
#ifdef USE_QNN
#include "Module.hpp"
#include "Types.hpp"
#include <memory>
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/phonelm/modeling_phonelm.hpp"
@@ -12,16 +14,18 @@ int main(int argc, char **argv) {
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/phonelm_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/phonelm_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/PhoneLM-1.5B-Instruct-128.mllm");
cmdParser.add<string>("decoding", 'd', "specify mllm decoding model path", false, "../models/phonelm-1.5b-droidcall-q4_0_4_4.mllm");
cmdParser.add<string>("decoding", 'd', "specify mllm decoding model path", false, "../models/phonelm-1.5b-instruct-q4_0_4_4.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.add<int>("chunk", 'c', "chunk size", false, 64);
cmdParser.parse_check(argc, argv);

string vocab_path = cmdParser.get<string>("vocab");
string merge_path = cmdParser.get<string>("merge");
string model_path = cmdParser.get<string>("model");
string decoding_path = cmdParser.get<string>("decoding");
int tokens_limit = cmdParser.get<int>("limits");
int chunk_size = cmdParser.get<int>("chunk");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto tokenizer = SmolLMTokenizer(vocab_path, merge_path);
@@ -33,35 +37,64 @@ int main(int argc, char **argv) {

vector<string> in_strs = {
"Give me a short introduction to large language model.",
"What is the Beijing University of Posts and Telecommunications.",
"What is the meaning of life?",
"Hello, who are you?",
"What can you do?",
"Please introduce Beijing University of Posts and Telecommunications.",
};

// turn on the multi-chunk prefilling
Module::isMultiChunkPrefilling = true;

for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
auto [real_seq_length, input_tensor] = tokenizer.tokenizeWithPadding(input_str, 64, config.vocab_size);
std::cout << real_seq_length << endl;
std::cout << input_str << std::endl;
auto [real_seq_length, input_tensor] = tokenizer.tokenizePaddingByChunk(input_str, chunk_size, config.vocab_size);
const int seq_length_padding = (chunk_size - real_seq_length % chunk_size) + real_seq_length;
const int chunk_num = seq_length_padding / chunk_size;
bool isSwitched = false;
// std::cout << "real seq length: " << real_seq_length << " padding to: " << seq_length_padding << " chunk num: " << chunk_num << std::endl;
std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;

// tensor vectors to save the chunked tensors of the QNN prefilling input
vector<Tensor> chunked_tensors(chunk_num);
LlmTextGeneratorOpts opt{
.max_new_tokens = 1,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
.top_p = 0.f,
.is_padding = true,
.seq_before_padding = real_seq_length,
.chunk_size = chunk_size,
};
model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
auto out_string = tokenizer.detokenize({out_token});
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
std::cout << output_string << std::flush;
return true;
});

for (int chunk_id = 0; chunk_id < chunk_num; ++chunk_id) {
chunked_tensors[chunk_id].setBackend(Backend::global_backends[MLLM_CPU]);
chunked_tensors[chunk_id].setTtype(INPUT_TENSOR);
chunked_tensors[chunk_id].reshape(1, 1, chunk_size, 1);
chunked_tensors[chunk_id].setName("input-chunk-" + to_string(chunk_id));
chunked_tensors[chunk_id].deepCopyFrom(&input_tensor, false, {0, 0, chunk_id * chunk_size, 0});

model.generate(chunked_tensors[chunk_id], opt, [&](unsigned int out_token) -> bool {
if (i != 0 && !isSwitched && chunk_id == 0) {
// turn off switching at the first chunk of following inputs
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
if (chunk_id == chunk_num - 1) { // print the output of the last chunk
std::cout << output_string << std::flush;
}
return true;
});
Module::isFirstChunk = false;
}

// turn on switching, set sequence length and execution type
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(AUTOREGRESSIVE);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();

LlmTextGeneratorOpts decoding_opt{
.max_new_tokens = 100,
@@ -71,23 +104,23 @@ int main(int argc, char **argv) {
.top_p = 0.f,
.is_padding = false,
};
bool isSwitched = false;
decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool {
// call only once of switchDecodeTag
if (!isSwitched) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
isSwitched = false;
decoding_model.generate(chunked_tensors.back(), decoding_opt, [&](unsigned int out_token) -> bool {
if (!isSwitched) { // turn off switching
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
auto [isOk, print_string] = tokenizer.postprocess(out_string);
if (isOk) {
std::cout << print_string << std::flush;
} else {
return false;
}
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
std::cout << output_string << std::flush;
return true;
});
std::cout << "\n---------------" << std::endl;

// turn on switching, set sequence length and execution type
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(0);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
std::cout << "\n";
}
}
#endif
}
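For reference, here is a minimal standalone sketch of the chunking arithmetic the demo above relies on (hypothetical values; not part of the commit). The prompt is right-padded to a multiple of the chunk size, and the QNN prefill graph is then run once per chunk; each chunk tensor is reshaped to [1, 1, chunk_size, 1] and filled from the padded input at offset chunk_id * chunk_size.

#include <iostream>

int main() {
    // Hypothetical values: a 100-token prompt with the demo's default chunk size of 64.
    const int real_seq_length = 100;
    const int chunk_size = 64;

    // Same arithmetic as in demo_phonelm_npu.cpp above.
    const int seq_length_padding = (chunk_size - real_seq_length % chunk_size) + real_seq_length;
    const int chunk_num = seq_length_padding / chunk_size;

    std::cout << "padded length: " << seq_length_padding // 128
              << ", chunks: " << chunk_num << std::endl; // 2

    // The demo then calls model.generate() once per chunk and only prints the
    // output produced for the last chunk, which contains the final real token.
    return 0;
}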
8 changes: 3 additions & 5 deletions examples/demo_qwen2.5_npu.cpp
@@ -1,4 +1,3 @@
#ifdef USE_QNN
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/qwen/configuration_qwen.hpp"
@@ -61,7 +60,7 @@ int main(int argc, char **argv) {
});

static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();

LlmTextGeneratorOpts decoding_opt{
.max_new_tokens = 100,
@@ -75,7 +74,7 @@ int main(int argc, char **argv) {
decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool {
// call only once of switchDecodeTag
if (!isSwitched) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
@@ -89,5 +88,4 @@ int main(int argc, char **argv) {
});
std::cout << "\n---------------" << std::endl;
}
}
#endif
}
8 changes: 3 additions & 5 deletions examples/demo_qwen_npu.cpp
@@ -1,4 +1,3 @@
#ifdef USE_QNN
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/qwen/configuration_qwen.hpp"
@@ -61,7 +60,7 @@ int main(int argc, char **argv) {
});

static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();

LlmTextGeneratorOpts decoding_opt{
.max_new_tokens = 100,
@@ -75,7 +74,7 @@ int main(int argc, char **argv) {
decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool {
// call only once of switchDecodeTag
if (!isSwitched) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
@@ -89,5 +88,4 @@ int main(int argc, char **argv) {
});
std::cout << "\n---------------" << std::endl;
}
}
#endif
}
12 changes: 6 additions & 6 deletions examples/main_phonelm_npu.cpp
@@ -190,11 +190,11 @@ int main(int argc, char **argv) {
auto inter_cpu_backend = dynamic_cast<CPUBackend *>(interNet.backends()[MLLM_CPU].get());
auto decode_cpu_backend = dynamic_cast<CPUBackend *>(cpuNet.backends()[MLLM_CPU].get());
prefill_cpu_backend->setSequenceLength(real_seq_length);
prefill_cpu_backend->switchDecodeTag();
prefill_cpu_backend->toggleSwitching();
inter_cpu_backend->setSequenceLength(real_seq_length);
inter_cpu_backend->switchDecodeTag();
inter_cpu_backend->toggleSwitching();
decode_cpu_backend->setSequenceLength(real_seq_length);
decode_cpu_backend->switchDecodeTag();
decode_cpu_backend->toggleSwitching();

// // 2: Decoding stage using CPU execute
for (int step = real_seq_length; step < real_seq_length + 100; step++) {
@@ -210,9 +210,9 @@ int main(int argc, char **argv) {
std::cout << out_token << std::flush;

if (step == real_seq_length) {
prefill_cpu_backend->switchDecodeTag();
inter_cpu_backend->switchDecodeTag();
decode_cpu_backend->switchDecodeTag();
prefill_cpu_backend->toggleSwitching();
inter_cpu_backend->toggleSwitching();
decode_cpu_backend->toggleSwitching();
}
}
} while (false);
12 changes: 6 additions & 6 deletions examples/main_qwen_npu.cpp
@@ -187,11 +187,11 @@ int main(int argc, char **argv) {
auto inter_cpu_backend = dynamic_cast<CPUBackend *>(interNet.backends()[MLLM_CPU].get());
auto decode_cpu_backend = dynamic_cast<CPUBackend *>(cpuNet.backends()[MLLM_CPU].get());
prefill_cpu_backend->setSequenceLength(real_seq_length);
prefill_cpu_backend->switchDecodeTag();
prefill_cpu_backend->toggleSwitching();
inter_cpu_backend->setSequenceLength(real_seq_length);
inter_cpu_backend->switchDecodeTag();
inter_cpu_backend->toggleSwitching();
decode_cpu_backend->setSequenceLength(real_seq_length);
decode_cpu_backend->switchDecodeTag();
decode_cpu_backend->toggleSwitching();

// // 2: Decoding stage using CPU execute
for (int step = real_seq_length; step < real_seq_length + 100; step++) {
@@ -207,9 +207,9 @@ int main(int argc, char **argv) {
std::cout << out_token << std::flush;

if (step == real_seq_length) {
prefill_cpu_backend->switchDecodeTag();
inter_cpu_backend->switchDecodeTag();
decode_cpu_backend->switchDecodeTag();
prefill_cpu_backend->toggleSwitching();
inter_cpu_backend->toggleSwitching();
decode_cpu_backend->toggleSwitching();
}
}
} while (false);
3 changes: 2 additions & 1 deletion include/Types.hpp
@@ -36,6 +36,7 @@ enum TensorStatus {
// TENSOR_DYNAMIC,
TENSOR_STATIC_INIT,
TENSOR_STATIC_READY,
TENSOR_UNDEFINED,
};

enum ErrorCode {
@@ -141,7 +142,7 @@ enum RoPEType {
MLAROPE = 5,
};

enum QNNExecutionType {
enum ExecutionType {
PROMPT = 0,
AUTOREGRESSIVE = 1,
};
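To make the prefill-to-decode handoff in the demos easier to follow, here is a toy, self-contained illustration of the calls they make around the renamed ExecutionType enum and toggleSwitching(). This is a simplified stand-in with assumed state, not the real CPUBackend.

#include <iostream>

enum ExecutionType { PROMPT = 0, AUTOREGRESSIVE = 1 };

// Toy stand-in for the switching state the demos drive on the CPU backend.
struct ToyCpuBackend {
    ExecutionType exec_type = PROMPT;
    int sequence_length = 0;
    bool switching = false; // armed between phases, turned off inside the first step after a switch

    void setExecutionType(ExecutionType t) { exec_type = t; }
    void setSequenceLength(int n) { sequence_length = n; }
    void toggleSwitching() { switching = !switching; }
};

int main() {
    ToyCpuBackend cpu;

    // After the last prefill chunk (cf. demo_phonelm_npu.cpp): switch to decoding.
    cpu.setSequenceLength(100); // hypothetical real (unpadded) prompt length
    cpu.setExecutionType(AUTOREGRESSIVE);
    cpu.toggleSwitching();

    // First decode step: the demos turn switching off exactly once in the generate callback.
    if (cpu.switching) cpu.toggleSwitching();

    // After generation: reset for the next prompt.
    cpu.setSequenceLength(0);
    cpu.setExecutionType(PROMPT);
    cpu.toggleSwitching();

    std::cout << "exec type: " << cpu.exec_type << ", switching armed: "
              << std::boolalpha << cpu.switching << std::endl;
    return 0;
}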
2 changes: 1 addition & 1 deletion scripts/build_qnn_android.sh
@@ -16,4 +16,4 @@ cmake .. \
-DQNN_VALIDATE_NODE=ON \
-DMLLM_BUILD_XNNPACK_BACKEND=OFF

make -j16
make -j40
44 changes: 44 additions & 0 deletions scripts/run_phonelm_qnn.sh
@@ -0,0 +1,44 @@
#!/bin/bash

adb shell mkdir -p /data/local/tmp/mllm/vocab
adb shell mkdir -p /data/local/tmp/mllm/qnn-lib

adb push ../vocab/phonelm_vocab.mllm /data/local/tmp/mllm/vocab/


if ! adb shell [ -f "/data/local/tmp/mllm/models/PhoneLM-1.5B-Instruct-128.mllm" ]; then
adb push ../models/PhoneLM-1.5B-Instruct-128.mllm "/data/local/tmp/mllm/models/PhoneLM-1.5B-Instruct-128.mllm"
else
echo "PhoneLM-1.5B-Instruct-128 file already exists"
fi


if ! adb shell [ -f "/data/local/tmp/mllm/models/phonelm-1.5b-instruct-q4_0_4_4.mllm" ]; then
adb push ../models/phonelm-1.5b-instruct-q4_0_4_4.mllm "/data/local/tmp/mllm/models/phonelm-1.5b-instruct-q4_0_4_4.mllm"
else
echo "phonelm-1.5b-instruct-q4_0_4_4.mllm file already exists"
fi

LIBPATH=../src/backends/qnn/qualcomm_ai_engine_direct_220/
ANDR_LIB=$LIBPATH/lib/aarch64-android
OP_PATH=../src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/build
DEST=/data/local/tmp/mllm/qnn-lib

adb push $ANDR_LIB/libQnnHtp.so $DEST
adb push $ANDR_LIB/libQnnHtpV75Stub.so $DEST
adb push $ANDR_LIB/libQnnHtpPrepare.so $DEST
adb push $ANDR_LIB/libQnnHtpProfilingReader.so $DEST
adb push $ANDR_LIB/libQnnHtpOptraceProfilingReader.so $DEST
adb push $ANDR_LIB/libQnnHtpV75CalculatorStub.so $DEST
adb push $LIBPATH/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so $DEST
adb push $OP_PATH/aarch64-android/libQnnLLaMAPackage.so $DEST/libQnnLLaMAPackage_CPU.so
adb push $OP_PATH/hexagon-v75/libQnnLLaMAPackage.so $DEST/libQnnLLaMAPackage_HTP.so


if [ $? -ne 0 ]; then
echo "adb push failed"
exit 1
fi

adb push ../bin-arm/demo_phonelm_npu /data/local/tmp/mllm/bin/
adb shell "cd /data/local/tmp/mllm/bin && export LD_LIBRARY_PATH=/data/local/tmp/mllm/qnn-lib && export ADSP_LIBRARY_PATH=/data/local/tmp/mllm/qnn-lib && ./demo_phonelm_npu"
21 changes: 18 additions & 3 deletions src/Generate.hpp
@@ -29,6 +29,7 @@ struct LlmTextGeneratorOpts {
float top_p = 0.92;
bool is_padding = false;
int seq_before_padding = 0;
int chunk_size = -1;
};

template <typename T>
@@ -51,12 +52,15 @@
class _LlmTextGenerateMethod {
bool is_padding = false;
int seq_before_padding = 0;
int chunk_size = -1;

public:
virtual ~_LlmTextGenerateMethod() = default;
virtual unsigned int generate(Tensor &t) = 0;
inline void setPadding(bool is_padding, int seq_before_padding) {
inline void setPadding(bool is_padding, int seq_before_padding, int chunk_size) {
this->is_padding = is_padding;
this->seq_before_padding = seq_before_padding;
this->chunk_size = chunk_size;
}
inline void _tensor_to_vec(Tensor &t, std::vector<float> &scores) {
assert(t.batch() == 1 && "Batch size of result is not 1. Which is not supported for now.");
@@ -65,7 +69,11 @@ class _LlmTextGenerateMethod {
int _seq = t.sequence() - 1;
// padding prefill for QNN
if (is_padding) {
_seq = seq_before_padding - 1;
if (chunk_size > 0) {
_seq = (seq_before_padding - 1) % chunk_size;
} else {
_seq = seq_before_padding - 1;
}
}
for (int i = 0; i < _dims; ++i) {
auto value = t.dataAt<float>(0, 0, _seq, i);
@@ -159,14 +167,21 @@ class LlmTextGenerator {

// padding prefill for QNN
if (opt.is_padding) {
m_method_class->setPadding(opt.is_padding, opt.seq_before_padding);
m_method_class->setPadding(opt.is_padding, opt.seq_before_padding, opt.chunk_size);
}
}

inline unsigned int generate(Tensor &t) {
return m_method_class->generate(t);
}

inline unsigned int generate(Tensor &t, const LlmTextGeneratorOpts &opt) {
if (opt.is_padding) {
m_method_class->setPadding(opt.is_padding, opt.seq_before_padding, opt.chunk_size);
}
return m_method_class->generate(t);
}

inline LLmTextGeneratorType type() {
return m_type;
}
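A small standalone check of the padded-logit index added above (hypothetical numbers, not repository code): with chunked padding, the last real prompt token lands inside the final chunk, so its row in that chunk's logits is (seq_before_padding - 1) % chunk_size.

#include <cassert>
#include <iostream>

int main() {
    // Hypothetical values matching the demo defaults: 100 real tokens, chunk size 64.
    const int seq_before_padding = 100;
    const int chunk_size = 64;

    // Index used by _tensor_to_vec above when chunk_size > 0.
    const int seq_in_chunk = (seq_before_padding - 1) % chunk_size;
    assert(seq_in_chunk == 35); // token 99 sits at row 35 of the second chunk (positions 64..127)

    std::cout << "chunk-local logit row: " << seq_in_chunk << std::endl;
    return 0;
}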