feat: QNN Multi Chunk Execution in New Frontend #191

Merged: 6 commits, Nov 14, 2024
93 changes: 63 additions & 30 deletions examples/demo_phonelm_npu.cpp
@@ -1,4 +1,6 @@
#ifdef USE_QNN
#include "Module.hpp"
#include "Types.hpp"
#include <memory>
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/phonelm/modeling_phonelm.hpp"
@@ -12,16 +14,18 @@ int main(int argc, char **argv) {
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/phonelm_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/phonelm_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/PhoneLM-1.5B-Instruct-128.mllm");
cmdParser.add<string>("decoding", 'd', "specify mllm decoding model path", false, "../models/phonelm-1.5b-droidcall-q4_0_4_4.mllm");
cmdParser.add<string>("decoding", 'd', "specify mllm decoding model path", false, "../models/phonelm-1.5b-instruct-q4_0_4_4.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.add<int>("chunk", 'c', "chunk size", false, 64);
cmdParser.parse_check(argc, argv);

string vocab_path = cmdParser.get<string>("vocab");
string merge_path = cmdParser.get<string>("merge");
string model_path = cmdParser.get<string>("model");
string decoding_path = cmdParser.get<string>("decoding");
int tokens_limit = cmdParser.get<int>("limits");
int chunk_size = cmdParser.get<int>("chunk");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto tokenizer = SmolLMTokenizer(vocab_path, merge_path);
@@ -33,35 +37,64 @@ int main(int argc, char **argv) {

vector<string> in_strs = {
"Give me a short introduction to large language model.",
"What is the Beijing University of Posts and Telecommunications.",
"What is the meaning of life?",
"Hello, who are you?",
"What can you do?",
"Please introduce Beijing University of Posts and Telecommunications.",
};

// turn on the multi-chunk prefilling
Module::isMultiChunkPrefilling = true;

for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
auto [real_seq_length, input_tensor] = tokenizer.tokenizeWithPadding(input_str, 64, config.vocab_size);
std::cout << real_seq_length << endl;
std::cout << input_str << std::endl;
auto [real_seq_length, input_tensor] = tokenizer.tokenizePaddingByChunk(input_str, chunk_size, config.vocab_size);
const int seq_length_padding = (chunk_size - real_seq_length % chunk_size) + real_seq_length;
const int chunk_num = seq_length_padding / chunk_size;
bool isSwitched = false;
// std::cout << "real seq length: " << real_seq_length << " padding to: " << seq_length_padding << " chunk num: " << chunk_num << std::endl;
std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;

// tensor vectors to save the chunked tensors of the QNN prefilling input
vector<Tensor> chunked_tensors(chunk_num);
LlmTextGeneratorOpts opt{
.max_new_tokens = 1,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
.top_p = 0.f,
.is_padding = true,
.seq_before_padding = real_seq_length,
.chunk_size = chunk_size,
};
model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
auto out_string = tokenizer.detokenize({out_token});
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
std::cout << output_string << std::flush;
return true;
});

for (int chunk_id = 0; chunk_id < chunk_num; ++chunk_id) {
chunked_tensors[chunk_id].setBackend(Backend::global_backends[MLLM_CPU]);
chunked_tensors[chunk_id].setTtype(INPUT_TENSOR);
chunked_tensors[chunk_id].reshape(1, 1, chunk_size, 1);
chunked_tensors[chunk_id].setName("input-chunk-" + to_string(chunk_id));
chunked_tensors[chunk_id].deepCopyFrom(&input_tensor, false, {0, 0, chunk_id * chunk_size, 0});

model.generate(chunked_tensors[chunk_id], opt, [&](unsigned int out_token) -> bool {
if (i != 0 && !isSwitched && chunk_id == 0) {
// turn off switching at the first chunk of following inputs
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
if (chunk_id == chunk_num - 1) { // print the output of the last chunk
std::cout << output_string << std::flush;
}
return true;
});
Module::isFirstChunk = false;
}

// turn on switching, set sequence length and execution type
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(AUTOREGRESSIVE);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();

LlmTextGeneratorOpts decoding_opt{
.max_new_tokens = 100,
@@ -71,23 +104,23 @@ int main(int argc, char **argv) {
.top_p = 0.f,
.is_padding = false,
};
bool isSwitched = false;
decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool {
// call only once of switchDecodeTag
if (!isSwitched) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
isSwitched = false;
decoding_model.generate(chunked_tensors.back(), decoding_opt, [&](unsigned int out_token) -> bool {
if (!isSwitched) { // turn off switching
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
auto [isOk, print_string] = tokenizer.postprocess(out_string);
if (isOk) {
std::cout << print_string << std::flush;
} else {
return false;
}
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
std::cout << output_string << std::flush;
return true;
});
std::cout << "\n---------------" << std::endl;

// turn on switching, set sequence length and execution type
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(0);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
std::cout << "\n";
}
}
#endif
}
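
Note (not part of the diff): the padding arithmetic driving the chunk loop above reduces to the standalone C++ sketch below. The sequence length is hypothetical and the mllm types are omitted; also note that a prompt whose length is already a multiple of chunk_size still receives one extra full chunk of padding under this formula.

#include <iostream>

int main() {
    const int chunk_size = 64;      // same default as the demo's --chunk flag
    const int real_seq_length = 97; // hypothetical tokenized prompt length

    // Pad up to the next multiple of chunk_size, then split into chunks.
    const int seq_length_padding = (chunk_size - real_seq_length % chunk_size) + real_seq_length;
    const int chunk_num = seq_length_padding / chunk_size;

    std::cout << "padded length: " << seq_length_padding // 128
              << ", chunks: " << chunk_num << std::endl; // 2
    return 0;
}

Each chunk is then a chunk_size-long view into the padded input tensor (the deepCopyFrom call with offset {0, 0, chunk_id * chunk_size, 0}), and only the last chunk's output is printed.
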
8 changes: 3 additions & 5 deletions examples/demo_qwen2.5_npu.cpp
@@ -1,4 +1,3 @@
#ifdef USE_QNN
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/qwen/configuration_qwen.hpp"
@@ -61,7 +60,7 @@ int main(int argc, char **argv) {
});

static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();

LlmTextGeneratorOpts decoding_opt{
.max_new_tokens = 100,
@@ -75,7 +74,7 @@ int main(int argc, char **argv) {
decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool {
// call only once of switchDecodeTag
if (!isSwitched) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
@@ -89,5 +88,4 @@ int main(int argc, char **argv) {
});
std::cout << "\n---------------" << std::endl;
}
}
#endif
}
8 changes: 3 additions & 5 deletions examples/demo_qwen_npu.cpp
@@ -1,4 +1,3 @@
#ifdef USE_QNN
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/qwen/configuration_qwen.hpp"
@@ -61,7 +60,7 @@ int main(int argc, char **argv) {
});

static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();

LlmTextGeneratorOpts decoding_opt{
.max_new_tokens = 100,
@@ -75,7 +74,7 @@ int main(int argc, char **argv) {
decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool {
// call only once of switchDecodeTag
if (!isSwitched) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
@@ -89,5 +88,4 @@ int main(int argc, char **argv) {
});
std::cout << "\n---------------" << std::endl;
}
}
#endif
}
12 changes: 6 additions & 6 deletions examples/main_phonelm_npu.cpp
@@ -190,11 +190,11 @@ int main(int argc, char **argv) {
auto inter_cpu_backend = dynamic_cast<CPUBackend *>(interNet.backends()[MLLM_CPU].get());
auto decode_cpu_backend = dynamic_cast<CPUBackend *>(cpuNet.backends()[MLLM_CPU].get());
prefill_cpu_backend->setSequenceLength(real_seq_length);
prefill_cpu_backend->switchDecodeTag();
prefill_cpu_backend->toggleSwitching();
inter_cpu_backend->setSequenceLength(real_seq_length);
inter_cpu_backend->switchDecodeTag();
inter_cpu_backend->toggleSwitching();
decode_cpu_backend->setSequenceLength(real_seq_length);
decode_cpu_backend->switchDecodeTag();
decode_cpu_backend->toggleSwitching();

// // 2: Decoding stage using CPU execute
for (int step = real_seq_length; step < real_seq_length + 100; step++) {
@@ -210,9 +210,9 @@ int main(int argc, char **argv) {
std::cout << out_token << std::flush;

if (step == real_seq_length) {
prefill_cpu_backend->switchDecodeTag();
inter_cpu_backend->switchDecodeTag();
decode_cpu_backend->switchDecodeTag();
prefill_cpu_backend->toggleSwitching();
inter_cpu_backend->toggleSwitching();
decode_cpu_backend->toggleSwitching();
}
}
} while (false);
12 changes: 6 additions & 6 deletions examples/main_qwen_npu.cpp
@@ -187,11 +187,11 @@ int main(int argc, char **argv) {
auto inter_cpu_backend = dynamic_cast<CPUBackend *>(interNet.backends()[MLLM_CPU].get());
auto decode_cpu_backend = dynamic_cast<CPUBackend *>(cpuNet.backends()[MLLM_CPU].get());
prefill_cpu_backend->setSequenceLength(real_seq_length);
prefill_cpu_backend->switchDecodeTag();
prefill_cpu_backend->toggleSwitching();
inter_cpu_backend->setSequenceLength(real_seq_length);
inter_cpu_backend->switchDecodeTag();
inter_cpu_backend->toggleSwitching();
decode_cpu_backend->setSequenceLength(real_seq_length);
decode_cpu_backend->switchDecodeTag();
decode_cpu_backend->toggleSwitching();

// // 2: Decoding stage using CPU execute
for (int step = real_seq_length; step < real_seq_length + 100; step++) {
@@ -207,9 +207,9 @@ int main(int argc, char **argv) {
std::cout << out_token << std::flush;

if (step == real_seq_length) {
prefill_cpu_backend->switchDecodeTag();
inter_cpu_backend->switchDecodeTag();
decode_cpu_backend->switchDecodeTag();
prefill_cpu_backend->toggleSwitching();
inter_cpu_backend->toggleSwitching();
decode_cpu_backend->toggleSwitching();
}
}
} while (false);
3 changes: 2 additions & 1 deletion include/Types.hpp
@@ -36,6 +36,7 @@ enum TensorStatus {
// TENSOR_DYNAMIC,
TENSOR_STATIC_INIT,
TENSOR_STATIC_READY,
TENSOR_UNDEFINED,
};

enum ErrorCode {
@@ -141,7 +142,7 @@ enum RoPEType {
MLAROPE = 5,
};

enum QNNExecutionType {
enum ExecutionType {
PROMPT = 0,
AUTOREGRESSIVE = 1,
};
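
Note (illustrative, not part of the diff): the renamed ExecutionType enum is what the demos pass to CPUBackend::setExecutionType to flip between the chunked prefill phase (PROMPT) and token-by-token decoding (AUTOREGRESSIVE). A minimal sketch of that kind of dispatch, with a made-up run_step function standing in for the backend:

#include <iostream>

enum ExecutionType { PROMPT = 0, AUTOREGRESSIVE = 1 };

// run_step is hypothetical; it only illustrates how the flag gates the two paths.
void run_step(ExecutionType type) {
    if (type == PROMPT) {
        std::cout << "prefill path: process the padded prompt chunk by chunk\n";
    } else {
        std::cout << "decode path: generate one token per forward pass\n";
    }
}

int main() {
    run_step(PROMPT);
    run_step(AUTOREGRESSIVE);
    return 0;
}
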
2 changes: 1 addition & 1 deletion scripts/build_qnn_android.sh
@@ -16,4 +16,4 @@ cmake .. \
-DQNN_VALIDATE_NODE=ON \
-DMLLM_BUILD_XNNPACK_BACKEND=OFF

make -j16
make -j40
44 changes: 44 additions & 0 deletions scripts/run_phonelm_qnn.sh
@@ -0,0 +1,44 @@
#!/bin/bash

adb shell mkdir -p /data/local/tmp/mllm/vocab
adb shell mkdir -p /data/local/tmp/mllm/qnn-lib

adb push ../vocab/phonelm_vocab.mllm /data/local/tmp/mllm/vocab/


if ! adb shell [ -f "/data/local/tmp/mllm/models/PhoneLM-1.5B-Instruct-128.mllm" ]; then
adb push ../models/PhoneLM-1.5B-Instruct-128.mllm "/data/local/tmp/mllm/models/PhoneLM-1.5B-Instruct-128.mllm"
else
echo "PhoneLM-1.5B-Instruct-128 file already exists"
fi


if ! adb shell [ -f "/data/local/tmp/mllm/models//phonelm-1.5b-instruct-q4_0_4_4.mllm" ]; then
adb push ../models//phonelm-1.5b-instruct-q4_0_4_4.mllm "/data/local/tmp/mllm/models//phonelm-1.5b-instruct-q4_0_4_4.mllm"
else
echo "/phonelm-1.5b-instruct-q4_0_4_4.mllm file already exists"
fi

LIBPATH=../src/backends/qnn/qualcomm_ai_engine_direct_220/
ANDR_LIB=$LIBPATH/lib/aarch64-android
OP_PATH=../src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/build
DEST=/data/local/tmp/mllm/qnn-lib

adb push $ANDR_LIB/libQnnHtp.so $DEST
adb push $ANDR_LIB/libQnnHtpV75Stub.so $DEST
adb push $ANDR_LIB/libQnnHtpPrepare.so $DEST
adb push $ANDR_LIB/libQnnHtpProfilingReader.so $DEST
adb push $ANDR_LIB/libQnnHtpOptraceProfilingReader.so $DEST
adb push $ANDR_LIB/libQnnHtpV75CalculatorStub.so $DEST
adb push $LIBPATH/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so $DEST
adb push $OP_PATH/aarch64-android/libQnnLLaMAPackage.so $DEST/libQnnLLaMAPackage_CPU.so
adb push $OP_PATH/hexagon-v75/libQnnLLaMAPackage.so $DEST/libQnnLLaMAPackage_HTP.so


if [ $? -ne 0 ]; then
echo "adb push failed"
exit 1
fi

adb push ../bin-arm/demo_phonelm_npu /data/local/tmp/mllm/bin/
adb shell "cd /data/local/tmp/mllm/bin && export LD_LIBRARY_PATH=/data/local/tmp/mllm/qnn-lib && export ADSP_LIBRARY_PATH=/data/local/tmp/mllm/qnn-lib && ./demo_phonelm_npu"
21 changes: 18 additions & 3 deletions src/Generate.hpp
@@ -29,6 +29,7 @@ struct LlmTextGeneratorOpts {
float top_p = 0.92;
bool is_padding = false;
int seq_before_padding = 0;
int chunk_size = -1;
};

template <typename T>
@@ -51,12 +52,15 @@ enum class LLmTextGeneratorType : int32_t {
class _LlmTextGenerateMethod {
bool is_padding = false;
int seq_before_padding = 0;
int chunk_size = -1;

public:
virtual ~_LlmTextGenerateMethod() = default;
virtual unsigned int generate(Tensor &t) = 0;
inline void setPadding(bool is_padding, int seq_before_padding) {
inline void setPadding(bool is_padding, int seq_before_padding, int chunk_size) {
this->is_padding = is_padding;
this->seq_before_padding = seq_before_padding;
this->chunk_size = chunk_size;
}
inline void _tensor_to_vec(Tensor &t, std::vector<float> &scores) {
assert(t.batch() == 1 && "Batch size of result is not 1. Which is not supported for now.");
Expand All @@ -65,7 +69,11 @@ class _LlmTextGenerateMethod {
int _seq = t.sequence() - 1;
// padding prefill for QNN
if (is_padding) {
_seq = seq_before_padding - 1;
if (chunk_size > 0) {
_seq = (seq_before_padding - 1) % chunk_size;
} else {
_seq = seq_before_padding - 1;
}
}
for (int i = 0; i < _dims; ++i) {
auto value = t.dataAt<float>(0, 0, _seq, i);
@@ -159,14 +167,21 @@ class LlmTextGenerator {

// padding prefill for QNN
if (opt.is_padding) {
m_method_class->setPadding(opt.is_padding, opt.seq_before_padding);
m_method_class->setPadding(opt.is_padding, opt.seq_before_padding, opt.chunk_size);
}
}

inline unsigned int generate(Tensor &t) {
return m_method_class->generate(t);
}

inline unsigned int generate(Tensor &t, const LlmTextGeneratorOpts &opt) {
if (opt.is_padding) {
m_method_class->setPadding(opt.is_padding, opt.seq_before_padding, opt.chunk_size);
}
return m_method_class->generate(t);
}

inline LLmTextGeneratorType type() {
return m_type;
}
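
Note (not part of the diff): the new chunk_size handling in _tensor_to_vec picks which row of the logits tensor to sample from. With chunked prefill the logits tensor spans a single chunk, so the position of the last real token is taken modulo chunk_size. A standalone sketch with hypothetical numbers:

#include <iostream>

int logit_row(int seq_before_padding, int chunk_size) {
    // Mirrors the branch added above: chunked padded prefill vs. plain padded prefill.
    if (chunk_size > 0) return (seq_before_padding - 1) % chunk_size;
    return seq_before_padding - 1;
}

int main() {
    // 97 real tokens padded to 128 with chunk_size = 64: the last real token is
    // row 96 of the full sequence, i.e. row 32 within the final 64-token chunk.
    std::cout << logit_row(97, 64) << std::endl; // 32
    std::cout << logit_row(97, -1) << std::endl; // 96
    return 0;
}
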