feat: QNN New Frontend Phonelm Support and Refactors #179

Merged · 55 commits · Nov 7, 2024
Commits
af9ba7d
Merge pull request #7 from liang1232018/develop-merge
liang1232018 Oct 29, 2024
71778d1
feat : support QNN RoPE now, but in reference mode.
liang1232018 Oct 29, 2024
b0d5c9a
fix: qnn old frontend pipeline
oreomaker Oct 31, 2024
3e67102
Merge pull request #8 from liang1232018/develop-zh
liang1232018 Oct 31, 2024
43c25aa
feat: support qnn 1000 token prefilling
oreomaker Oct 31, 2024
c153e05
Merge pull request #9 from liang1232018/develop-zh
liang1232018 Oct 31, 2024
448f26a
feat : rope support h_cnt as input.
liang1232018 Oct 31, 2024
ecd5476
feat : use HVX acceleration for RoPE.
liang1232018 Oct 31, 2024
35af2e7
feat : optimize RoPE FP performance and fix sin cos weights load bugs.
liang1232018 Oct 31, 2024
5ea7068
feat : optimize ROPE Memory.
liang1232018 Nov 1, 2024
9b50cba
feat : optimize RoPE execution.
liang1232018 Nov 1, 2024
5d96b9f
feat : support QUINT8 ROPE as input.
liang1232018 Nov 2, 2024
3b4cedd
refactor: qnn old frontend use nocopy merge&split, unify old and new e…
oreomaker Nov 3, 2024
b7154f0
fix: embedding padding memset
oreomaker Nov 3, 2024
fe60480
feat : init implementation for phonelm.
liang1232018 Nov 3, 2024
308aa8b
fix : relu reference implementation bugs.
liang1232018 Nov 3, 2024
b15b831
Merge branch 'develop-xdl' into develop-merge
oreomaker Nov 4, 2024
ad2101d
fix : use mllm embedding weights as cpu linear head weights.
liang1232018 Nov 4, 2024
8af0f33
fix : mllm input file and lm_head layer weights.
liang1232018 Nov 4, 2024
9516a38
fix : embedding bugs.
liang1232018 Nov 4, 2024
bcfb676
fix : reference code and Quantize execution.
liang1232018 Nov 4, 2024
4fb33d7
fix : phoneLM buffer size setting in shadow execution.
liang1232018 Nov 4, 2024
0fddd14
fix : change all the operator back to FP32.
liang1232018 Nov 4, 2024
247ee00
feat : use shadow = 64 as the NPU model.
liang1232018 Nov 4, 2024
c5daff8
fix : quantization execution -128 setting error.
liang1232018 Nov 4, 2024
b453f9a
fix : shadow execution error in input1 and input2 shapes.
liang1232018 Nov 5, 2024
2324761
feat : use NPU acceleration implementation.
liang1232018 Nov 5, 2024
6d091e8
feat : use QUINT8 Relu and mul in FFN.
liang1232018 Nov 5, 2024
3343d7d
feat : using fp16 kvcache.
liang1232018 Nov 5, 2024
3790f39
feat : support shadow execution with fp16 type and front-end config.
liang1232018 Nov 5, 2024
549f329
feat: qwen npu no copy with graph fusion
oreomaker Nov 6, 2024
5740b53
Merge branch 'develop-phonelm' into develop-merge
oreomaker Nov 6, 2024
bc4f8a1
refactor: rewrite npu phonelm modeling
oreomaker Nov 6, 2024
3c50a0c
chore: remove unused npu view, add qnn node validation option
oreomaker Nov 6, 2024
5479c8f
fix: npu phonelm shadow merge and split
oreomaker Nov 6, 2024
64f02d7
feat: npu demo phonelm
oreomaker Nov 6, 2024
6eea490
feat : add QNNIRoPE.
liang1232018 Nov 6, 2024
6338d89
fix : QNNIRope express.
liang1232018 Nov 6, 2024
1bfbfd3
fix : CPU Linear int8 shadow bugs.
liang1232018 Nov 6, 2024
1d2d004
fix : fix phonelm new front end bugs.
liang1232018 Nov 6, 2024
b25daf7
fix : use the fastest qwen and phonelm model structure.
liang1232018 Nov 6, 2024
f968bf1
fix: qnn pipeline run setDataLoader
oreomaker Nov 6, 2024
e5b3dc7
Merge branches 'develop-merge' and 'develop-merge' of github.com:lian…
oreomaker Nov 6, 2024
27abc8a
dev: make pipeline gap 1 layer
oreomaker Nov 6, 2024
f021d9e
feat : optimize phonelm and shadow op threshold.
liang1232018 Nov 6, 2024
f446c60
Merge pull request #10 from liang1232018/develop-merge
liang1232018 Nov 6, 2024
ecde02e
Merge branch 'develop-xdl' into develop-phonelm-qnnirope
liang1232018 Nov 6, 2024
deea2d1
Merge pull request #11 from liang1232018/develop-phonelm-qnnirope
liang1232018 Nov 6, 2024
88f7290
fix : IRoPE implementation bug and scale bug.
liang1232018 Nov 6, 2024
fef3657
fix : merge model error.
liang1232018 Nov 6, 2024
db5f65c
Merge branch 'main' into develop-phonelm-merge
oreomaker Nov 7, 2024
937e22a
fix: demo phonelm npu decoding, main branch merge
oreomaker Nov 7, 2024
5eebae0
Merge branch 'develop-xdl' into develop-merge
oreomaker Nov 7, 2024
c4e2a3d
fix: __fp16 -> mllm_fp16_t
yirongjie Nov 7, 2024
0816e06
Merge branch 'main' into main
yirongjie Nov 7, 2024
Files changed
5 changes: 3 additions & 2 deletions CMakeLists.txt
@@ -72,8 +72,9 @@ option(QNN_OLD_FRONTEND "Enable Old QNN" OFF)
if(QNN)
add_definitions(-DUSE_QNN) # the USE_QNN should come before cpu subdirectory
endif()
if(QNN_OLD_FRONTEND)
add_definitions(-DOLD_QNN)
option(QNN_VALIDATE_NODE "Enable QNN Validate Node When Building Graph" ON)
if(QNN_VALIDATE_NODE)
add_definitions(-DQNN_VALIDATE_NODE)
endif()

if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
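The hunk above only wires up a new compile definition: with QNN_VALIDATE_NODE ON (the default), backend sources are built with -DQNN_VALIDATE_NODE and can compile graph-build-time node validation in, while configuring with -DQNN_VALIDATE_NODE=OFF strips it out. A minimal sketch of that compile-time guard pattern, using assumed function names rather than the actual mllm QNN backend API:

#include <iostream>
#include <string>

// Placeholder check; the real backend would call the QNN op validator here.
static bool validateNode(const std::string &opType) {
    return !opType.empty();
}

static void addNodeToGraph(const std::string &opType) {
#ifdef QNN_VALIDATE_NODE
    // Compiled in only when the CMake option defines QNN_VALIDATE_NODE.
    if (!validateNode(opType)) {
        std::cerr << "QNN node validation failed: " << opType << std::endl;
        return;
    }
#endif
    std::cout << "node added: " << opType << std::endl;
}

int main() {
    addNodeToGraph("MatMul");
    return 0;
}

This matches the intent of the "add qnn node validation option" commit above: the check only costs build-graph time in configurations that opt in.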
4 changes: 3 additions & 1 deletion examples/CMakeLists.txt
@@ -90,8 +90,10 @@ func_vlm_add_executable(demo_imagebind_1mod)

# QNN demo
if(QNN)
func_llm_add_executable(demo_qnn)
func_llm_add_executable(demo_qwen_npu)
func_llm_add_executable(main_qwen_npu)
func_llm_add_executable(demo_phonelm_npu)
func_llm_add_executable(main_phonelm_npu)
endif()


2 changes: 1 addition & 1 deletion examples/demo_phonelm.cpp
@@ -11,7 +11,7 @@ int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/phonelm_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/phonelm_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/phonelm-1.5b-instruct-fp32.mllm");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/phonelm-1.5b-instruct-q4_0_4_4.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);
91 changes: 91 additions & 0 deletions examples/demo_phonelm_npu.cpp
@@ -0,0 +1,91 @@
#ifdef USE_QNN
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/phonelm/modeling_phonelm.hpp"
#include "models/phonelm/modeling_phonelm_npu.hpp"
#include "models/smollm/tokenization_smollm.hpp"

using namespace mllm;

int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/phonelm_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/phonelm_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/PhoneLM-1.5B-Instruct-128.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);

string vocab_path = cmdParser.get<string>("vocab");
string merge_path = cmdParser.get<string>("merge");
string model_path = cmdParser.get<string>("model");
int tokens_limit = cmdParser.get<int>("limits");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto tokenizer = SmolLMTokenizer(vocab_path, merge_path);
PhoneLMConfig config(tokens_limit, "1.5B");
auto model = PhoneLMForCausalLM_NPU(config);
model.load(model_path);
auto decoding_model = PhoneLMForCausalLM(config);
decoding_model.load("../models/phonelm-1.5b-instruct-q4_0_4_4.mllm");

vector<string> in_strs = {
"Give me a short introduction to large language model.",
};

for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
auto [real_seq_length, input_tensor] = tokenizer.tokenizeWithPadding(input_str, 64, config.vocab_size);
std::cout << real_seq_length << endl;
std::cout << input_str << std::endl;
std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;

LlmTextGeneratorOpts opt{
.max_new_tokens = 1,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
.top_p = 0.f,
.is_padding = true,
.seq_before_padding = real_seq_length,
};
model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
auto out_string = tokenizer.detokenize({out_token});
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
std::cout << output_string << std::flush;
return true;
});

static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();

LlmTextGeneratorOpts decoding_opt{
.max_new_tokens = 100,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
.top_p = 0.f,
.is_padding = false,
};
bool isSwitched = false;
decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool {
// call switchDecodeTag only once
if (!isSwitched) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
auto [isOk, print_string] = tokenizer.postprocess(out_string);
if (isOk) {
std::cout << print_string << std::flush;
} else {
return false;
}
return true;
});
std::cout << "\n---------------" << std::endl;
}
}
#endif
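In the demo above, the NPU prefill always consumes a fixed 64-token padded input, so the real prompt length is carried separately (seq_before_padding in the prefill options, setSequenceLength on the CPU backend) and decoding resumes from that position. A small self-contained illustration of the padding idea, assuming right-padding and a hypothetical padToLength helper; it mirrors the role of tokenizeWithPadding but is not its actual implementation:

#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Right-pad a token id sequence to a fixed prefill length and remember the real length.
static std::pair<int, std::vector<int64_t>> padToLength(std::vector<int64_t> ids,
                                                        int target_len, int64_t pad_id) {
    const int real_len = static_cast<int>(ids.size());
    if (real_len < target_len) ids.resize(target_len, pad_id);
    return {real_len, std::move(ids)};
}

int main() {
    auto [real_len, padded] = padToLength({12, 345, 678}, 8, 0);
    std::cout << "real length: " << real_len
              << ", padded length: " << padded.size() << std::endl;
    return 0;
}

The padded input keeps the shape fed to the QNN prefill graph fixed, while the real length is what the demo uses to pick the last meaningful logit and to set the CPU backend's sequence position before decoding.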
4 changes: 3 additions & 1 deletion examples/demo_qnn.cpp → examples/demo_qwen_npu.cpp
@@ -1,3 +1,4 @@
#ifdef USE_QNN
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/qwen/configuration_qwen.hpp"
@@ -88,4 +89,5 @@ int main(int argc, char **argv) {
});
std::cout << "\n---------------" << std::endl;
}
}
}
#endif
236 changes: 236 additions & 0 deletions examples/main_phonelm_npu.cpp
@@ -0,0 +1,236 @@
#ifdef USE_QNN
#include <iostream>
#include <csignal>
#include <memory>
#include <vector>
#include "Executor.hpp"
#include "Types.hpp"
#include "backends/qnn/QNNNet.hpp"
#include "cmdline.h"
#include "Net.hpp"
#include "backends/qnn/QNNExecutor.hpp"

#include "models/smollm/tokenization_smollm.hpp"
#include "main_phonelm_npu.hpp"

using namespace mllm;

unsigned int argmax(const std::vector<float> &scores) {
return std::max_element(scores.begin(), scores.end()) - scores.begin();
}

unsigned int postProcessing(shared_ptr<Tensor> result, shared_ptr<Tensor> &out_result) {
assert(result->batch() == 1);
assert(result->head() == 1);
out_result->reshape(1, 1, 1, 1);
out_result->alloc();
vector<float> scores;
for (int i = 0; i < result->dimension(); ++i) {
auto value = result->dataAt<float>(0, 0, result->sequence() - 1, i);
scores.push_back(value);
}
auto token_idx = argmax(scores);
out_result->setDataAt<float>(0, 0, 0, 0, token_idx);
return token_idx;
}

unsigned int postProcessing_prefill(shared_ptr<Tensor> result, shared_ptr<Tensor> &out_result, int seq) {
assert(result->batch() == 1);
assert(result->head() == 1);
out_result->reshape(1, 1, 1, 1);
out_result->alloc();
vector<float> scores;
for (int i = 0; i < result->dimension(); ++i) {
auto value = result->dataAt<float>(0, 0, seq - 1, i);
scores.push_back(value);
}
auto token_idx = argmax(scores);
out_result->setDataAt<float>(0, 0, 0, 0, token_idx);
return token_idx;
}

int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/phonelm_vocab.mllm");

cmdParser.add<int>("limits", 'l', "max KV cache size", false, 1124);

cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.add<int>("seq", 's', "sequence length", false, 64);
cmdParser.add<bool>("chunk", 'c', "use chunk execute", false, true);
cmdParser.add<int>("head", 'h', "num of heads", false, 16);

cmdParser.add<int>("ffn", 'f', "size of ffn hidden size", false, 6816);
cmdParser.add<int>("hds", 'd', "size of hidden size", false, 2560);

cmdParser.add<bool>("readfile", 'r', "read prompt from file", false, false);

cmdParser.parse_check(argc, argv);

const string npu_model_path = "../models/PhoneLM-1.5B-Instruct-128.mllm";
const string cpu_model_path = "../models/phonelm-with-head-q4k.mllm";
const string merge_file_path = "../vocab/phonelm_merges.txt";

string vocab_path = cmdParser.get<string>("vocab");
int tokens_limit = cmdParser.get<int>("limits");
int thread_num = cmdParser.get<int>("thread");
int seqLength = cmdParser.get<int>("seq");
bool isChunkExecute = cmdParser.get<bool>("chunk");
int head_num = cmdParser.get<int>("head");

bool read_file = cmdParser.get<bool>("readfile");

int chunk = 1;
if (isChunkExecute)
chunk = seqLength / 256;

int vocab_size = 49152;
int hidden_dim = cmdParser.get<int>("hds");
int ffn_hidden_dim = cmdParser.get<int>("ffn");

vector<string> in_strs = {
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nGive me a short introduction to large language model.<|im_end|>\n<|im_start|>assistant\n",
// " What can you do?",
// "Please introduce Beijing University of Posts and Telecommunications."};
};

string input_string;
if (read_file) {
std::ifstream file("./func_prompt.txt");
if (!file) {
std::cerr << "Failed to open the file!" << std::endl;
return 1;
}
std::stringstream buffer;
buffer << file.rdbuf();
input_string = buffer.str();
file.close(); // close the file
} else {
input_string = in_strs[0];
}

auto tokenizer = SmolLMTokenizer(vocab_path, merge_file_path);

std::unique_ptr<Context> npu_ctx_ptr(new Context());
auto *npu_ctx = npu_ctx_ptr.get();
std::unique_ptr<Context> cpu_ctx_ptr(new Context());
auto *cpu_ctx = cpu_ctx_ptr.get();
std::unique_ptr<Context> inter_ctx_ptr(new Context());
auto *inter_ctx = inter_ctx_ptr.get();

// cache_max should be longer than seqLength
modeling::phonelm_npu(npu_ctx, vocab_size, hidden_dim, ffn_hidden_dim, head_num, tokens_limit, seqLength, chunk);
modeling::phonelm_npu_cpu_inter(inter_ctx, vocab_size, hidden_dim, ffn_hidden_dim, head_num, tokens_limit, seqLength, chunk);
modeling::phonelm_cpu_q40(cpu_ctx, vocab_size, hidden_dim, ffn_hidden_dim, head_num, tokens_limit);

BackendConfig bn;
QNNNet npuNet(bn, npu_ctx);
npuNet.convert(npu_ctx, BackendType::MLLM_QNN, thread_num);
Net interNet(bn);
interNet.convert(inter_ctx->sub_param_, BackendType::MLLM_CPU, thread_num);
Net cpuNet(bn);
cpuNet.convert(cpu_ctx->sub_param_, BackendType::MLLM_CPU, thread_num);

ParamLoader npu_prefill_param_loader(npu_model_path);
ParamLoader cpu_decoding_param_loader(cpu_model_path);
ParamLoader inter_param_loader(npu_model_path);

QNNExecutor *npuExePtr;
if (isChunkExecute) {
npuExePtr = new QNNPipelineExecutor(&npu_prefill_param_loader);
} else {
npuExePtr = new QNNExecutor(&npu_prefill_param_loader);
}
auto &npuExe = *npuExePtr;
npuExe.setup(&npuNet);
Executor interExe(&inter_param_loader);
interExe.setup(&interNet);
Executor cpuExe(&cpu_decoding_param_loader);
cpuExe.setup(&cpuNet);

shared_ptr<Tensor> input = std::make_shared<Tensor>();

for (int str_i = 0; str_i < in_strs.size(); ++str_i) {
// auto in_str = in_strs[str_i];
auto [real_seq_length, input_tensor] = tokenizer.tokenizeWithPadding(input_string, seqLength, vocab_size);
auto input = std::make_shared<Tensor>(input_tensor);

if (chunk != 1)
npuExe.warmup(npu_ctx, &npuNet, {input});

std::cout << "real_seq_length: " << real_seq_length << std::endl;
std::cout << "[Q] " << input_string << std::endl;
std::cout << "[A] " << std::flush;

do {
// 1: Prefill stage using NPU chunk execute
if (chunk == 1)
npuExe.run(npu_ctx, &npuNet, {input});
else
npuExe.runExp(npu_ctx, &npuNet, {input});
auto result = npuExe.result();

// result[0]->printData<float>();
// exit(0);

// inter model for prefill-decode
interExe.run(&interNet, {result[0]});
result = interExe.result();

auto token_idx = postProcessing_prefill(result[0], input, real_seq_length);
if (token_idx == 2) { // "</s>"
break;
}
// exit(0);

auto out_token = tokenizer.detokenize({token_idx});
std::cout << out_token << std::flush;

auto prefill_cpu_backend = dynamic_cast<CPUBackend *>(npuNet.backends()[MLLM_CPU].get());
auto inter_cpu_backend = dynamic_cast<CPUBackend *>(interNet.backends()[MLLM_CPU].get());
auto decode_cpu_backend = dynamic_cast<CPUBackend *>(cpuNet.backends()[MLLM_CPU].get());
prefill_cpu_backend->setSequenceLength(real_seq_length);
prefill_cpu_backend->switchDecodeTag();
inter_cpu_backend->setSequenceLength(real_seq_length);
inter_cpu_backend->switchDecodeTag();
decode_cpu_backend->setSequenceLength(real_seq_length);
decode_cpu_backend->switchDecodeTag();

// // 2: Decoding stage using CPU execute
for (int step = real_seq_length; step < real_seq_length + 100; step++) {
cpuExe.run(&cpuNet, {input});
auto result = cpuExe.result();

auto token_idx = postProcessing(result[0], input);
if (token_idx == 2) { // "</s>"
break;
}

auto out_token = tokenizer.detokenize({token_idx});
std::cout << out_token << std::flush;

if (step == real_seq_length) {
prefill_cpu_backend->switchDecodeTag();
inter_cpu_backend->switchDecodeTag();
decode_cpu_backend->switchDecodeTag();
}
}
} while (false);
printf("\n");
}

std::cout << "====================" << std::endl;
npuExe.perf();
cpuExe.perf();

// free memory
// for (auto *op : npu_ctx->net_ops) {
// delete op;
// }
// for (auto *tensor : npu_ctx->net_tensors) {
// delete tensor;
// }

return 0;
}
#endif