feat: support PhoneLM-1.5B-Call demo in Android Demo #193

Merged 1 commit on Nov 16, 2024
examples/demo_phonelm_npu.cpp: 35 changes (29 additions & 6 deletions)
@@ -13,7 +13,7 @@ int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/phonelm_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/phonelm_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/PhoneLM-1.5B-Instruct-128.mllm");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/phonelm-1.5b-instruct-int8.mllm");
cmdParser.add<string>("decoding", 'd', "specify mllm decoding model path", false, "../models/phonelm-1.5b-instruct-q4_0_4_4.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
@@ -35,17 +35,39 @@ int main(int argc, char **argv) {
auto decoding_model = PhoneLMForCausalLM(config);
decoding_model.load(decoding_path);

// warmup START
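// a one-shot dummy generation; the assumption is that the first generate() call pays
// one-time backend setup cost (e.g. building the QNN graphs), so spending it on a
// single-space prompt keeps that cost out of the real requests below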
std::string input_str = " ";
auto [real_seq_length, input_tensor] = tokenizer.tokenizePaddingByChunk(input_str, chunk_size, config.vocab_size);
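// tokenizePaddingByChunk (added to tokenization_qwen.hpp in this PR) pads the token
// sequence to a multiple of chunk_size with vocab_size as the pad id and returns the
// unpadded length alongside the padded tensor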
LlmTextGeneratorOpts opt{
.max_new_tokens = 1,
.do_sample = false,
.is_padding = true,
.seq_before_padding = real_seq_length,
.chunk_size = chunk_size,
};
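// is_padding / seq_before_padding hand the true prompt length to the generator so the
// padded tail can be told apart from real tokens (assumed semantics of these options)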
model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
auto out_string = tokenizer.detokenize({out_token});
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
return true;
});
Module::isFirstChunk = false;
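// rewind the CPU backend to sequence position 0 and prefill (PROMPT) mode so the real
// inputs below start from a clean state after the warmup generation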
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(0);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
// turn on the multi-chunk prefilling
Module::isMultiChunkPrefilling = true;
// warmup END
std::cout << "Warmup finished." << std::endl;

vector<string> in_strs = {
-    "Give me a short introduction to large language model.",
-    "What is the Beijing University of Posts and Telecommunications.",
-    "What is the meaning of life?",
-    "\"Large Language Models (LLMs) are advanced artificial intelligence systems designed to understand and generate human-like text. These models are trained on vast amounts of data, enabling them to perform a wide range of tasks, from answering questions and summarizing text to generating creative content and engaging in conversational dialogue. LLMs like GPT-3 and GPT-4, developed by OpenAI, have set new benchmarks in natural language processing by leveraging deep learning architectures, particularly transformer models, which excel at capturing context and relationships within text. The scalability and versatility of LLMs make them invaluable tools for applications in education, customer service, content creation, and more. However, their deployment also raises ethical considerations, including issues of bias, misinformation, and the potential for misuse. As the field continues to evolve, ongoing research and responsible deployment strategies are essential to harnessing the full potential of these powerful AI systems while mitigating their risks.\"\nGenerate a title based on the above text."};
+    "Hello, who are you?",
+    "What can you do?",
+    "Please introduce Beijing University of Posts and Telecommunications.",
+};

-// turn on the multi-chunk prefilling
-Module::isMultiChunkPrefilling = true;

for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
@@ -75,7 +97,8 @@ int main(int argc, char **argv) {
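// each chunk is set up as a view into input_tensor at sequence offset
// chunk_id * chunk_size (assuming deepCopyFrom with copy=false aliases the
// underlying buffer rather than duplicating it)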
chunked_tensors[chunk_id].deepCopyFrom(&input_tensor, false, {0, 0, chunk_id * chunk_size, 0});

model.generate(chunked_tensors[chunk_id], opt, [&](unsigned int out_token) -> bool {
-if (i != 0 && !isSwitched && chunk_id == 0) {
+// if (i != 0 && !isSwitched && chunk_id == 0) {
+if (!isSwitched && chunk_id == 0) {
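// the i != 0 guard is dropped because the warmup pass above already ran a full
// generation, so the first real input (i == 0) also needs the decode-to-prefill switch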
// turn off switching at the first chunk of following inputs
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
examples/main_phonelm_npu.cpp: 2 changes (1 addition & 1 deletion)
@@ -67,7 +67,7 @@ int main(int argc, char **argv) {

cmdParser.parse_check(argc, argv);

-const string npu_model_path = "../models/PhoneLM-1.5B-Instruct-128.mllm";
+const string npu_model_path = "../models/phonelm-1.5b-instruct-int8.mllm";
const string cpu_model_path = "../models/phonelm-with-head-q4k.mllm";
const string merge_file_path = "../vocab/phonelm_merges.txt";

scripts/run_phonelm_qnn.sh: 6 changes (3 additions & 3 deletions)
@@ -6,10 +6,10 @@ adb shell mkdir -p /data/local/tmp/mllm/qnn-lib
adb push ../vocab/phonelm_vocab.mllm /data/local/tmp/mllm/vocab/


-if ! adb shell [ -f "/data/local/tmp/mllm/models/PhoneLM-1.5B-Instruct-128.mllm" ]; then
-adb push ../models/PhoneLM-1.5B-Instruct-128.mllm "/data/local/tmp/mllm/models/PhoneLM-1.5B-Instruct-128.mllm"
+if ! adb shell [ -f "/data/local/tmp/mllm/models/phonelm-1.5b-instruct-int8.mllm" ]; then
+adb push ../models/phonelm-1.5b-instruct-int8.mllm "/data/local/tmp/mllm/models/phonelm-1.5b-instruct-int8.mllm"
else
-echo "PhoneLM-1.5B-Instruct-128 file already exists"
+echo "phonelm-1.5b-instruct-int8 file already exists"
fi


src/models/qwen/tokenization_qwen.hpp: 51 changes (51 additions & 0 deletions)
@@ -226,6 +226,57 @@ class QWenTokenizer final : public BPETokenizer {
ret.resize(seqLength, vocab_size);
return std::make_pair(realLength, Tokenizer::tokens2Input(ret));
}
// pad the input to the nearest multiple of chunk_size
std::pair<int, Tensor> tokenizePaddingByChunk(std::string &text, int chunk_size, int vocab_size) {
std::vector<token_id_t> ret;

if (split_special_tokens_) {
const auto word_collection = unicode_regex_split(text, FIXED_PAT_STRS);
for (auto &piece : word_collection) {
// look up table
// std::string token;
// for (auto b : UTF8(piece)) token += byte_encoder_[b];

// using bpe
std::vector<token_id_t> tmp;
BPETokenizer::tokenize(piece, tmp, false, true, "");
ret.insert(ret.end(), tmp.begin(), tmp.end() - 1);
}
} else {
auto parts = _splitWithDelimiters(text, special_tokens);
// for (auto p : parts) {
// std::cout << "\"" << p << "\"" << std::endl;
// }
for (auto &p : parts) {
if (std::find(special_tokens.begin(), special_tokens.end(), p) != special_tokens.end()) {
std::string token;
for (auto b : UTF8(p)) token += byte_encoder_[b];

std::vector<token_id_t> tmp;
BPETokenizer::tokenize(token, tmp, false, special_tokens, true);
ret.insert(ret.end(), tmp.begin(), tmp.end() - 1);
} else {
const auto word_collection = unicode_regex_split(p, FIXED_PAT_STRS);
for (auto &piece : word_collection) {
// look up table
// std::string token;
// for (auto b : UTF8(piece)) token += byte_encoder_[b];

// using bpe
std::vector<token_id_t> tmp;
BPETokenizer::tokenize(piece, tmp, false, true, "");
assert(!tmp.empty());
ret.insert(ret.end(), tmp.begin(), tmp.end() - 1);
}
}
}
}

auto realLength = ret.size();
int paddingLength = (chunk_size - realLength % chunk_size) % chunk_size;
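// e.g. with chunk_size = 64: realLength = 70 gives paddingLength = 58 (padded to 128),
// while realLength = 128 gives paddingLength = 0 (the outer % avoids adding a full chunk)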
ret.resize(realLength + paddingLength, vocab_size);
return std::make_pair(realLength, Tokenizer::tokens2Input(ret));
}
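// a minimal usage sketch (hypothetical values; in examples/demo_phonelm_npu.cpp the
// chunk size comes from the command line):
//   std::string prompt = "Hello, who are you?";
//   auto [realLen, padded] = tokenizer.tokenizePaddingByChunk(prompt, 64, config.vocab_size);
//   // padded now holds realLen real ids followed by pad ids (== vocab_size),
//   // and its sequence length is a multiple of 64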

std::string _byte_decode_(const std::string &text) {
std::string ret;