Merge branch 'main' into phi3v
yirongjie authored Nov 15, 2024
2 parents f6c64f1 + 490fdb4 commit a06656b
Showing 46 changed files with 422 additions and 208 deletions.
34 changes: 11 additions & 23 deletions examples/CMakeLists.txt
@@ -1,24 +1,22 @@
macro(func_link_libaries target)
target_link_libraries(${target} PUBLIC fmt::fmt-header-only)
if (MLLM_OPENMP_STATIC)
target_link_libraries(${target} PUBLIC MLLM_CPU fmt::fmt-header-only)
if (MLLM_OPENMP)
target_compile_options(${target} PRIVATE -fopenmp)
target_link_libraries(${target} PUBLIC MLLM_CPU -fopenmp -static-openmp)
else ()
target_compile_options(${target} PRIVATE -fopenmp)
target_link_libraries(${target} PUBLIC MLLM_CPU -fopenmp)
endif ()
if(QNN)
target_compile_definitions(${target} PRIVATE USE_QNN)
if (ARM)
target_link_libraries(${target} PUBLIC MLLM_CPU MLLM_QNN ${CMAKE_DL_LIBS} -fopenmp -static-openmp)
target_link_libraries(${target} PUBLIC -fopenmp -static-openmp)
else ()
target_link_libraries(${target} PUBLIC -fopenmp)
endif ()
endif ()
if (QNN)
target_compile_definitions(${target} PRIVATE USE_QNN)
target_link_libraries(${target} PUBLIC MLLM_QNN ${CMAKE_DL_LIBS})
endif()
if (MLLM_BUILD_XNNPACK_BACKEND)
target_link_libraries(${target} PRIVATE MLLM_CPU MllmXnnpackBackend)
target_link_libraries(${target} PRIVATE MllmXnnpackBackend)
endif()
endmacro()


macro(func_llm_add_executable target)
add_executable(${target}
${PROJECT_SOURCE_DIR}/examples/${target}.cpp
@@ -56,11 +54,6 @@ macro(func_vlm_add_executable target)
func_link_libaries(${target})
endmacro()



## new demos

# if(NOT MLLM_BUILD_XNNPACK_BACKEND)
func_llm_add_executable(mllm_benchmark)
func_llm_add_executable(demo_llama)
func_llm_add_executable(demo_tinyllama)
@@ -89,9 +82,8 @@ func_vlm_add_executable(demo_imagebind)
func_vlm_add_executable(demo_imagebind_1mod)
func_vlm_add_executable(demo_phi3v)
# func_vlm_add_executable(demo)
# endif()

# QNN demo

if(QNN)
func_llm_add_executable(demo_qwen_npu)
func_llm_add_executable(main_qwen_npu)
@@ -103,10 +95,6 @@ endif()

if(MLLM_BUILD_XNNPACK_BACKEND)
func_llm_add_executable(demo_qwen_xp)
# func_llm_add_executable(demo_qwenqkvmm)
# func_llm_add_executable(demo_qwenqkvmm_cpu)
# func_llm_add_executable(demo_qdint8_gemm)
# func_llm_add_executable(demo_fp32gemmperf)
endif()


93 changes: 63 additions & 30 deletions examples/demo_phonelm_npu.cpp
@@ -1,4 +1,6 @@
#ifdef USE_QNN
#include "Module.hpp"
#include "Types.hpp"
#include <memory>
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/phonelm/modeling_phonelm.hpp"
@@ -12,16 +14,18 @@ int main(int argc, char **argv) {
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/phonelm_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/phonelm_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/PhoneLM-1.5B-Instruct-128.mllm");
cmdParser.add<string>("decoding", 'd', "specify mllm decoding model path", false, "../models/phonelm-1.5b-droidcall-q4_0_4_4.mllm");
cmdParser.add<string>("decoding", 'd', "specify mllm decoding model path", false, "../models/phonelm-1.5b-instruct-q4_0_4_4.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.add<int>("chunk", 'c', "chunk size", false, 64);
cmdParser.parse_check(argc, argv);

string vocab_path = cmdParser.get<string>("vocab");
string merge_path = cmdParser.get<string>("merge");
string model_path = cmdParser.get<string>("model");
string decoding_path = cmdParser.get<string>("decoding");
int tokens_limit = cmdParser.get<int>("limits");
int chunk_size = cmdParser.get<int>("chunk");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto tokenizer = SmolLMTokenizer(vocab_path, merge_path);
@@ -33,35 +37,64 @@ int main(int argc, char **argv) {

vector<string> in_strs = {
"Give me a short introduction to large language model.",
"What is the Beijing University of Posts and Telecommunications.",
"What is the meaning of life?",
"Hello, who are you?",
"What can you do?",
"Please introduce Beijing University of Posts and Telecommunications.",
};

// turn on the multi-chunk prefilling
Module::isMultiChunkPrefilling = true;

for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
auto [real_seq_length, input_tensor] = tokenizer.tokenizeWithPadding(input_str, 64, config.vocab_size);
std::cout << real_seq_length << endl;
std::cout << input_str << std::endl;
auto [real_seq_length, input_tensor] = tokenizer.tokenizePaddingByChunk(input_str, chunk_size, config.vocab_size);
const int seq_length_padding = (chunk_size - real_seq_length % chunk_size) + real_seq_length;
const int chunk_num = seq_length_padding / chunk_size;
bool isSwitched = false;
// std::cout << "real seq length: " << real_seq_length << " padding to: " << seq_length_padding << " chunk num: " << chunk_num << std::endl;
std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;

// tensor vectors to save the chunked tensors of the QNN prefilling input
vector<Tensor> chunked_tensors(chunk_num);
LlmTextGeneratorOpts opt{
.max_new_tokens = 1,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
.top_p = 0.f,
.is_padding = true,
.seq_before_padding = real_seq_length,
.chunk_size = chunk_size,
};
model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
auto out_string = tokenizer.detokenize({out_token});
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
std::cout << output_string << std::flush;
return true;
});

for (int chunk_id = 0; chunk_id < chunk_num; ++chunk_id) {
chunked_tensors[chunk_id].setBackend(Backend::global_backends[MLLM_CPU]);
chunked_tensors[chunk_id].setTtype(INPUT_TENSOR);
chunked_tensors[chunk_id].reshape(1, 1, chunk_size, 1);
chunked_tensors[chunk_id].setName("input-chunk-" + to_string(chunk_id));
chunked_tensors[chunk_id].deepCopyFrom(&input_tensor, false, {0, 0, chunk_id * chunk_size, 0});

model.generate(chunked_tensors[chunk_id], opt, [&](unsigned int out_token) -> bool {
if (i != 0 && !isSwitched && chunk_id == 0) {
// turn off switching at the first chunk of following inputs
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
if (chunk_id == chunk_num - 1) { // print the output of the last chunk
std::cout << output_string << std::flush;
}
return true;
});
Module::isFirstChunk = false;
}

// turn on switching, set sequence length and execution type
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(AUTOREGRESSIVE);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();

LlmTextGeneratorOpts decoding_opt{
.max_new_tokens = 100,
@@ -71,23 +104,23 @@ int main(int argc, char **argv) {
.top_p = 0.f,
.is_padding = false,
};
bool isSwitched = false;
decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool {
// call only once of switchDecodeTag
if (!isSwitched) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
isSwitched = false;
decoding_model.generate(chunked_tensors.back(), decoding_opt, [&](unsigned int out_token) -> bool {
if (!isSwitched) { // turn off switching
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
auto [isOk, print_string] = tokenizer.postprocess(out_string);
if (isOk) {
std::cout << print_string << std::flush;
} else {
return false;
}
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
std::cout << output_string << std::flush;
return true;
});
std::cout << "\n---------------" << std::endl;

// turn on switching, set sequence length and execution type
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(0);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setExecutionType(PROMPT);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
std::cout << "\n";
}
}
#endif
}
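The rewritten demo above pads the prompt up to a whole number of chunks before QNN prefilling, then deep-copies one chunk-sized slice of the input tensor per iteration. Below is a minimal standalone sketch of just the padding arithmetic; the variable names mirror the diff, but the program itself is illustrative and not part of the commit. One property worth seeing concretely: a prompt whose length is already a multiple of chunk_size still gains one whole extra padded chunk.

```cpp
#include <initializer_list>
#include <iostream>

int main() {
    const int chunk_size = 64; // default of the --chunk flag above
    for (int real_seq_length : {1, 63, 64, 65, 200}) {
        // Same expression as in the demo. Note: a length that is already a
        // multiple of chunk_size is still padded by one whole extra chunk.
        const int seq_length_padding =
            (chunk_size - real_seq_length % chunk_size) + real_seq_length;
        const int chunk_num = seq_length_padding / chunk_size;
        std::cout << real_seq_length << " -> padded to " << seq_length_padding
                  << " (" << chunk_num << " chunk(s))\n";
    }
}
```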
8 changes: 3 additions & 5 deletions examples/demo_qwen2.5_npu.cpp
@@ -1,4 +1,3 @@
#ifdef USE_QNN
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/qwen/configuration_qwen.hpp"
@@ -61,7 +60,7 @@ int main(int argc, char **argv) {
});

static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();

LlmTextGeneratorOpts decoding_opt{
.max_new_tokens = 100,
@@ -75,7 +74,7 @@ int main(int argc, char **argv) {
decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool {
// call only once of switchDecodeTag
if (!isSwitched) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
@@ -89,5 +88,4 @@ int main(int argc, char **argv) {
});
std::cout << "\n---------------" << std::endl;
}
}
#endif
}
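All of these demos stream output through the same callback contract: generate() hands each new token id to a lambda, and a false return stops decoding (the demos return false once the tokenizer's postprocess() reports end-of-text). The stub below is a hypothetical, self-contained mock of that shape; mock_generate and the token values are invented for illustration, and the real signatures live in mllm.

```cpp
#include <functional>
#include <iostream>
#include <vector>

// Hypothetical stand-in for Module::generate(): feeds each token id to the
// callback and stops as soon as the callback returns false, which is how the
// demos stop at end-of-text.
void mock_generate(const std::vector<unsigned int> &tokens,
                   const std::function<bool(unsigned int)> &on_token) {
    for (unsigned int t : tokens) {
        if (!on_token(t)) break;
    }
}

int main() {
    mock_generate({10, 11, 12, 0}, [](unsigned int tok) -> bool {
        if (tok == 0) return false; // pretend id 0 is the end-of-text token
        std::cout << "token " << tok << std::endl;
        return true;
    });
}
```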
8 changes: 3 additions & 5 deletions examples/demo_qwen_npu.cpp
@@ -1,4 +1,3 @@
#ifdef USE_QNN
#include "backends/cpu/CPUBackend.hpp"
#include "cmdline.h"
#include "models/qwen/configuration_qwen.hpp"
@@ -61,7 +60,7 @@ int main(int argc, char **argv) {
});

static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->setSequenceLength(real_seq_length);
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();

LlmTextGeneratorOpts decoding_opt{
.max_new_tokens = 100,
@@ -75,7 +74,7 @@ int main(int argc, char **argv) {
decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool {
// call only once of switchDecodeTag
if (!isSwitched) {
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->switchDecodeTag();
static_cast<CPUBackend *>(Backend::global_backends[MLLM_CPU])->toggleSwitching();
isSwitched = true;
}
auto out_string = tokenizer.detokenize({out_token});
@@ -89,5 +88,4 @@ int main(int argc, char **argv) {
});
std::cout << "\n---------------" << std::endl;
}
}
#endif
}
12 changes: 6 additions & 6 deletions examples/main_phonelm_npu.cpp
@@ -190,11 +190,11 @@ int main(int argc, char **argv) {
auto inter_cpu_backend = dynamic_cast<CPUBackend *>(interNet.backends()[MLLM_CPU].get());
auto decode_cpu_backend = dynamic_cast<CPUBackend *>(cpuNet.backends()[MLLM_CPU].get());
prefill_cpu_backend->setSequenceLength(real_seq_length);
prefill_cpu_backend->switchDecodeTag();
prefill_cpu_backend->toggleSwitching();
inter_cpu_backend->setSequenceLength(real_seq_length);
inter_cpu_backend->switchDecodeTag();
inter_cpu_backend->toggleSwitching();
decode_cpu_backend->setSequenceLength(real_seq_length);
decode_cpu_backend->switchDecodeTag();
decode_cpu_backend->toggleSwitching();

// // 2: Decoding stage using CPU execute
for (int step = real_seq_length; step < real_seq_length + 100; step++) {
@@ -210,9 +210,9 @@
std::cout << out_token << std::flush;

if (step == real_seq_length) {
prefill_cpu_backend->switchDecodeTag();
inter_cpu_backend->switchDecodeTag();
decode_cpu_backend->switchDecodeTag();
prefill_cpu_backend->toggleSwitching();
inter_cpu_backend->toggleSwitching();
decode_cpu_backend->toggleSwitching();
}
}
} while (false);
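The prefill-to-decode handover in main_phonelm_npu.cpp (and in main_qwen_npu.cpp below) always follows the same order after the rename: give each CPU backend the decode start position, toggle switching once after prefill, then toggle once more on the first decode step. Here is a compilable mock of just that call order; MockCPUBackend is invented for illustration, the real class being mllm's CPUBackend.

```cpp
#include <initializer_list>
#include <iostream>

// Stand-in that only records the call order of the handover.
struct MockCPUBackend {
    void setSequenceLength(int n) { std::cout << "seq_length=" << n << "\n"; }
    void toggleSwitching() { std::cout << "toggle switching\n"; }
};

int main() {
    MockCPUBackend prefill, inter, decode;
    const int real_seq_length = 128; // hypothetical prompt length

    // After QNN prefill: set the decode start position on every CPU backend
    // and toggle switching once (mirrors the first hunk above).
    for (MockCPUBackend *b : {&prefill, &inter, &decode}) {
        b->setSequenceLength(real_seq_length);
        b->toggleSwitching();
    }

    // On the first decode step (step == real_seq_length) the demos toggle
    // switching once more on all three backends.
    for (MockCPUBackend *b : {&prefill, &inter, &decode}) {
        b->toggleSwitching();
    }
}
```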
12 changes: 6 additions & 6 deletions examples/main_qwen_npu.cpp
@@ -187,11 +187,11 @@ int main(int argc, char **argv) {
auto inter_cpu_backend = dynamic_cast<CPUBackend *>(interNet.backends()[MLLM_CPU].get());
auto decode_cpu_backend = dynamic_cast<CPUBackend *>(cpuNet.backends()[MLLM_CPU].get());
prefill_cpu_backend->setSequenceLength(real_seq_length);
prefill_cpu_backend->switchDecodeTag();
prefill_cpu_backend->toggleSwitching();
inter_cpu_backend->setSequenceLength(real_seq_length);
inter_cpu_backend->switchDecodeTag();
inter_cpu_backend->toggleSwitching();
decode_cpu_backend->setSequenceLength(real_seq_length);
decode_cpu_backend->switchDecodeTag();
decode_cpu_backend->toggleSwitching();

// // 2: Decoding stage using CPU execute
for (int step = real_seq_length; step < real_seq_length + 100; step++) {
@@ -207,9 +207,9 @@
std::cout << out_token << std::flush;

if (step == real_seq_length) {
prefill_cpu_backend->switchDecodeTag();
inter_cpu_backend->switchDecodeTag();
decode_cpu_backend->switchDecodeTag();
prefill_cpu_backend->toggleSwitching();
inter_cpu_backend->toggleSwitching();
decode_cpu_backend->toggleSwitching();
}
}
} while (false);
3 changes: 2 additions & 1 deletion include/Types.hpp
@@ -36,6 +36,7 @@ enum TensorStatus {
// TENSOR_DYNAMIC,
TENSOR_STATIC_INIT,
TENSOR_STATIC_READY,
TENSOR_UNDEFINED,
};

enum ErrorCode {
@@ -141,7 +142,7 @@ enum RoPEType {
MLAROPE = 5,
};

enum QNNExecutionType {
enum ExecutionType {
PROMPT = 0,
AUTOREGRESSIVE = 1,
};
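The rename from QNNExecutionType to ExecutionType matches how the demos above use it: the phonelm demo switches the CPU backend to AUTOREGRESSIVE before decoding and back to PROMPT when a round finishes, so the prompt/decode distinction is not tied to the QNN backend alone. A standalone sketch follows, with the enum values copied verbatim from the hunk; phase_name is a hypothetical helper added only to make the example runnable and is not part of mllm.

```cpp
#include <cstdio>

// Values copied verbatim from the renamed enum in include/Types.hpp.
enum ExecutionType {
    PROMPT = 0,
    AUTOREGRESSIVE = 1,
};

// Hypothetical helper for illustration only.
const char *phase_name(ExecutionType t) {
    return t == PROMPT ? "prefill (PROMPT)" : "decode (AUTOREGRESSIVE)";
}

int main() {
    std::printf("%s\n", phase_name(PROMPT));
    std::printf("%s\n", phase_name(AUTOREGRESSIVE));
}
```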
2 changes: 1 addition & 1 deletion scripts/build.sh
@@ -4,4 +4,4 @@ cd ../build || exit

cmake .. -DCMAKE_BUILD_TYPE=Release

make -j4
make -j$(nproc)
2 changes: 1 addition & 1 deletion scripts/build_android.sh
@@ -13,4 +13,4 @@ cmake .. \
-DARM=ON \
-DAPK=OFF

make -j4
make -j$(nproc)
6 changes: 3 additions & 3 deletions scripts/build_android_app.sh
@@ -1,6 +1,6 @@
#!/bin/bash
mkdir ../build-arm
cd ../build-arm || exit
mkdir ../build-arm-app
cd ../build-arm-app || exit

# 1. build mllm_lib
cmake .. \
@@ -19,7 +19,7 @@ cmake .. \
-DMLLM_BUILD_XNNPACK_BACKEND=OFF


make mllm_lib -j16
make mllm_lib -j$(nproc)

# 2. copy libs
cp ./libmllm_lib.a ../android/app/src/main/cpp/libs/
6 changes: 3 additions & 3 deletions scripts/build_qnn_android.sh → scripts/build_android_qnn.sh
@@ -1,6 +1,6 @@
#!/bin/bash
mkdir ../build-arm
cd ../build-arm || exit
mkdir ../build-arm-qnn
cd ../build-arm-qnn || exit

cmake .. \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
@@ -16,4 +16,4 @@ cmake .. \
-DQNN_VALIDATE_NODE=ON \
-DMLLM_BUILD_XNNPACK_BACKEND=OFF

make -j16
make -j$(nproc)