Merge pull request #152 from chenghuaWang/main
Xnnpack backend support
yirongjie authored Oct 9, 2024
2 parents b697537 + d9c9e56 commit d1359b2
Showing 36 changed files with 100,350 additions and 22 deletions.
3 changes: 2 additions & 1 deletion .clang-tidy
@@ -36,7 +36,8 @@ Checks: >
-readability-non-const-parameter,
# Turn all the warnings from the checks above into errors.
WarningsAsErrors: "*"
# WarningsAsErrors: "*"

HeaderFilterRegex: ".*\\.hpp$&!gguf\\.hpp"
CheckOptions:
- key: readability-identifier-naming.NamespaceCase
9 changes: 9 additions & 0 deletions .gitmodules
@@ -4,3 +4,12 @@
[submodule "third_party/pybind11"]
path = third_party/pybind11
url = https://github.com/pybind/pybind11.git
[submodule "src/backends/xnnpack/third_party/XNNPACK"]
path = src/backends/xnnpack/third_party/XNNPACK
url = https://github.com/google/XNNPACK.git
[submodule "third_party/googletest"]
path = third_party/googletest
url = https://github.com/google/googletest.git
[submodule "src/backends/xnnpack/third_party/fmt"]
path = src/backends/xnnpack/third_party/fmt
url = https://github.com/fmtlib/fmt.git
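Note: XNNPACK, googletest, and fmt are vendored as git submodules, so a fresh checkout needs them fetched (for example with git submodule update --init --recursive) before the new backend can build.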
17 changes: 10 additions & 7 deletions CMakeLists.txt
@@ -73,11 +73,9 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
cmake_policy(SET CMP0135 NEW)
endif ()

include(FetchContent)
FetchContent_Declare(
googletest
URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip
)
# for XNNPACK, avoid including googletest twice.
set(GOOGLETEST_SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/third_party/googletest)
add_subdirectory(third_party/googletest EXCLUDE_FROM_ALL)

option(MLLM_OPENMP "openmp" ON)
option(MLLM_OPENMP_STATIC "openmp static" OFF)
@@ -123,6 +121,13 @@ if(QNN)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/qnn)
endif()

option(MLLM_BUILD_XNNPACK_BACKEND "Build mllm's XNNPACK backend" OFF)
if(MLLM_BUILD_XNNPACK_BACKEND)
add_compile_options(-fPIC)
add_definitions(-DMLLM_BUILD_XNNPACK_BACKEND)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/xnnpack)
endif()

#add tokenizers
file(GLOB_RECURSE SRC_TOKENIZERS
${PROJECT_SOURCE_DIR}/src/tokenizers/*.cpp
@@ -156,8 +161,6 @@ if (QUANT)
${CMAKE_CURRENT_LIST_DIR}/src/quantizer/*.hpp)
list(REMOVE_ITEM MLLM_QUANTIZER ${CMAKE_CURRENT_LIST_DIR}/src/quantizer/main.cpp)

message(STATUS "MLLM_Quant: ${MLLM_QUANT}")

add_executable(
quantize
${PROJECT_SOURCE_DIR}/src/quantizer/main.cpp
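The XNNPACK backend stays opt-in: MLLM_BUILD_XNNPACK_BACKEND defaults to OFF, and enabling it adds -fPIC, defines MLLM_BUILD_XNNPACK_BACKEND, and pulls in src/backends/xnnpack. A configure step would pass something like -DMLLM_BUILD_XNNPACK_BACKEND=ON (standard CMake option syntax); builds without the flag are unchanged.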
80 changes: 80 additions & 0 deletions examples/CMakeLists.txt
@@ -508,6 +508,86 @@ endif ()

endif() # end of cpu executables

add_executable(demo_smollm
${PROJECT_SOURCE_DIR}/examples/demo_smollm.cpp
${DIR_SRC_CPU}
${DIR_SRC_MEM_MANAGER}
${DIR_SRC_EXP}
${DIR_SRC}
${PROJECT_SOURCE_DIR}/src/tokenizers/Tokenizer.cpp
${PROJECT_SOURCE_DIR}/src/tokenizers/BPE/Bpe.cpp
${PROJECT_SOURCE_DIR}/src/tokenizers/Unicode.cpp
${PROJECT_SOURCE_DIR}/src/tokenizers/UnicodeData.cpp
${PROJECT_SOURCE_DIR}/src/processor/PreProcess.cpp
)
if (MLLM_OPENMP_STATIC)
target_compile_options(demo_smollm PRIVATE -fopenmp)
target_link_libraries(demo_smollm PUBLIC MLLM_CPU -fopenmp -static-openmp)
else ()
target_compile_options(demo_smollm PRIVATE -fopenmp)
target_link_libraries(demo_smollm PUBLIC MLLM_CPU -fopenmp)
endif ()

# add_executable(demo_openelm
# ${PROJECT_SOURCE_DIR}/examples/demo_openelm.cpp
# ${DIR_SRC_CPU}
# ${DIR_SRC_MEM_MANAGER}
# ${DIR_SRC_EXP}
# ${DIR_SRC}
# ${PROJECT_SOURCE_DIR}/src/tokenizers/Tokenizer.cpp
# ${PROJECT_SOURCE_DIR}/src/tokenizers/BPE/Bpe.cpp
# ${PROJECT_SOURCE_DIR}/src/tokenizers/Unicode.cpp
# ${PROJECT_SOURCE_DIR}/src/tokenizers/UnicodeData.cpp
# ${PROJECT_SOURCE_DIR}/src/processor/PreProcess.cpp
# )
# if (MLLM_OPENMP_STATIC)
# target_compile_options(demo_openelm PRIVATE -fopenmp)
# target_link_libraries(demo_openelm PUBLIC MLLM_CPU -fopenmp -static-openmp)
# else ()
# target_compile_options(demo_openelm PRIVATE -fopenmp)
# target_link_libraries(demo_openelm PUBLIC MLLM_CPU -fopenmp)
# endif ()

# add_executable(demo_dclm
# ${PROJECT_SOURCE_DIR}/examples/demo_dclm.cpp
# ${DIR_SRC_CPU}
# ${DIR_SRC_MEM_MANAGER}
# ${DIR_SRC_EXP}
# ${DIR_SRC}
# ${PROJECT_SOURCE_DIR}/src/tokenizers/Tokenizer.cpp
# ${PROJECT_SOURCE_DIR}/src/tokenizers/BPE/Bpe.cpp
# ${PROJECT_SOURCE_DIR}/src/tokenizers/Unicode.cpp
# ${PROJECT_SOURCE_DIR}/src/tokenizers/UnicodeData.cpp
# ${PROJECT_SOURCE_DIR}/src/processor/PreProcess.cpp
# )
# if (MLLM_OPENMP_STATIC)
# target_compile_options(demo_dclm PRIVATE -fopenmp)
# target_link_libraries(demo_dclm PUBLIC MLLM_CPU -fopenmp -static-openmp)
# else ()
# target_compile_options(demo_dclm PRIVATE -fopenmp)
# target_link_libraries(demo_dclm PUBLIC MLLM_CPU -fopenmp)
# endif ()

add_executable(benchmark_llm
${PROJECT_SOURCE_DIR}/examples/benchmark.cpp
${DIR_SRC_CPU}
${DIR_SRC_MEM_MANAGER}
${DIR_SRC_EXP}
${DIR_SRC}
${PROJECT_SOURCE_DIR}/src/tokenizers/Tokenizer.cpp
${PROJECT_SOURCE_DIR}/src/tokenizers/BPE/Bpe.cpp
${PROJECT_SOURCE_DIR}/src/tokenizers/Unicode.cpp
${PROJECT_SOURCE_DIR}/src/tokenizers/UnicodeData.cpp
${PROJECT_SOURCE_DIR}/src/processor/PreProcess.cpp
)
if (MLLM_OPENMP_STATIC)
target_compile_options(benchmark_llm PRIVATE -fopenmp)
target_link_libraries(benchmark_llm PUBLIC MLLM_CPU -fopenmp -static-openmp)
else ()
target_compile_options(benchmark_llm PRIVATE -fopenmp)
target_link_libraries(benchmark_llm PUBLIC MLLM_CPU -fopenmp)
endif ()

# QNN demo
if(QNN)
# qnn executables
196 changes: 196 additions & 0 deletions examples/benchmark.cpp
@@ -0,0 +1,196 @@
#include <iostream>
#include "Types.hpp"
#include "cmdline.h"

// tiny llama
#include "models/tinyllama/modeling_tinyllama.hpp"
#include "models/tinyllama/configuration_tinyllama.hpp"

// gemma
#include "models/gemma/modeling_gemma.hpp"
#include "models/gemma/configuration_gemma.hpp"

// qwen
#include "models/qwen/modeling_qwen.hpp"
#include "models/qwen/configuration_qwen.hpp"

// stable llm
#include "models/stablelm/configuration_stablelm.hpp"
#include "models/stablelm/modeling_stablelm.hpp"

// opt
#include "models/opt/configuration_opt.hpp"
#include "models/opt/modeling_opt.hpp"

// mini cpm
#include "models/minicpm/configuration_minicpm.hpp"
#include "models/minicpm/modeling_minicpm.hpp"

// smollm
#include "models/smollm/configuration_smollm.hpp"
#include "models/smollm/modeling_smollm.hpp"

// qwen2.5
// #include "models/qwen2_5/configuration_qwen2_5.hpp"
// #include "models/qwen2_5/modeling_qwen2_5.hpp"

#include "processor/PostProcess.hpp"

using namespace mllm;

Tensor tokens2Input(int tokens_size, string name = "input", BackendType type = MLLM_CPU) {
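// Build a dummy input tensor of shape [1, 1, tokens_size, 1] on the selected backend
// and fill every position with token id 0; the benchmark only measures speed, so real
// token ids are not needed.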
Tensor tensor1(1, 1, tokens_size, 1, Backend::global_backends[type], true);
tensor1.setName(name);
Tensor::tensor_status = TENSOR_STATIC_INIT;
tensor1.setTtype(INPUT_TENSOR);
for (int idx = 0; idx < tokens_size; ++idx) {
tensor1.setDataAt<float>(0, 0, idx, 0, 0);
}
return tensor1;
}

int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<std::string>("model_name", 'n', "the name of model", false);
cmdParser.add<int>("input_size", 'i', "input size", false, 64);
cmdParser.add<int>("loop", 'p', "loop", false, 100);
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);

auto model_name = cmdParser.get<std::string>("model_name");
int input_size = cmdParser.get<int>("input_size");
int loop = cmdParser.get<int>("loop");
int tokens_limit = cmdParser.get<int>("limits");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");
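// Each branch below builds the requested config and model, selects MLLM_TYPE_Q4_0_4_4
// weights via setNoLoadWeightsDtype, runs loop + 1 forward passes over the zero-filled
// input, and prints per-op timings with model.profiling().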

if (model_name == "tinyllama-1.1B") {
TinyLLaMAConfig config(tokens_limit, "1.1B", HFHUBROPE);
auto model = TinyLLaMAModel(config);
model.setNoLoadWeightsDtype(MLLM_TYPE_Q4_0_4_4);

auto input_tensor = tokens2Input(input_size);
for (int step = 0; step < loop + 1; step++) {
auto result = model({input_tensor});
chatPostProcessing(0, input_tensor, {});
}
model.profiling();
} else if (model_name == "gemma-2B") {
GemmaConfig config(tokens_limit, "2B", RoPEType::HFHUBROPE);
auto model = GemmaForCausalLM(config);
model.setNoLoadWeightsDtype(MLLM_TYPE_Q4_0_4_4);

auto input_tensor = tokens2Input(input_size);
for (int step = 0; step < loop + 1; step++) {
auto result = model({input_tensor});
chatPostProcessing(0, input_tensor, {});
}
model.profiling();
} else if (model_name == "qwen-0.5B") {
QWenConfig config(tokens_limit, "0.5B", RoPEType::HFHUBROPE);
auto model = QWenForCausalLM(config);
model.setNoLoadWeightsDtype(MLLM_TYPE_Q4_0_4_4);

auto input_tensor = tokens2Input(input_size);
for (int step = 0; step < loop + 1; step++) {
auto result = model({input_tensor});
chatPostProcessing(0, input_tensor, {});
}
model.profiling();
} else if (model_name == "qwen-1.8B") {
QWenConfig config(tokens_limit, "1.8B", RoPEType::HFHUBROPE);
auto model = QWenForCausalLM(config);
model.setNoLoadWeightsDtype(MLLM_TYPE_Q4_0_4_4);

auto input_tensor = tokens2Input(input_size);
for (int step = 0; step < loop + 1; step++) {
auto result = model({input_tensor});
chatPostProcessing(0, input_tensor, {});
}
model.profiling();
} else if (model_name == "stablelm-1.6B") {
StableLMConfig config(tokens_limit, "1.6B", HFHUBROPE);
auto model = StableLMModel(config);
model.setNoLoadWeightsDtype(MLLM_TYPE_Q4_0_4_4);

auto input_tensor = tokens2Input(input_size);
for (int step = 0; step < loop + 1; step++) {
auto result = model({input_tensor});
chatPostProcessing(0, input_tensor, {});
}
model.profiling();
} else if (model_name == "opt-1.3B") {
OPTConfig config(tokens_limit, "1.3B");
auto model = OPTModel(config);
model.setNoLoadWeightsDtype(MLLM_TYPE_Q4_0_4_4);

auto input_tensor = tokens2Input(input_size);
for (int step = 0; step < loop + 1; step++) {
auto result = model({input_tensor});
chatPostProcessing(0, input_tensor, {});
}
model.profiling();
} else if (model_name == "minicpm-2B") {
MiniCPMConfig config(tokens_limit, "2B");
auto model = MiniCPMForCausalLM(config);
model.setNoLoadWeightsDtype(MLLM_TYPE_Q4_0_4_4);

auto input_tensor = tokens2Input(input_size);
for (int step = 0; step < loop + 1; step++) {
auto result = model({input_tensor});
chatPostProcessing(0, input_tensor, {});
}
model.profiling();
} else if (model_name == "smollm-360M") {
SmoLlmConfig config(tokens_limit, "360M", RoPEType::HFHUBROPE, 49152);
auto model = SmoLlmModel(config);
model.setNoLoadWeightsDtype(MLLM_TYPE_Q4_0_4_4);

auto input_tensor = tokens2Input(input_size);
for (int step = 0; step < loop + 1; step++) {
auto result = model({input_tensor});
chatPostProcessing(0, input_tensor, {});
}
model.profiling();
} else if (model_name == "smollm-1.7B") {
SmoLlmConfig config(tokens_limit, "1.7B", RoPEType::HFHUBROPE, 49152);
auto model = SmoLlmModel(config);
model.setNoLoadWeightsDtype(MLLM_TYPE_Q4_0_4_4);

auto input_tensor = tokens2Input(input_size);
for (int step = 0; step < loop + 1; step++) {
auto result = model({input_tensor});
chatPostProcessing(0, input_tensor, {});
}
model.profiling();
} else if (model_name == "dclm-1B") {
// TODO
} else if (model_name == "openelm-1.1B") {
// TODO
} else if (model_name == "openelm-450M") {
// TODO
} else if (model_name == "qwen2.5-0.5B") {
// QWen2_5Config config(tokens_limit, "0.5B", RoPEType::HFHUBROPE);
// auto model = QWen2_5ForCausalLM(config);
// model.setNoLoadWeightsDtype(MLLM_TYPE_Q4_0_4_4);

// auto input_tensor = tokens2Input(input_size);
// for (int step = 0; step < loop + 1; step++) {
// auto result = model({input_tensor});
// chatPostProcessing(0, input_tensor, {});
// }
// model.profiling();
} else if (model_name == "qwen2.5-1.5B") {
// QWen2_5Config config(tokens_limit, "1.5B", RoPEType::HFHUBROPE);
// auto model = QWen2_5ForCausalLM(config);
// model.setNoLoadWeightsDtype(MLLM_TYPE_Q4_0_4_4);

// auto input_tensor = tokens2Input(input_size);
// for (int step = 0; step < loop + 1; step++) {
// auto result = model({input_tensor});
// chatPostProcessing(0, input_tensor, {});
// }
// model.profiling();
}
return 0;
}
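For reference, the options above come from the cmdline parser in main: -n selects the model, -i the zero-filled prompt length, -p the number of forward passes, -l the KV cache limit, and -t the CPU thread count, so a run would look something like ./benchmark_llm -n qwen-0.5B -i 64 -p 100 -l 400 -t 4 (binary name taken from examples/CMakeLists.txt above; the exact path depends on the build).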