From 19e0d427099f61cf9d9eacb7cb1d22574cb91e41 Mon Sep 17 00:00:00 2001
From: chenghuaWang <2923277184@qq.com>
Date: Wed, 6 Nov 2024 04:26:50 +0000
Subject: [PATCH 1/3] fix: boost xp kvcache and add some benchmarks

---
 examples/CMakeLists.txt           |   3 +
 examples/demo_qdint8_gemm.cpp     | 137 +++++++++++++++++++++++++++++
 examples/demo_qwenqkvmm.cpp       | 141 ++++++++++++++++++++++++++++++
 examples/demo_qwenqkvmm_cpu.cpp   | 105 ++++++++++++++++++++++
 src/backends/cpu/CPUKVCacheXp.cpp |   5 +-
 5 files changed, 389 insertions(+), 2 deletions(-)
 create mode 100644 examples/demo_qdint8_gemm.cpp
 create mode 100644 examples/demo_qwenqkvmm.cpp
 create mode 100644 examples/demo_qwenqkvmm_cpu.cpp

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 79f6454c..39296382 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -97,6 +97,9 @@ endif()
 
 if(MLLM_BUILD_XNNPACK_BACKEND)
     func_llm_add_executable(demo_qwen_xp)
+    func_llm_add_executable(demo_qwenqkvmm)
+    func_llm_add_executable(demo_qwenqkvmm_cpu)
+    func_llm_add_executable(demo_qdint8_gemm)
 endif()
 
diff --git a/examples/demo_qdint8_gemm.cpp b/examples/demo_qdint8_gemm.cpp
new file mode 100644
index 00000000..c57ee8a9
--- /dev/null
+++ b/examples/demo_qdint8_gemm.cpp
@@ -0,0 +1,137 @@
+#include "Types.hpp"
+#include "cmdline.h"
+#include "xnnpack.h"
+#include <array>
+#include <vector>
+#include "Tensor.hpp"
+#include "Backend.hpp"
+#include "Module.hpp"
+#include "Layer.hpp"
+#include <chrono>
+#include "backends/xnnpack/Utils/Logger.hpp"
+
+using namespace mllm;
+
+class MatmulModule final : public Module {
+    Layer linear;
+
+public:
+    explicit MatmulModule(int s) {
+        linear = Linear(s, s, false, ".linear");
+    }
+
+    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
+        return {linear(inputs[0])};
+    }
+};
+
+int main(int argc, char **argv) {
+    cmdline::parser cmdParser;
+    cmdParser.add<int>("seq-len", 's', "sequence length", true, 64);
+    cmdParser.parse_check(argc, argv);
+
+    size_t s = cmdParser.get<int>("seq-len");
+
+    xnn_initialize(nullptr);
+
+    Backend::global_backends.emplace(MLLM_XNNPACK, GetBackendCreator(MLLM_XNNPACK)->create({}));
+
+    Tensor inputs1(1, 1, (int32_t)s, (int32_t)s, Backend::global_backends[MLLM_XNNPACK], true);
+    Tensor inputs2(1, 1, (int32_t)s, (int32_t)s, Backend::global_backends[MLLM_XNNPACK], true);
+    Tensor outputs1(1, 1, (int32_t)s, (int32_t)s, Backend::global_backends[MLLM_XNNPACK], true);
+
+    xnn_subgraph_t subgraph = nullptr;
+    xnn_create_subgraph(3, /*flags=*/0, &subgraph);
+
+    uint32_t input1_f32_id = XNN_INVALID_VALUE_ID;
+    std::array<size_t, 4> input1_dims{1, 1, s, s};
+    xnn_define_tensor_value(
+        subgraph,
+        xnn_datatype_fp32,
+        input1_dims.size(),
+        input1_dims.data(),
+        nullptr,
+        /*external_id=*/0,
+        /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT,
+        &input1_f32_id);
+
+    uint32_t input1_id = XNN_INVALID_VALUE_ID;
+    xnn_define_dynamically_quantized_tensor_value(
+        subgraph,
+        xnn_datatype_qdint8,
+        input1_dims.size(),
+        1,
+        input1_dims.data(),
+        XNN_INVALID_VALUE_ID,
+        0,
+        &input1_id);
+
+    uint32_t input2_id = XNN_INVALID_VALUE_ID;
+    std::array<size_t, 4> input2_dims{1, 1, s, s};
+    std::vector<float> channelwise_scale(s, 1.f);
+    xnn_define_channelwise_quantized_tensor_value(
+        subgraph,
+        xnn_datatype_qcint8,
+        channelwise_scale.data(),
+        input2_dims.size(),
+        /*channel_dim=*/1,
+        input2_dims.data(),
+        inputs2.rawHostPtr(),
+        XNN_INVALID_VALUE_ID,
+        /*flags=*/0,
+        &input2_id);
+
+    uint32_t output1_id = XNN_INVALID_VALUE_ID;
+    std::array<size_t, 4> output1_dims{1, 1, s, s};
+    xnn_define_tensor_value(
+        subgraph,
+        xnn_datatype_fp32,
+        output1_dims.size(),
+        output1_dims.data(),
+        nullptr,
+        1,
+        XNN_VALUE_FLAG_EXTERNAL_OUTPUT,
+        &output1_id);
+
+    xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr, input1_f32_id,
+                     input1_id, /*flags=*/0);
+    xnn_define_batch_matrix_multiply(subgraph, input1_id, input2_id,
+                                     output1_id, /*flags=*/0);
+
+    xnn_runtime_t rt;
+    std::vector<xnn_external_value> exts;
+    exts.push_back({0, inputs1.rawHostPtr()});
+    exts.push_back({1, outputs1.rawHostPtr()});
+    auto threadpool = pthreadpool_create(4);
+    xnn_create_runtime_v4(subgraph, nullptr, nullptr, threadpool, 0, &rt);
+    xnn_reshape_runtime(rt);
+    xnn_setup_runtime_v2(rt, exts.size(), exts.data());
+
+    mllm::xnnpack::Log::warn("Start benchmark");
+    auto start = std::chrono::high_resolution_clock::now();
+    xnn_invoke_runtime(rt);
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+    mllm::xnnpack::Log::warn("xnn run (QD8 * QC8W -> FP32) shape={}x{}, time={} microseconds", s, s, duration.count());
+
+    inputs1.free();
+    inputs2.free();
+    outputs1.free();
+
+    auto model = MatmulModule((int32_t)s);
+    model.setNoLoadWeightsDtype(MLLM_TYPE_Q4_0_4_4);
+    model.to(MLLM_CPU);
+
+    Tensor inputs1_q4k;
+    inputs1_q4k.setDtype(MLLM_TYPE_F32);
+    inputs1_q4k.reshape(1, 1, int32_t(s), int32_t(s));
+    inputs1_q4k.setBackend(Backend::global_backends[MLLM_CPU]);
+    inputs1_q4k.alloc();
+    inputs1_q4k.setTtype(TensorType::INPUT_TENSOR);
+    // inputs1_q4k.setName("inputs-0");
+
+    start = std::chrono::high_resolution_clock::now();
+    auto o = model({inputs1_q4k});
+    end = std::chrono::high_resolution_clock::now();
+    duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+    mllm::xnnpack::Log::warn("mllm run (Q4K * Q4K -> FP32) shape={}x{}, time={} microseconds", s, s, duration.count());
+}
diff --git a/examples/demo_qwenqkvmm.cpp b/examples/demo_qwenqkvmm.cpp
new file mode 100644
index 00000000..2e141c30
--- /dev/null
+++ b/examples/demo_qwenqkvmm.cpp
@@ -0,0 +1,141 @@
+#include "cmdline.h"
+#include "Backend.hpp"
+#include "Layer.hpp"
+#include "Module.hpp"
+#include "Tensor.hpp"
+#include "Types.hpp"
+#include "models/qwen/configuration_qwen.hpp"
+#include "backends/xnnpack/XnnpackBackend.hpp"
+#include "backends/xnnpack/Utils/Logger.hpp"
+
+using namespace mllm;
+
+class QwenQKVmmXnnPart final : public Module {
+    Layer sdpa;
+
+public:
+    QwenQKVmmXnnPart() = default;
+
+    QwenQKVmmXnnPart(const QWenConfig &config, const QWenNameConfig &names, const string &base_name) {
+        sdpa = ScaledDotProductAttention(".sdpa");
+    }
+
+    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
+        auto q = inputs[0];
+        auto k = inputs[1];
+        auto v = inputs[2];
+
+        q = q.transpose(SEQUENCE, HEAD);
+        k = k.transpose(SEQUENCE, HEAD);
+        v = v.transpose(SEQUENCE, HEAD);
+
+        auto o = sdpa(q, k, v);
+
+        return {o};
+    }
+};
+
+class QuantizeModule final : public Module {
+    Layer o_quantize;
+
+public:
+    QuantizeModule() = default;
+    QuantizeModule(const QWenConfig &config, const QWenNameConfig &names, const string &base_name) {
+        o_quantize = Quantize(true, base_name + names._o_proj_name + ".quantize");
+    }
+
+    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
+        return {o_quantize(inputs[0])};
+    }
+};
+
+class QwenQKVmm final : public Module {
+    Layer q_rope;
+    Layer k_rope;
+    Layer k_cache;
+    Layer v_cache;
+
+    int hidden_size;
+    int num_heads;
+
+    QwenQKVmmXnnPart xnn_part;
+    QuantizeModule o_part;
+
+public:
+    QwenQKVmm() = default;
+    QwenQKVmm(const QWenConfig &config, const QWenNameConfig &names, const string &base_name) {
+        hidden_size = config.hidden_size;
+        num_heads = config.num_attention_heads;
+
+        q_rope = RoPE(config.RoPE_type, config.rope_theta, config.max_position_embeddings, base_name + "q_rope");
+        k_rope = RoPE(config.RoPE_type, config.rope_theta, config.max_position_embeddings, base_name + "k_rope");
+
+        k_cache = XP_KVCache(config.num_attention_heads / config.num_key_value_heads, config.cache_limit, base_name + "k_cache");
+        v_cache = XP_KVCache(config.num_attention_heads / config.num_key_value_heads, config.cache_limit, base_name + "v_cache");
+
+        xnn_part = QwenQKVmmXnnPart(config, names, base_name);
+        xnn_part.to(MLLM_XNNPACK);
+
+        o_part = QuantizeModule(config, names, base_name);
+        o_part.to(MLLM_CPU);
+    }
+
+    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
+        auto q = inputs[0];
+        auto k = inputs[1];
+        auto v = inputs[2];
+
+        q = q_rope(q);
+        k = k_rope(k);
+
+        k = k_cache(k);
+        v = v_cache(v);
+
+        q.to(MLLM_XNNPACK);
+        k.to(MLLM_XNNPACK);
+        v.to(MLLM_XNNPACK);
+        auto o = xnn_part({q, k, v})[0];
+        o.to(MLLM_CPU);
+
+        o = o_part({o})[0];
+
+        return {o};
+    }
+};
+
+int main(int argc, char **argv) {
+    mllm::xnnpack::Log::log_level = mllm::xnnpack::Log::ERROR;
+
+    cmdline::parser cmdParser;
+    cmdParser.add<int>("seq-len", 's', "sequence length", true, 64);
+    cmdParser.parse_check(argc, argv);
+
+    QWenConfig config(1280, "1.8B", RoPEType::HFHUBROPE);
+    auto model = QwenQKVmm(config, config.names_config, "base");
+    model.setNoLoadWeightsDtype(MLLM_TYPE_F32);
+
+    Layer::use_layername_2_tensorname = false;
+    mllm::xnnpack::XnnpackBackend::enable_dynamic_shape = true;
+    mllm::xnnpack::XnnpackBackend::enable_legacy_wrapper = false;
+
+    auto s = cmdParser.get<int>("seq-len");
+
+    Tensor q(1, 1, s, config.hidden_size, Backend::global_backends[MLLM_CPU], true);
+    Tensor k(1, 1, s, config.hidden_size, Backend::global_backends[MLLM_CPU], true);
+    Tensor v(1, 1, s, config.hidden_size, Backend::global_backends[MLLM_CPU], true);
+    q.setTtype(TensorType::INPUT_TENSOR);
+    k.setTtype(TensorType::INPUT_TENSOR);
+    v.setTtype(TensorType::INPUT_TENSOR);
+
+    // warm up
+    auto o = model({q, k, v})[0];
+
+    // start
+    auto start = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < 4; ++i) {
+        auto o = model({q, k, v})[0];
+    }
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+    mllm::xnnpack::Log::error("QwenQKVmm, time={} microseconds", duration.count() / 4);
+}
diff --git a/examples/demo_qwenqkvmm_cpu.cpp b/examples/demo_qwenqkvmm_cpu.cpp
new file mode 100644
index 00000000..7c6f93ad
--- /dev/null
+++ b/examples/demo_qwenqkvmm_cpu.cpp
@@ -0,0 +1,105 @@
+#include "cmdline.h"
+#include "Backend.hpp"
+#include "Layer.hpp"
+#include "Module.hpp"
+#include "Tensor.hpp"
+#include "Types.hpp"
+#include "models/qwen/configuration_qwen.hpp"
+#include "backends/xnnpack/XnnpackBackend.hpp"
+#include "backends/xnnpack/Utils/Logger.hpp"
+
+using namespace mllm;
+
+class QwenQKVmm final : public Module {
+    Layer softmax;
+    Layer q_rope;
+    Layer k_rope;
+    Layer k_cache;
+    Layer v_cache;
+    Layer qk_mm;
+    Layer qkv_mm;
+    Layer o_quantize;
+
+    int hidden_size;
+    int num_heads;
+    int head_dim;
+    int num_key_value_heads;
+    int num_key_value_groups;
+
+public:
+    QwenQKVmm() = default;
+    QwenQKVmm(const QWenConfig &config, const QWenNameConfig &names, const string &base_name) {
+        hidden_size = config.hidden_size;
+        num_heads = config.num_attention_heads;
+
+        q_rope = RoPE(config.RoPE_type, config.rope_theta, config.max_position_embeddings, base_name + "q_rope");
+        k_rope = RoPE(config.RoPE_type, config.rope_theta, config.max_position_embeddings, base_name + "k_rope");
+
+        k_cache = KVCache(config.num_attention_heads / config.num_key_value_heads, config.cache_limit, base_name + "k_cache", true);
+        v_cache = KVCache(config.num_attention_heads / config.num_key_value_heads, config.cache_limit, base_name + "v_cache", true);
+
+        qk_mm = Matmul(false, true, base_name + "qk");
+        qkv_mm = Matmul(false, false, base_name + "qkv");
+
+        softmax = Softmax(DIMENSION, true, base_name + "softmax");
+
+        o_quantize = Quantize(true, base_name + names._o_proj_name + ".quantize");
+    }
+
+    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
+        auto q = inputs[0];
+        auto k = inputs[1];
+        auto v = inputs[2];
+
+        q = q_rope(q);
+        k = k_rope(k);
+
+        k = k_cache(k);
+        v = v_cache(v);
+
+        // auto qk = qk_mm(q, k);
+        auto qk = Tensor::mm(q, k.transpose(Chl::SEQUENCE, Chl::DIMENSION));
+        qk = softmax(qk);
+        // auto o = qkv_mm(qk, v);
+        auto o = Tensor::mm(qk, v);
+
+        o = o_quantize(o);
+
+        return {o};
+    }
+};
+
+int main(int argc, char **argv) {
+    cmdline::parser cmdParser;
+    cmdParser.add<int>("seq-len", 's', "sequence length", true, 64);
+    cmdParser.parse_check(argc, argv);
+
+    QWenConfig config(1280, "1.8B", RoPEType::HFHUBROPE);
+    auto model = QwenQKVmm(config, config.names_config, "base");
+    model.setNoLoadWeightsDtype(MLLM_TYPE_F32);
+
+    Layer::use_layername_2_tensorname = false;
+    mllm::xnnpack::XnnpackBackend::enable_dynamic_shape = true;
+    mllm::xnnpack::XnnpackBackend::enable_legacy_wrapper = false;
+
+    auto s = cmdParser.get<int>("seq-len");
+
+    Tensor q(1, 1, s, config.hidden_size, Backend::global_backends[MLLM_CPU], true);
+    Tensor k(1, 1, s, config.hidden_size, Backend::global_backends[MLLM_CPU], true);
+    Tensor v(1, 1, s, config.hidden_size, Backend::global_backends[MLLM_CPU], true);
+    q.setTtype(TensorType::INPUT_TENSOR);
+    k.setTtype(TensorType::INPUT_TENSOR);
+    v.setTtype(TensorType::INPUT_TENSOR);
+
+    // warm up
+    auto o = model({q, k, v})[0];
+
+    // start
+    auto start = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < 4; ++i) {
+        auto o = model({q, k, v})[0];
+    }
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+    mllm::xnnpack::Log::warn("QwenQKVmm, time={} microseconds", duration.count() / 4);
+}
diff --git a/src/backends/cpu/CPUKVCacheXp.cpp b/src/backends/cpu/CPUKVCacheXp.cpp
index 4ed55f3b..e428ac6b 100644
--- a/src/backends/cpu/CPUKVCacheXp.cpp
+++ b/src/backends/cpu/CPUKVCacheXp.cpp
@@ -58,7 +58,7 @@ ErrorCode CPUKVCacheXp::execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
 
-    memcpy(outputs[0]->rawHostPtr(), cache_.rawHostPtr(), outputs[0]->count() * sizeof(float));
+    // memcpy(outputs[0]->rawHostPtr(), cache_.rawHostPtr(), outputs[0]->count() * sizeof(float));
 
     return MLLM_NO_ERROR;
 }
@@ -68,6 +68,7 @@ ErrorCode CPUKVCacheXp::free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
 
 ErrorCode CPUKVCacheXp::setUp(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
-    return Op::setUp(inputs, outputs);
+    outputs[0]->forceResetHostPointer(cache_.rawHostPtr());
+    return MLLM_NO_ERROR;
 }
 } // namespace mllm
\ No newline at end of file
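
The KV-cache "boost" in the patch above is a zero-copy hand-off: CPUKVCacheXp::setUp() now points the op's output tensor at the cache's backing buffer with forceResetHostPointer(), and the full-cache memcpy in execute() is commented out, so each decode step pays only for appending the new K/V rows rather than for copying the entire cache. Below is a minimal, self-contained sketch of that aliasing pattern; the View/KVCacheSketch types and the bind/append names are illustrative stand-ins, not the mllm API.

    // Sketch of the zero-copy KV-cache idea (hypothetical types, not mllm's).
    #include <cstddef>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    struct View {
        float *data = nullptr; // non-owning pointer into someone else's storage
        std::size_t len = 0;
    };

    struct KVCacheSketch {
        std::vector<float> cache; // persistent buffer, sized to the cache limit
        std::size_t used = 0;     // floats written so far

        // setUp()-style step: alias the cache instead of copying it (O(1)).
        void bind(View &out) {
            out.data = cache.data();
            out.len = used;
        }

        // execute()-style step: append only the new tokens' K/V values.
        void append(const float *kv, std::size_t n) {
            std::memcpy(cache.data() + used, kv, n * sizeof(float));
            used += n;
        }
    };

    int main() {
        KVCacheSketch c;
        c.cache.resize(1024);
        float step[4] = {1.f, 2.f, 3.f, 4.f};
        c.append(step, 4);
        View out;
        c.bind(out); // out.data aliases c.cache; nothing is copied
        std::printf("%zu %f\n", out.len, out.data[2]); // prints: 4 3.000000
        return 0;
    }

The trade-off is lifetime: a view produced this way must never free or outlive the cache storage it aliases.
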
From 606f222d0ca501e7001ab0d39b02bfe31e56590c Mon Sep 17 00:00:00 2001
From: chenghuaWang <2923277184@qq.com>
Date: Fri, 8 Nov 2024 04:05:23 +0000
Subject: [PATCH 2/3] update: demo fp32 gemm perf

---
 examples/CMakeLists.txt        |  1 +
 examples/demo_fp32gemmperf.cpp | 48 ++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 examples/demo_fp32gemmperf.cpp

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 39296382..71c28b8a 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -100,6 +100,7 @@ if(MLLM_BUILD_XNNPACK_BACKEND)
     func_llm_add_executable(demo_qwenqkvmm)
     func_llm_add_executable(demo_qwenqkvmm_cpu)
     func_llm_add_executable(demo_qdint8_gemm)
+    func_llm_add_executable(demo_fp32gemmperf)
 endif()
 
diff --git a/examples/demo_fp32gemmperf.cpp b/examples/demo_fp32gemmperf.cpp
new file mode 100644
index 00000000..12c5cf97
--- /dev/null
+++ b/examples/demo_fp32gemmperf.cpp
@@ -0,0 +1,48 @@
+#include "Types.hpp"
+#include "cmdline.h"
+#include "xnnpack.h"
+#include <array>
+#include <vector>
+#include "Tensor.hpp"
+#include "Backend.hpp"
+#include "Module.hpp"
+#include "Layer.hpp"
+#include <chrono>
+#include "backends/xnnpack/Utils/Logger.hpp"
+
+using namespace mllm;
+
+class MatmulModule final : public Module {
+public:
+    explicit MatmulModule(int s) {
+    }
+
+    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
+        return {Tensor::mm(inputs[0], inputs[1])};
+    }
+};
+
+int main(int argc, char **argv) {
+    cmdline::parser cmdParser;
+    cmdParser.add<int>("seq-len", 's', "sequence length", true, 64);
+    cmdParser.parse_check(argc, argv);
+
+    size_t s = cmdParser.get<int>("seq-len");
+
+    auto model = MatmulModule((int32_t)s);
+    model.to(MLLM_CPU);
+
+    Tensor inputs0(1, 1, (int32_t)s, (int32_t)s, Backend::global_backends[MLLM_CPU], true);
+    Tensor inputs1(1, 1, (int32_t)s, (int32_t)s, Backend::global_backends[MLLM_CPU], true);
+    inputs0.setTtype(TensorType::INPUT_TENSOR);
+    inputs1.setTtype(TensorType::INPUT_TENSOR);
+
+    // warmup
+    auto o = model({inputs0, inputs1});
+
+    auto start = std::chrono::high_resolution_clock::now();
+    o = model({inputs0, inputs1});
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+    mllm::xnnpack::Log::warn("mllm run (FP32 * FP32 -> FP32) shape={}x{}, {}x{}, time={} microseconds", s, s, s, s, duration.count());
+}
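
The benchmarks in patches 1 and 2 share one detail worth noting: the first model call is a warmup that absorbs one-off graph construction and allocation costs, and only subsequent calls are timed. demo_fp32gemmperf times a single post-warmup run, while the demo_qwenqkvmm* drivers average four. A hedged sketch of that warmup-then-average pattern as a standalone helper follows; the helper itself is illustrative, not part of mllm.

    // Warmup-then-average timing, mirroring the loop in demo_qwenqkvmm*.cpp.
    #include <chrono>
    #include <cstdio>

    template <typename F>
    long long avg_microseconds(F &&fn, int runs = 4) {
        fn(); // warmup: the first run pays setup/allocation costs
        auto start = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < runs; ++i) fn();
        auto end = std::chrono::high_resolution_clock::now();
        auto us = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
        return us.count() / runs;
    }

    int main() {
        volatile double acc = 0.0; // volatile keeps the dummy work alive
        auto t = avg_microseconds([&] {
            for (int i = 0; i < (1 << 20); ++i) acc = acc + i * 0.5;
        });
        std::printf("avg time=%lld microseconds\n", t);
        return 0;
    }
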
From e65738d705f8607a28137e7cd8e9cdb6f40bef9a Mon Sep 17 00:00:00 2001
From: chenghuaWang <2923277184@qq.com>
Date: Fri, 8 Nov 2024 09:27:26 +0000
Subject: [PATCH 3/3] fix: smollm tokenizer regex pattern

---
 CMakeLists.txt                            |   2 +-
 examples/CMakeLists.txt                   |   8 +-
 examples/demo_fp32gemmperf.cpp            |  48 --------
 examples/demo_qdint8_gemm.cpp             | 137 ---------------------
 examples/demo_qwenqkvmm.cpp               | 141 ----------------------
 examples/demo_qwenqkvmm_cpu.cpp           | 105 ----------------
 src/models/smollm/tokenization_smollm.hpp |  12 +-
 7 files changed, 13 insertions(+), 440 deletions(-)
 delete mode 100644 examples/demo_fp32gemmperf.cpp
 delete mode 100644 examples/demo_qdint8_gemm.cpp
 delete mode 100644 examples/demo_qwenqkvmm.cpp
 delete mode 100644 examples/demo_qwenqkvmm_cpu.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 398b21db..67f3ac05 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -128,7 +128,7 @@ if(QNN) # QNN lib
     add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/qnn)
 endif()
 
-option(MLLM_BUILD_XNNPACK_BACKEND "Build mllm's XNNPACK backend" ON)
+option(MLLM_BUILD_XNNPACK_BACKEND "Build mllm's XNNPACK backend" OFF)
 if(MLLM_BUILD_XNNPACK_BACKEND)
     if(NOT WIN32)
         add_compile_options(-fPIC)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 3fdaa3bc..983423ca 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -99,10 +99,10 @@ endif()
 
 if(MLLM_BUILD_XNNPACK_BACKEND)
     func_llm_add_executable(demo_qwen_xp)
-    func_llm_add_executable(demo_qwenqkvmm)
-    func_llm_add_executable(demo_qwenqkvmm_cpu)
-    func_llm_add_executable(demo_qdint8_gemm)
-    func_llm_add_executable(demo_fp32gemmperf)
+    # func_llm_add_executable(demo_qwenqkvmm)
+    # func_llm_add_executable(demo_qwenqkvmm_cpu)
+    # func_llm_add_executable(demo_qdint8_gemm)
+    # func_llm_add_executable(demo_fp32gemmperf)
 endif()
 
diff --git a/examples/demo_fp32gemmperf.cpp b/examples/demo_fp32gemmperf.cpp
deleted file mode 100644
index 12c5cf97..00000000
--- a/examples/demo_fp32gemmperf.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-#include "Types.hpp"
-#include "cmdline.h"
-#include "xnnpack.h"
-#include <array>
-#include <vector>
-#include "Tensor.hpp"
-#include "Backend.hpp"
-#include "Module.hpp"
-#include "Layer.hpp"
-#include <chrono>
-#include "backends/xnnpack/Utils/Logger.hpp"
-
-using namespace mllm;
-
-class MatmulModule final : public Module {
-public:
-    explicit MatmulModule(int s) {
-    }
-
-    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
-        return {Tensor::mm(inputs[0], inputs[1])};
-    }
-};
-
-int main(int argc, char **argv) {
-    cmdline::parser cmdParser;
-    cmdParser.add<int>("seq-len", 's', "sequence length", true, 64);
-    cmdParser.parse_check(argc, argv);
-
-    size_t s = cmdParser.get<int>("seq-len");
-
-    auto model = MatmulModule((int32_t)s);
-    model.to(MLLM_CPU);
-
-    Tensor inputs0(1, 1, (int32_t)s, (int32_t)s, Backend::global_backends[MLLM_CPU], true);
-    Tensor inputs1(1, 1, (int32_t)s, (int32_t)s, Backend::global_backends[MLLM_CPU], true);
-    inputs0.setTtype(TensorType::INPUT_TENSOR);
-    inputs1.setTtype(TensorType::INPUT_TENSOR);
-
-    // warmup
-    auto o = model({inputs0, inputs1});
-
-    auto start = std::chrono::high_resolution_clock::now();
-    o = model({inputs0, inputs1});
-    auto end = std::chrono::high_resolution_clock::now();
-    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
-    mllm::xnnpack::Log::warn("mllm run (FP32 * FP32 -> FP32) shape={}x{}, {}x{}, time={} microseconds", s, s, s, s, duration.count());
-}
diff --git a/examples/demo_qdint8_gemm.cpp b/examples/demo_qdint8_gemm.cpp
deleted file mode 100644
index c57ee8a9..00000000
--- a/examples/demo_qdint8_gemm.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-#include "Types.hpp"
-#include "cmdline.h"
-#include "xnnpack.h"
-#include <array>
-#include <vector>
-#include "Tensor.hpp"
-#include "Backend.hpp"
-#include "Module.hpp"
-#include "Layer.hpp"
-#include <chrono>
-#include "backends/xnnpack/Utils/Logger.hpp"
-
-using namespace mllm;
-
-class MatmulModule final : public Module {
-    Layer linear;
-
-public:
-    explicit MatmulModule(int s) {
-        linear = Linear(s, s, false, ".linear");
-    }
-
-    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
-        return {linear(inputs[0])};
-    }
-};
-
-int main(int argc, char **argv) {
-    cmdline::parser cmdParser;
-    cmdParser.add<int>("seq-len", 's', "sequence length", true, 64);
-    cmdParser.parse_check(argc, argv);
-
-    size_t s = cmdParser.get<int>("seq-len");
-
-    xnn_initialize(nullptr);
-
-    Backend::global_backends.emplace(MLLM_XNNPACK, GetBackendCreator(MLLM_XNNPACK)->create({}));
-
-    Tensor inputs1(1, 1, (int32_t)s, (int32_t)s, Backend::global_backends[MLLM_XNNPACK], true);
-    Tensor inputs2(1, 1, (int32_t)s, (int32_t)s, Backend::global_backends[MLLM_XNNPACK], true);
-    Tensor outputs1(1, 1, (int32_t)s, (int32_t)s, Backend::global_backends[MLLM_XNNPACK], true);
-
-    xnn_subgraph_t subgraph = nullptr;
-    xnn_create_subgraph(3, /*flags=*/0, &subgraph);
-
-    uint32_t input1_f32_id = XNN_INVALID_VALUE_ID;
-    std::array<size_t, 4> input1_dims{1, 1, s, s};
-    xnn_define_tensor_value(
-        subgraph,
-        xnn_datatype_fp32,
-        input1_dims.size(),
-        input1_dims.data(),
-        nullptr,
-        /*external_id=*/0,
-        /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT,
-        &input1_f32_id);
-
-    uint32_t input1_id = XNN_INVALID_VALUE_ID;
-    xnn_define_dynamically_quantized_tensor_value(
-        subgraph,
-        xnn_datatype_qdint8,
-        input1_dims.size(),
-        1,
-        input1_dims.data(),
-        XNN_INVALID_VALUE_ID,
-        0,
-        &input1_id);
-
-    uint32_t input2_id = XNN_INVALID_VALUE_ID;
-    std::array<size_t, 4> input2_dims{1, 1, s, s};
-    std::vector<float> channelwise_scale(s, 1.f);
-    xnn_define_channelwise_quantized_tensor_value(
-        subgraph,
-        xnn_datatype_qcint8,
-        channelwise_scale.data(),
-        input2_dims.size(),
-        /*channel_dim=*/1,
-        input2_dims.data(),
-        inputs2.rawHostPtr(),
-        XNN_INVALID_VALUE_ID,
-        /*flags=*/0,
-        &input2_id);
-
-    uint32_t output1_id = XNN_INVALID_VALUE_ID;
-    std::array<size_t, 4> output1_dims{1, 1, s, s};
-    xnn_define_tensor_value(
-        subgraph,
-        xnn_datatype_fp32,
-        output1_dims.size(),
-        output1_dims.data(),
-        nullptr,
-        1,
-        XNN_VALUE_FLAG_EXTERNAL_OUTPUT,
-        &output1_id);
-
-    xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr, input1_f32_id,
-                     input1_id, /*flags=*/0);
-    xnn_define_batch_matrix_multiply(subgraph, input1_id, input2_id,
-                                     output1_id, /*flags=*/0);
-
-    xnn_runtime_t rt;
-    std::vector<xnn_external_value> exts;
-    exts.push_back({0, inputs1.rawHostPtr()});
-    exts.push_back({1, outputs1.rawHostPtr()});
-    auto threadpool = pthreadpool_create(4);
-    xnn_create_runtime_v4(subgraph, nullptr, nullptr, threadpool, 0, &rt);
-    xnn_reshape_runtime(rt);
-    xnn_setup_runtime_v2(rt, exts.size(), exts.data());
-
-    mllm::xnnpack::Log::warn("Start benchmark");
-    auto start = std::chrono::high_resolution_clock::now();
-    xnn_invoke_runtime(rt);
-    auto end = std::chrono::high_resolution_clock::now();
-    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
-    mllm::xnnpack::Log::warn("xnn run (QD8 * QC8W -> FP32) shape={}x{}, time={} microseconds", s, s, duration.count());
-
-    inputs1.free();
-    inputs2.free();
-    outputs1.free();
-
-    auto model = MatmulModule((int32_t)s);
-    model.setNoLoadWeightsDtype(MLLM_TYPE_Q4_0_4_4);
-    model.to(MLLM_CPU);
-
-    Tensor inputs1_q4k;
-    inputs1_q4k.setDtype(MLLM_TYPE_F32);
-    inputs1_q4k.reshape(1, 1, int32_t(s), int32_t(s));
-    inputs1_q4k.setBackend(Backend::global_backends[MLLM_CPU]);
-    inputs1_q4k.alloc();
-    inputs1_q4k.setTtype(TensorType::INPUT_TENSOR);
-    // inputs1_q4k.setName("inputs-0");
-
-    start = std::chrono::high_resolution_clock::now();
-    auto o = model({inputs1_q4k});
-    end = std::chrono::high_resolution_clock::now();
-    duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
-    mllm::xnnpack::Log::warn("mllm run (Q4K * Q4K -> FP32) shape={}x{}, time={} microseconds", s, s, duration.count());
-}
diff --git a/examples/demo_qwenqkvmm.cpp b/examples/demo_qwenqkvmm.cpp
deleted file mode 100644
index 2e141c30..00000000
--- a/examples/demo_qwenqkvmm.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-#include "cmdline.h"
-#include "Backend.hpp"
-#include "Layer.hpp"
-#include "Module.hpp"
-#include "Tensor.hpp"
-#include "Types.hpp"
-#include "models/qwen/configuration_qwen.hpp"
-#include "backends/xnnpack/XnnpackBackend.hpp"
-#include "backends/xnnpack/Utils/Logger.hpp"
-
-using namespace mllm;
-
-class QwenQKVmmXnnPart final : public Module {
-    Layer sdpa;
-
-public:
-    QwenQKVmmXnnPart() = default;
-
-    QwenQKVmmXnnPart(const QWenConfig &config, const QWenNameConfig &names, const string &base_name) {
-        sdpa = ScaledDotProductAttention(".sdpa");
-    }
-
-    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
-        auto q = inputs[0];
-        auto k = inputs[1];
-        auto v = inputs[2];
-
-        q = q.transpose(SEQUENCE, HEAD);
-        k = k.transpose(SEQUENCE, HEAD);
-        v = v.transpose(SEQUENCE, HEAD);
-
-        auto o = sdpa(q, k, v);
-
-        return {o};
-    }
-};
-
-class QuantizeModule final : public Module {
-    Layer o_quantize;
-
-public:
-    QuantizeModule() = default;
-    QuantizeModule(const QWenConfig &config, const QWenNameConfig &names, const string &base_name) {
-        o_quantize = Quantize(true, base_name + names._o_proj_name + ".quantize");
-    }
-
-    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
-        return {o_quantize(inputs[0])};
-    }
-};
-
-class QwenQKVmm final : public Module {
-    Layer q_rope;
-    Layer k_rope;
-    Layer k_cache;
-    Layer v_cache;
-
-    int hidden_size;
-    int num_heads;
-
-    QwenQKVmmXnnPart xnn_part;
-    QuantizeModule o_part;
-
-public:
-    QwenQKVmm() = default;
-    QwenQKVmm(const QWenConfig &config, const QWenNameConfig &names, const string &base_name) {
-        hidden_size = config.hidden_size;
-        num_heads = config.num_attention_heads;
-
-        q_rope = RoPE(config.RoPE_type, config.rope_theta, config.max_position_embeddings, base_name + "q_rope");
-        k_rope = RoPE(config.RoPE_type, config.rope_theta, config.max_position_embeddings, base_name + "k_rope");
-
-        k_cache = XP_KVCache(config.num_attention_heads / config.num_key_value_heads, config.cache_limit, base_name + "k_cache");
-        v_cache = XP_KVCache(config.num_attention_heads / config.num_key_value_heads, config.cache_limit, base_name + "v_cache");
-
-        xnn_part = QwenQKVmmXnnPart(config, names, base_name);
-        xnn_part.to(MLLM_XNNPACK);
-
-        o_part = QuantizeModule(config, names, base_name);
-        o_part.to(MLLM_CPU);
-    }
-
-    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
-        auto q = inputs[0];
-        auto k = inputs[1];
-        auto v = inputs[2];
-
-        q = q_rope(q);
-        k = k_rope(k);
-
-        k = k_cache(k);
-        v = v_cache(v);
-
-        q.to(MLLM_XNNPACK);
-        k.to(MLLM_XNNPACK);
-        v.to(MLLM_XNNPACK);
-        auto o = xnn_part({q, k, v})[0];
-        o.to(MLLM_CPU);
-
-        o = o_part({o})[0];
-
-        return {o};
-    }
-};
-
-int main(int argc, char **argv) {
-    mllm::xnnpack::Log::log_level = mllm::xnnpack::Log::ERROR;
-
-    cmdline::parser cmdParser;
-    cmdParser.add<int>("seq-len", 's', "sequence length", true, 64);
-    cmdParser.parse_check(argc, argv);
-
-    QWenConfig config(1280, "1.8B", RoPEType::HFHUBROPE);
-    auto model = QwenQKVmm(config, config.names_config, "base");
-    model.setNoLoadWeightsDtype(MLLM_TYPE_F32);
-
-    Layer::use_layername_2_tensorname = false;
-    mllm::xnnpack::XnnpackBackend::enable_dynamic_shape = true;
-    mllm::xnnpack::XnnpackBackend::enable_legacy_wrapper = false;
-
-    auto s = cmdParser.get<int>("seq-len");
-
-    Tensor q(1, 1, s, config.hidden_size, Backend::global_backends[MLLM_CPU], true);
-    Tensor k(1, 1, s, config.hidden_size, Backend::global_backends[MLLM_CPU], true);
-    Tensor v(1, 1, s, config.hidden_size, Backend::global_backends[MLLM_CPU], true);
-    q.setTtype(TensorType::INPUT_TENSOR);
-    k.setTtype(TensorType::INPUT_TENSOR);
-    v.setTtype(TensorType::INPUT_TENSOR);
-
-    // warm up
-    auto o = model({q, k, v})[0];
-
-    // start
-    auto start = std::chrono::high_resolution_clock::now();
-    for (int i = 0; i < 4; ++i) {
-        auto o = model({q, k, v})[0];
-    }
-    auto end = std::chrono::high_resolution_clock::now();
-    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
-    mllm::xnnpack::Log::error("QwenQKVmm, time={} microseconds", duration.count() / 4);
-}
diff --git a/examples/demo_qwenqkvmm_cpu.cpp b/examples/demo_qwenqkvmm_cpu.cpp
deleted file mode 100644
index 7c6f93ad..00000000
--- a/examples/demo_qwenqkvmm_cpu.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-#include "cmdline.h"
-#include "Backend.hpp"
-#include "Layer.hpp"
-#include "Module.hpp"
-#include "Tensor.hpp" -#include "Types.hpp" -#include "models/qwen/configuration_qwen.hpp" -#include "backends/xnnpack/XnnpackBackend.hpp" -#include "backends/xnnpack/Utils/Logger.hpp" - -using namespace mllm; - -class QwenQKVmm final : public Module { - Layer softmax; - Layer q_rope; - Layer k_rope; - Layer k_cache; - Layer v_cache; - Layer qk_mm; - Layer qkv_mm; - Layer o_quantize; - - int hidden_size; - int num_heads; - int head_dim; - int num_key_value_heads; - int num_key_value_groups; - -public: - QwenQKVmm() = default; - QwenQKVmm(const QWenConfig &config, const QWenNameConfig &names, const string &base_name) { - hidden_size = config.hidden_size; - num_heads = config.num_attention_heads * config.hidden_size / config.num_attention_heads; - - q_rope = RoPE(config.RoPE_type, config.rope_theta, config.max_position_embeddings, base_name + "q_rope"); - k_rope = RoPE(config.RoPE_type, config.rope_theta, config.max_position_embeddings, base_name + "k_rope"); - - k_cache = KVCache(config.num_attention_heads / config.num_key_value_heads, config.cache_limit, base_name + "k_cache", true); - v_cache = KVCache(config.num_attention_heads / config.num_key_value_heads, config.cache_limit, base_name + "v_cache", true); - - qk_mm = Matmul(false, true, base_name + "qk"); - qkv_mm = Matmul(false, false, base_name + "qkv"); - - softmax = Softmax(DIMENSION, true, base_name + "softmax"); - - o_quantize = Quantize(true, base_name + names._o_proj_name + ".quantize"); - } - - vector Forward(vector inputs, vector args) override { - auto q = inputs[0]; - auto k = inputs[1]; - auto v = inputs[2]; - - q = q_rope(q); - k = k_rope(k); - - k = k_cache(k); - v = v_cache(v); - - // auto qk = qk_mm(q, k); - auto qk = Tensor::mm(q, k.transpose(Chl::SEQUENCE, Chl::DIMENSION)); - qk = softmax(qk); - // auto o = qkv_mm(qk, v); - auto o = Tensor::mm(qk, v); - - o = o_quantize(o); - - return {o}; - } -}; - -int main(int argc, char **argv) { - cmdline::parser cmdParser; - cmdParser.add("seq-len", 's', "sequence length", true, 64); - cmdParser.parse_check(argc, argv); - - QWenConfig config(1280, "1.8B", RoPEType::HFHUBROPE); - auto model = QwenQKVmm(config, config.names_config, "base"); - model.setNoLoadWeightsDtype(MLLM_TYPE_F32); - - Layer::use_layername_2_tensorname = false; - mllm::xnnpack::XnnpackBackend::enable_dynamic_shape = true; - mllm::xnnpack::XnnpackBackend::enable_legacy_wrapper = false; - - auto s = cmdParser.get("seq-len"); - - Tensor q(1, 1, s, config.hidden_size, Backend::global_backends[MLLM_CPU], true); - Tensor k(1, 1, s, config.hidden_size, Backend::global_backends[MLLM_CPU], true); - Tensor v(1, 1, s, config.hidden_size, Backend::global_backends[MLLM_CPU], true); - q.setTtype(TensorType::INPUT_TENSOR); - k.setTtype(TensorType::INPUT_TENSOR); - v.setTtype(TensorType::INPUT_TENSOR); - - // warm up - auto o = model({q, k, v})[0]; - - // start - auto start = std::chrono::high_resolution_clock::now(); - for (int i = 0; i < 4; ++i) { - auto o = model({q, k, v})[0]; - } - auto end = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end - start); - mllm::xnnpack::Log::warn("QwenQKVmm, time={} microseconds", duration.count() / 4); -} diff --git a/src/models/smollm/tokenization_smollm.hpp b/src/models/smollm/tokenization_smollm.hpp index 32d324d7..69ddfb9e 100644 --- a/src/models/smollm/tokenization_smollm.hpp +++ b/src/models/smollm/tokenization_smollm.hpp @@ -135,7 +135,7 @@ class SmolLMTokenizer final : public BPETokenizer { std::vector ret; if (split_special_tokens_) { - 
-            const auto word_collection = unicode_regex_split(text, FIXED_PAT_STRS);
+            const auto word_collection = unicode_regex_split(text, regex_exprs);
             for (auto &piece : word_collection) {
                 // look up table
                 // std::string token;
@@ -160,7 +160,7 @@
                     BPETokenizer::tokenize(token, tmp, false, special_tokens, true);
                     ret.insert(ret.end(), tmp.begin(), tmp.end() - 1);
                 } else {
-                    const auto word_collection = unicode_regex_split(p, FIXED_PAT_STRS);
+                    const auto word_collection = unicode_regex_split(p, regex_exprs);
                     for (auto &piece : word_collection) {
                         // look up table
                         // std::string token;
@@ -183,7 +183,7 @@ class SmolLMTokenizer final : public BPETokenizer {
         std::vector<token_id_t> ret;
 
         if (split_special_tokens_) {
-            const auto word_collection = unicode_regex_split(text, FIXED_PAT_STRS);
+            const auto word_collection = unicode_regex_split(text, regex_exprs);
             for (auto &piece : word_collection) {
                 // look up table
                 // std::string token;
@@ -208,7 +208,7 @@
                     BPETokenizer::tokenize(token, tmp, false, special_tokens, true);
                     ret.insert(ret.end(), tmp.begin(), tmp.end() - 1);
                 } else {
-                    const auto word_collection = unicode_regex_split(p, FIXED_PAT_STRS);
+                    const auto word_collection = unicode_regex_split(p, regex_exprs);
                     for (auto &piece : word_collection) {
                         // look up table
                         // std::string token;
@@ -259,6 +259,10 @@ class SmolLMTokenizer final : public BPETokenizer {
     }
 
 public:
+    std::vector<std::string> regex_exprs = {
+        "\\p{N}",
+        "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+    };
     bool split_special_tokens_ = false;
     std::unordered_map<uint8_t, std::string> byte_encoder_;
    std::unordered_map<std::string, uint8_t> byte_decoder_;
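
The tokenizer fix above replaces the FIXED_PAT_STRS split with a SmolLM-specific regex_exprs list that unicode_regex_split applies in order: "\p{N}" first isolates every digit as its own pre-token, and the second, GPT-2-style pattern then handles contractions, letter runs, number runs, punctuation, and trailing whitespace. The self-contained illustration below shows the effect of that two-pass split; it is not mllm's unicode_regex_split, and ASCII character classes stand in for \p{L} and \p{N}, which std::regex does not support.

    // Illustrative two-pass splitter: each pattern's matches become their own
    // pieces, and the remaining segments are fed to the next pattern.
    #include <iostream>
    #include <regex>
    #include <string>
    #include <vector>

    std::vector<std::string> split_all(const std::string &text,
                                       const std::vector<std::string> &patterns) {
        std::vector<std::string> pieces{text};
        for (const auto &pat : patterns) {
            std::regex re(pat);
            std::vector<std::string> next;
            for (const auto &p : pieces) {
                auto it = std::sregex_iterator(p.begin(), p.end(), re);
                auto end = std::sregex_iterator();
                std::size_t last = 0;
                for (; it != end; ++it) {
                    if ((std::size_t)it->position() > last)
                        next.push_back(p.substr(last, it->position() - last));
                    next.push_back(it->str()); // each match is its own piece
                    last = it->position() + it->length();
                }
                if (last < p.size()) next.push_back(p.substr(last));
            }
            pieces = std::move(next);
        }
        return pieces;
    }

    int main() {
        // ASCII stand-ins for the two expressions in regex_exprs above.
        std::vector<std::string> pats = {
            "[0-9]",
            "'s|'t|'re|'ve|'m|'ll|'d| ?[a-zA-Z]+| ?[0-9]+| ?[^\\sa-zA-Z0-9]+|\\s+(?!\\S)",
        };
        for (const auto &t : split_all("It's 2024 now!", pats))
            std::cout << '[' << t << ']';
        std::cout << '\n'; // [It]['s][ ][2][0][2][4][ now][!]
    }

Because the digit pattern runs first, every digit becomes its own pre-token regardless of number length, so numeric text is segmented deterministically before BPE merges are applied.
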