perf: i8 * i8 -> fp32 GEMM Boost && feat: conv2d im2col (#187)
* feat: boost i8 * i8 -> fp32 matmul
* feat: conv2d and im2col algorithm
chenghuaWang authored Nov 22, 2024
1 parent b9772d0 commit 57893c8
Showing 10 changed files with 1,076 additions and 35 deletions.
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -83,6 +83,7 @@ func_vlm_add_executable(demo_imagebind_1mod)
func_vlm_add_executable(demo_phi3v)
# func_vlm_add_executable(demo)

# QNN demo

if(QNN)
func_llm_add_executable(demo_qwen_npu)
3 changes: 3 additions & 0 deletions src/Tensor.hpp
@@ -1760,6 +1760,9 @@ class Tensor {
TensorType &xnnTensorType();

void forceResetHostPointer(void *ptr);

public:
float i8_scale = 1.f;
};
} // namespace mllm
#endif // MLLM_TENSOR_H
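The new public `i8_scale` field is presumably the per-tensor scale consumed by the i8 * i8 -> fp32 GEMM path named in the commit title: both operands are quantized symmetrically to int8, the kernel accumulates in int32, and the two scales convert the accumulator back to fp32 at the end. A minimal scalar sketch of that idea, assuming per-tensor symmetric quantization (`i8_dot_to_fp32` is illustrative, not part of this commit):

```cpp
#include <cstdint>
#include <cstddef>

// Scalar sketch of an i8 * i8 -> fp32 dot product under per-tensor symmetric
// quantization: real_a ~= a_scale * qa, real_b ~= b_scale * qb. Optimized
// kernels accumulate qa * qb in int32 (e.g. via SDOT on AArch64) and apply
// the combined scale once per output element.
float i8_dot_to_fp32(const int8_t *qa, const int8_t *qb, size_t n,
                     float a_scale, float b_scale) {
    int32_t acc = 0;
    for (size_t i = 0; i < n; ++i) {
        acc += (int32_t)qa[i] * (int32_t)qb[i];
    }
    return (float)acc * a_scale * b_scale; // dequantize the int32 accumulator
}
```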
1 change: 1 addition & 0 deletions src/backends/cpu/CMakeLists.txt
@@ -24,6 +24,7 @@ endif()
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
message(STATUS "ARM detected")
add_compile_options(-march=armv8.2-a+dotprod+fp16+fp16fml)
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
message(STATUS "x86_64 detected")
add_compile_options(-mavx2)
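The `+dotprod` extension in the new `-march` flag is presumably what the commit's i8 GEMM speedup leans on: the ARMv8.2 SDOT instruction multiplies four int8 pairs per int32 lane and accumulates them in a single instruction. A minimal sketch of the intrinsic's use, assuming a length that is a multiple of 16 (not code from this commit):

```cpp
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
#include <arm_neon.h>
#include <cstdint>

// Each vdotq_s32 consumes 16 int8 pairs: lane j of the int32 accumulator
// gains the dot product of int8 elements 4j..4j+3 of a and b.
int32_t i8_dot_sdot(const int8_t *a, const int8_t *b, int n) {
    int32x4_t acc = vdupq_n_s32(0);
    for (int i = 0; i < n; i += 16) {
        acc = vdotq_s32(acc, vld1q_s8(a + i), vld1q_s8(b + i));
    }
    return vaddvq_s32(acc); // horizontal add of the four lanes
}
#endif
```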
97 changes: 78 additions & 19 deletions src/backends/cpu/CPUConvolution2D.cpp
@@ -2,10 +2,14 @@
#include "CPUConvolution2D.hpp"
#include "compute/Convolution.hpp"

#include "compute/Matmul.hpp"
#include "compute/Im2Col.hpp"

namespace mllm {

CPUConvolution2D::CPUConvolution2D(Backend *bn, string opName, int in_channel, int out_channel, vector<int> kernal_size, vector<int> stride, PaddingType padding_type, bool bias, int threadCount) : thread_count(threadCount),
Op(bn, opName) {
CPUConvolution2D::CPUConvolution2D(Backend *bn, string opName, int in_channel, int out_channel, vector<int> kernal_size, vector<int> stride, PaddingType padding_type, bool bias, int threadCount) :
thread_count(threadCount),
Op(bn, opName) {
kernel_size_[0] = kernal_size[0];
kernel_size_[1] = kernal_size[1];
stride_[0] = stride[0];
@@ -16,48 +20,81 @@ Op(bn, opName) {
support_bias_ = bias;
weight_.setBackend(bn);
bias_.setBackend(bn);

#ifdef __ARM_NEON
im2col_layout_.setBackend(bn);
output_not_transposed_.setBackend(bn);
#endif //! __ARM_NEON
}

ErrorCode CPUConvolution2D::reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
//batch = batch
//sequence = out_channel
//head = height
//dimension = width
// batch = batch
// sequence = out_channel
// head = height
// dimension = width
assert(in_channel_ == inputs[0]->sequence());

// #ifdef __ARM_NEON
// if (kernel_size_[0] == 16 && kernel_size_[1] == 16 && padding_h_ == 0 && padding_w_ == 0 && stride_[0] == 16 && stride_[1] == 16) {
// im2col_layout_.setDtype(inputs[0]->dtype());
// im2col_layout_.reshape(inputs[0]->batch(), 1, (inputs[0]->head() / 16) * (inputs[0]->dimension() / 16), 16 * 16 * in_channel_);
// im2col_layout_.alloc();
// output_not_transposed_.setDtype(inputs[0]->dtype());
// output_not_transposed_.reshape(inputs[0]->batch(), 1, (inputs[0]->head() / 16) * (inputs[0]->dimension() / 16), out_channel_);
// output_not_transposed_.alloc();
// outputs[0]->reshape(inputs[0]->batch(), (inputs[0]->head() / 16), out_channel_, (inputs[0]->dimension() / 16));
// return Op::reshape(inputs, outputs);
// }

// if (kernel_size_[0] == kernel_size_[1] && kernel_size_[0] == stride_[0] && kernel_size_[1] == stride_[1] && padding_h_ == 0 && padding_w_ == 0) {
// im2col_layout_.setDtype(inputs[0]->dtype());
// im2col_layout_.reshape(inputs[0]->batch(), 1, (inputs[0]->head() / kernel_size_[0]) * (inputs[0]->dimension() / kernel_size_[0]), kernel_size_[0] * kernel_size_[0] * in_channel_);
// im2col_layout_.alloc();
// output_not_transposed_.setDtype(inputs[0]->dtype());
// output_not_transposed_.reshape(inputs[0]->batch(), 1, (inputs[0]->head() / kernel_size_[0]) * (inputs[0]->dimension() / kernel_size_[0]), out_channel_);
// output_not_transposed_.alloc();
// outputs[0]->reshape(inputs[0]->batch(), (inputs[0]->head() / kernel_size_[0]), out_channel_, (inputs[0]->dimension() / kernel_size_[0]));
// return Op::reshape(inputs, outputs);
// }
// #endif

switch (padding_type_) {
case SAME:{
case SAME: {
padding_h_ = (kernel_size_[0] - 1) / 2;
padding_w_ = (kernel_size_[1] - 1) / 2;
const int out_height = (inputs[0]->head() + 2 * padding_h_ - kernel_size_[0]) / stride_[0] + 1;
const int out_width = (inputs[0]->dimension() + 2 * padding_w_ - kernel_size_[1]) / stride_[1] + 1;
outputs[0]->reshape(inputs[0]->batch(),out_height, out_channel_, out_width);
outputs[0]->reshape(inputs[0]->batch(), out_height, out_channel_, out_width);
break;
}
case VALID:{
}
case VALID: {
padding_h_ = 0;
padding_w_ = 0;
const int out_height = (inputs[0]->head() - kernel_size_[0]) / stride_[0] + 1;
const int out_width = (inputs[0]->dimension()- kernel_size_[1]) / stride_[1] + 1;
outputs[0]->reshape(inputs[0]->batch(),out_height, out_channel_, out_width);
const int out_width = (inputs[0]->dimension() - kernel_size_[1]) / stride_[1] + 1;
outputs[0]->reshape(inputs[0]->batch(), out_height, out_channel_, out_width);
break;
}
}
}
return Op::reshape(inputs, outputs);
}
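Both branches apply the standard output-size formula out = (in + 2 * pad - kernel) / stride + 1. For the ViT-style patch embedding targeted by the disabled fast path above, a hypothetical 224x224 input with a 16x16 kernel, stride 16, and VALID padding gives (224 - 16)/16 + 1 = 14 patches per side, i.e. a 14x14 output grid.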

ErrorCode CPUConvolution2D::load(AbstructLoader &loader) {

weight_.setName(name() + ".weight");
weight_.reshape(out_channel_, kernel_size_[0], in_channel_, kernel_size_[1]);
if (loader.getDataType(weight_.name()) != MLLM_TYPE_COUNT) {
weight_.setDtype(loader.getDataType(weight_.name()));
weight_.alloc();
loader.load(&weight_);
// #ifndef __ARM_NEON
kernal_ = reshape_conv2d_kernal_fp32(&weight_);
// #endif
} else {
weight_.setDtype(MLLM_TYPE_F32);
weight_.alloc();
// #ifndef __ARM_NEON
kernal_ = reshape_conv2d_kernal_fp32(&weight_);
// #endif
}
if (support_bias_) {
bias_.setName(name() + ".bias");
@@ -75,29 +112,51 @@ ErrorCode CPUConvolution2D::load(AbstructLoader &loader) {
}

ErrorCode CPUConvolution2D::execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
// #ifdef __ARM_NEON
// if (kernel_size_[0] == 16 && kernel_size_[1] == 16 && padding_h_ == 0 && padding_w_ == 0 && stride_[0] == 16 && stride_[1] == 16) {
// auto start = std::chrono::high_resolution_clock::now();
// im2col_fp32_src_k16x16_s16_p0_to(inputs[0]->rawHostPtr(), im2col_layout_.rawHostPtr(), inputs[0]->head(), inputs[0]->dimension(), in_channel_);
// weight_.reshape(1, 1, out_channel_, 16 * 16 * in_channel_);
// mat_mul(&im2col_layout_, &weight_, &output_not_transposed_, true, &bias_, false, true, thread_count);
// transpose_fp32(output_not_transposed_.rawHostPtr(), outputs[0]->rawHostPtr(), (inputs[0]->head() / 16) * ((inputs[0]->dimension() / 16)), out_channel_);
// auto end = std::chrono::high_resolution_clock::now();
// auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
// std::cout << duration.count() << std::endl;
// return Op::execute(inputs, outputs);
// }

// if (kernel_size_[0] == kernel_size_[1] && kernel_size_[0] == stride_[0] && kernel_size_[1] == stride_[1] && padding_h_ == 0 && padding_w_ == 0) {
// auto start = std::chrono::high_resolution_clock::now();
// im2col_fp32_src_knxn_sn_p0_to(inputs[0]->rawHostPtr(), im2col_layout_.rawHostPtr(), inputs[0]->head(), inputs[0]->dimension(), in_channel_, kernel_size_[0]);
// weight_.reshape(1, 1, out_channel_, kernel_size_[0] * kernel_size_[0] * in_channel_);
// mat_mul(&im2col_layout_, &weight_, &output_not_transposed_, true, &bias_, false, true, thread_count);
// transpose_fp32(output_not_transposed_.rawHostPtr(), outputs[0]->rawHostPtr(), (inputs[0]->head() / kernel_size_[0]) * ((inputs[0]->dimension() / kernel_size_[0])), out_channel_);
// auto end = std::chrono::high_resolution_clock::now();
// auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
// std::cout << duration.count() << std::endl;
// return Op::execute(inputs, outputs);
// }
// #endif

switch (padding_type_) {
case SAME:{
case SAME: {
conv2d_fp32_SAME(inputs[0].get(), outputs[0].get(), kernal_, kernel_size_[0], kernel_size_[1], support_bias_, &bias_, stride_[0], stride_[1], padding_h_, padding_w_, thread_count);
break;
}
case VALID: {
conv2d_fp32_VALID(inputs[0].get(), outputs[0].get(), kernal_, kernel_size_[0], kernel_size_[1], support_bias_, &bias_,stride_[0], stride_[1], thread_count);
conv2d_fp32_VALID(inputs[0].get(), outputs[0].get(), kernal_, kernel_size_[0], kernel_size_[1], support_bias_, &bias_, stride_[0], stride_[1], thread_count);
break;
}
}
return Op::execute(inputs, outputs);
}
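The disabled blocks above show the commit's im2col route: when padding is 0 and stride equals the kernel size (the typical ViT patch-embedding shape), every k x k x C patch can be flattened into one row of a [num_patches, k*k*C] matrix, turning the convolution into a single GEMM against the reshaped [out_channel, k*k*C] weight, followed by a transpose back to the output layout (the `mat_mul` plus `transpose_fp32` sequence above). A simplified reference of what `im2col_fp32_src_knxn_sn_p0_to` computes, assuming an HWC-contiguous input for clarity (the committed kernel operates on mllm's tensor layout and is NEON-optimized):

```cpp
#include <cstddef>

// Reference im2col for kernel n x n, stride n, padding 0. Input is assumed
// HWC-contiguous [H, W, C]; each n x n patch becomes one row of dst, which
// then has shape [(H/n) * (W/n), n * n * C].
void im2col_knxn_sn_p0_ref(const float *src, float *dst,
                           int H, int W, int C, int n) {
    const int patches_w = W / n;
    for (int ph = 0; ph < H / n; ++ph) {
        for (int pw = 0; pw < patches_w; ++pw) {
            float *row = dst + (size_t)(ph * patches_w + pw) * n * n * C;
            for (int kh = 0; kh < n; ++kh) {
                for (int kw = 0; kw < n; ++kw) {
                    const float *pix =
                        src + ((size_t)(ph * n + kh) * W + (pw * n + kw)) * C;
                    for (int c = 0; c < C; ++c) *row++ = pix[c];
                }
            }
        }
    }
}
```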

ErrorCode CPUConvolution2D::free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {

weight_.free();
return Op::free(inputs, outputs);
}

ErrorCode CPUConvolution2D::setUp(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {

return Op::setUp(inputs, outputs);
}
} // namespace mllm

30 changes: 17 additions & 13 deletions src/backends/cpu/CPUConvolution2D.hpp
@@ -9,13 +9,13 @@ namespace mllm {

class CPUConvolution2D final : public Op {
public:
CPUConvolution2D(Backend *bn, string opName, int in_channel, int out_channel, vector<int> kernal_size, vector<int> stride, PaddingType padding_type, bool bias, int threadCount);
virtual ~CPUConvolution2D() = default;
virtual ErrorCode reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
virtual ErrorCode load(AbstructLoader &loader) override;
virtual ErrorCode execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
virtual ErrorCode free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
virtual ErrorCode setUp(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
CPUConvolution2D(Backend *bn, string opName, int in_channel, int out_channel, vector<int> kernal_size, vector<int> stride, PaddingType padding_type, bool bias, int threadCount);
~CPUConvolution2D() override = default;
ErrorCode reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
ErrorCode load(AbstructLoader &loader) override;
ErrorCode execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
ErrorCode free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
ErrorCode setUp(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;

Tensor &weight() {
return weight_;
@@ -34,21 +34,25 @@ class CPUConvolution2D final : public Op {
Tensor weight_;
Tensor bias_;

float ** kernal_;
bool support_bias_;
#ifdef __ARM_NEON
Tensor im2col_layout_;
Tensor output_not_transposed_;
#endif //! __ARM_NEON

float **kernal_;
bool support_bias_;
};

class CPUConvolution2DCreator : public CPUBackend::Creator {
public:
virtual Op *create(OpParam op_param, Backend *bn, string name, int threadCount) const {
vector<int> kernal_size = {(int)op_param["kernal_h"],(int)op_param["kernal_w"]};
vector<int> stride = {(int)op_param["stride_h"],(int)op_param["stride_w"]};
Op *create(OpParam op_param, Backend *bn, string name, int threadCount) const override {
vector<int> kernal_size = {(int)op_param["kernal_h"], (int)op_param["kernal_w"]};
vector<int> stride = {(int)op_param["stride_h"], (int)op_param["stride_w"]};
int in_channel = op_param["in_channel"];
int out_channel = op_param["out_channel"];
PaddingType padding_type = (PaddingType)op_param["padding"];
bool bias = (bool)op_param["bias"];
return new CPUConvolution2D(bn, name, in_channel, out_channel, kernal_size, stride, padding_type, bias, threadCount);
return new CPUConvolution2D(bn, name, in_channel, out_channel, kernal_size, stride, padding_type, bias, threadCount);
}
};

13 changes: 13 additions & 0 deletions src/backends/cpu/compute/GEMM_AArch64.cpp
@@ -1,6 +1,7 @@
#include "GEMM_AArch64.hpp"
#include "Types.hpp"
#include <assert.h>
#include <cstdlib>
#include <iostream> // needed for the std::cout diagnostics added below
#include <float.h>
#include <math.h>
#include <stdio.h> // for assert
@@ -1212,6 +1213,10 @@ void mllm_gemm_q4_0_4x4_q8_0(int n, float *__restrict s, size_t bs, const void *
const void *__restrict bias) {
if (bias != nullptr) {
#if defined(__ARM_NEON)
std::cout << "_mllm_gemm_q4_0_4x4_q8_0_bias not implemented";
abort();
#endif
_mllm_gemm_q4_0_4x4_q8_0_bias(n, s, bs, vx, vy, nr, nc, bias);
return;
}

@@ -2297,6 +2302,10 @@ void mllm_gemm_q4_0_4x8_q8_0(int n, float *__restrict s, size_t bs, const void *
const void *__restrict vy, int nr, int nc,
const void *__restrict bias) {
if (bias != nullptr) {
#if defined(__ARM_NEON)
std::cout << "_mllm_gemm_q4_0_4x8_q8_0_bias not implemented";
abort();
#endif
_mllm_gemm_q4_0_4x8_q8_0_bias(n, s, bs, vx, vy, nr, nc, bias);
return;
}
@@ -3258,6 +3267,10 @@ void mllm_gemm_q4_0_8x8_q8_0(int n, float *__restrict s, size_t bs, const void *
const void *__restrict vy, int nr, int nc,
const void *__restrict bias) {
if (bias != nullptr) {
#if defined(__ARM_NEON)
std::cout << "_mllm_gemm_q4_0_8x8_q8_0_bias not implemented";
abort();
#endif
_mllm_gemm_q4_0_8x8_q8_0_bias(n, s, bs, vx, vy, nr, nc, bias);
return;
}
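For context on the kernels being guarded: the q4_0/q8_0 names follow the llama.cpp-style block formats, where q4_0 packs 32 weights as 4-bit values offset by 8 under one scale and q8_0 stores 32 int8 values under one scale. A scalar sketch of a single block dot product, assuming those layouts (the structs here are illustrative; the repo's real definitions live in Types.hpp and store the scales as fp16):

```cpp
#include <cstdint>

// Illustrative block layouts; float scales keep the sketch simple, the real
// formats store them as fp16.
struct BlockQ4_0 { float d; uint8_t qs[16]; }; // scale + 32 x 4-bit weights
struct BlockQ8_0 { float d; int8_t qs[32]; };  // scale + 32 x int8 values

// One 32-element block: the low nibble of qs[i] is element i, the high
// nibble is element i + 16, both stored with an offset of 8.
float block_dot_q4_0_q8_0(const BlockQ4_0 *x, const BlockQ8_0 *y) {
    int32_t acc = 0;
    for (int i = 0; i < 16; ++i) {
        const int lo = (int)(x->qs[i] & 0x0F) - 8;
        const int hi = (int)(x->qs[i] >> 4) - 8;
        acc += lo * y->qs[i] + hi * y->qs[i + 16];
    }
    return x->d * y->d * (float)acc;
}
```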