perf: i8 * i8 -> fp32 GEMM Boost && feat: conv2d im2col (#187)
* feat: boost i8 * i8 -> fp32 matmul
* feat: conv2d and im2col algorithm
chenghuaWang authored Nov 22, 2024
1 parent b9772d0 commit 57893c8
Showing 10 changed files with 1,076 additions and 35 deletions.
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -83,6 +83,7 @@ func_vlm_add_executable(demo_imagebind_1mod)
func_vlm_add_executable(demo_phi3v)
# func_vlm_add_executable(demo)

# QNN demo

if(QNN)
func_llm_add_executable(demo_qwen_npu)
3 changes: 3 additions & 0 deletions src/Tensor.hpp
@@ -1760,6 +1760,9 @@ class Tensor {
TensorType &xnnTensorType();

void forceResetHostPointer(void *ptr);

public:
float i8_scale = 1.f;
};
} // namespace mllm
#endif // MLLM_TENSOR_H
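The new public `i8_scale` field is presumably the per-tensor scale consumed by the i8 * i8 -> fp32 GEMM path named in the commit title: both operands are quantized symmetrically to int8, the kernel accumulates in int32, and the two scales convert the accumulator back to fp32 at the end. A minimal scalar sketch of that idea, assuming per-tensor symmetric quantization (`i8_dot_to_fp32` is illustrative, not part of this commit):

```cpp
#include <cstdint>
#include <cstddef>

// Scalar sketch of an i8 * i8 -> fp32 dot product under per-tensor symmetric
// quantization: real_a ~= a_scale * qa, real_b ~= b_scale * qb. Optimized
// kernels accumulate qa * qb in int32 (e.g. via SDOT on AArch64) and apply
// the combined scale once per output element.
float i8_dot_to_fp32(const int8_t *qa, const int8_t *qb, size_t n,
                     float a_scale, float b_scale) {
    int32_t acc = 0;
    for (size_t i = 0; i < n; ++i) {
        acc += (int32_t)qa[i] * (int32_t)qb[i];
    }
    return (float)acc * a_scale * b_scale; // dequantize the int32 accumulator
}
```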
1 change: 1 addition & 0 deletions src/backends/cpu/CMakeLists.txt
@@ -24,6 +24,7 @@ endif()
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
message(STATUS "ARM detected")
add_compile_options(-march=armv8.2-a+dotprod+fp16+fp16fml)
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
message(STATUS "x86_64 detected")
add_compile_options(-mavx2)
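The `+dotprod` extension in the new `-march` flag is presumably what the commit's i8 GEMM speedup leans on: the ARMv8.2 SDOT instruction multiplies four int8 pairs per int32 lane and accumulates them in a single instruction. A minimal sketch of the intrinsic's use, assuming a length that is a multiple of 16 (not code from this commit):

```cpp
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
#include <arm_neon.h>
#include <cstdint>

// Each vdotq_s32 consumes 16 int8 pairs: lane j of the int32 accumulator
// gains the dot product of int8 elements 4j..4j+3 of a and b.
int32_t i8_dot_sdot(const int8_t *a, const int8_t *b, int n) {
    int32x4_t acc = vdupq_n_s32(0);
    for (int i = 0; i < n; i += 16) {
        acc = vdotq_s32(acc, vld1q_s8(a + i), vld1q_s8(b + i));
    }
    return vaddvq_s32(acc); // horizontal add of the four lanes
}
#endif
```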
97 changes: 78 additions & 19 deletions src/backends/cpu/CPUConvolution2D.cpp
@@ -2,10 +2,14 @@
#include "CPUConvolution2D.hpp"
#include "compute/Convolution.hpp"

#include "compute/Matmul.hpp"
#include "compute/Im2Col.hpp"

namespace mllm {

CPUConvolution2D::CPUConvolution2D(Backend *bn, string opName, int in_channel, int out_channel, vector<int> kernal_size, vector<int> stride, PaddingType padding_type, bool bias, int threadCount) : thread_count(threadCount),
Op(bn, opName) {
CPUConvolution2D::CPUConvolution2D(Backend *bn, string opName, int in_channel, int out_channel, vector<int> kernal_size, vector<int> stride, PaddingType padding_type, bool bias, int threadCount) :
thread_count(threadCount),
Op(bn, opName) {
kernel_size_[0] = kernal_size[0];
kernel_size_[1] = kernal_size[1];
stride_[0] = stride[0];
@@ -16,48 +20,81 @@ Op(bn, opName) {
support_bias_ = bias;
weight_.setBackend(bn);
bias_.setBackend(bn);

#ifdef __ARM_NEON
im2col_layout_.setBackend(bn);
output_not_transposed_.setBackend(bn);
#endif //! __ARM_NEON
}

ErrorCode CPUConvolution2D::reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
//batch = batch
//sequence = out_channel
//head = height
//dimension = width
// batch = batch
// sequence = out_channel
// head = height
// dimension = width
assert(in_channel_ == inputs[0]->sequence());

// #ifdef __ARM_NEON
// if (kernel_size_[0] == 16 && kernel_size_[1] == 16 && padding_h_ == 0 && padding_w_ == 0 && stride_[0] == 16 && stride_[1] == 16) {
// im2col_layout_.setDtype(inputs[0]->dtype());
// im2col_layout_.reshape(inputs[0]->batch(), 1, (inputs[0]->head() / 16) * (inputs[0]->dimension() / 16), 16 * 16 * in_channel_);
// im2col_layout_.alloc();
// output_not_transposed_.setDtype(inputs[0]->dtype());
// output_not_transposed_.reshape(inputs[0]->batch(), 1, (inputs[0]->head() / 16) * (inputs[0]->dimension() / 16), out_channel_);
// output_not_transposed_.alloc();
// outputs[0]->reshape(inputs[0]->batch(), (inputs[0]->head() / 16), out_channel_, (inputs[0]->dimension() / 16));
// return Op::reshape(inputs, outputs);
// }

// if (kernel_size_[0] == kernel_size_[1] && kernel_size_[0] == stride_[0] && kernel_size_[1] == stride_[1] && padding_h_ == 0 && padding_w_ == 0) {
// im2col_layout_.setDtype(inputs[0]->dtype());
// im2col_layout_.reshape(inputs[0]->batch(), 1, (inputs[0]->head() / kernel_size_[0]) * (inputs[0]->dimension() / kernel_size_[0]), kernel_size_[0] * kernel_size_[0] * in_channel_);
// im2col_layout_.alloc();
// output_not_transposed_.setDtype(inputs[0]->dtype());
// output_not_transposed_.reshape(inputs[0]->batch(), 1, (inputs[0]->head() / kernel_size_[0]) * (inputs[0]->dimension() / kernel_size_[0]), out_channel_);
// output_not_transposed_.alloc();
// outputs[0]->reshape(inputs[0]->batch(), (inputs[0]->head() / kernel_size_[0]), out_channel_, (inputs[0]->dimension() / kernel_size_[0]));
// return Op::reshape(inputs, outputs);
// }
// #endif

switch (padding_type_) {
case SAME:{
case SAME: {
padding_h_ = (kernel_size_[0] - 1) / 2;
padding_w_ = (kernel_size_[1] - 1) / 2;
const int out_height = (inputs[0]->head() + 2 * padding_h_ - kernel_size_[0]) / stride_[0] + 1;
const int out_width = (inputs[0]->dimension() + 2 * padding_w_ - kernel_size_[1]) / stride_[1] + 1;
outputs[0]->reshape(inputs[0]->batch(),out_height, out_channel_, out_width);
outputs[0]->reshape(inputs[0]->batch(), out_height, out_channel_, out_width);
break;
}
case VALID:{
}
case VALID: {
padding_h_ = 0;
padding_w_ = 0;
const int out_height = (inputs[0]->head() - kernel_size_[0]) / stride_[0] + 1;
const int out_width = (inputs[0]->dimension()- kernel_size_[1]) / stride_[1] + 1;
outputs[0]->reshape(inputs[0]->batch(),out_height, out_channel_, out_width);
const int out_width = (inputs[0]->dimension() - kernel_size_[1]) / stride_[1] + 1;
outputs[0]->reshape(inputs[0]->batch(), out_height, out_channel_, out_width);
break;
}
}
}
return Op::reshape(inputs, outputs);
}
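Both branches apply the standard output-size formula out = (in + 2 * pad - kernel) / stride + 1. For the ViT-style patch embedding targeted by the disabled fast path above, a hypothetical 224x224 input with a 16x16 kernel, stride 16, and VALID padding gives (224 - 16)/16 + 1 = 14 patches per side, i.e. a 14x14 output grid.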

ErrorCode CPUConvolution2D::load(AbstructLoader &loader) {

weight_.setName(name() + ".weight");
weight_.reshape(out_channel_, kernel_size_[0], in_channel_, kernel_size_[1]);
if (loader.getDataType(weight_.name()) != MLLM_TYPE_COUNT) {
weight_.setDtype(loader.getDataType(weight_.name()));
weight_.alloc();
loader.load(&weight_);
// #ifndef __ARM_NEON
kernal_ = reshape_conv2d_kernal_fp32(&weight_);
// #endif
} else {
weight_.setDtype(MLLM_TYPE_F32);
weight_.alloc();
// #ifndef __ARM_NEON
kernal_ = reshape_conv2d_kernal_fp32(&weight_);
// #endif
}
if (support_bias_) {
bias_.setName(name() + ".bias");
@@ -75,29 +112,51 @@ ErrorCode CPUConvolution2D::load(AbstructLoader &loader) {
}

ErrorCode CPUConvolution2D::execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
// #ifdef __ARM_NEON
// if (kernel_size_[0] == 16 && kernel_size_[1] == 16 && padding_h_ == 0 && padding_w_ == 0 && stride_[0] == 16 && stride_[1] == 16) {
// auto start = std::chrono::high_resolution_clock::now();
// im2col_fp32_src_k16x16_s16_p0_to(inputs[0]->rawHostPtr(), im2col_layout_.rawHostPtr(), inputs[0]->head(), inputs[0]->dimension(), in_channel_);
// weight_.reshape(1, 1, out_channel_, 16 * 16 * in_channel_);
// mat_mul(&im2col_layout_, &weight_, &output_not_transposed_, true, &bias_, false, true, thread_count);
// transpose_fp32(output_not_transposed_.rawHostPtr(), outputs[0]->rawHostPtr(), (inputs[0]->head() / 16) * ((inputs[0]->dimension() / 16)), out_channel_);
// auto end = std::chrono::high_resolution_clock::now();
// auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
// std::cout << duration.count() << std::endl;
// return Op::execute(inputs, outputs);
// }

// if (kernel_size_[0] == kernel_size_[1] && kernel_size_[0] == stride_[0] && kernel_size_[1] == stride_[1] && padding_h_ == 0 && padding_w_ == 0) {
// auto start = std::chrono::high_resolution_clock::now();
// im2col_fp32_src_knxn_sn_p0_to(inputs[0]->rawHostPtr(), im2col_layout_.rawHostPtr(), inputs[0]->head(), inputs[0]->dimension(), in_channel_, kernel_size_[0]);
// weight_.reshape(1, 1, out_channel_, kernel_size_[0] * kernel_size_[0] * in_channel_);
// mat_mul(&im2col_layout_, &weight_, &output_not_transposed_, true, &bias_, false, true, thread_count);
// transpose_fp32(output_not_transposed_.rawHostPtr(), outputs[0]->rawHostPtr(), (inputs[0]->head() / kernel_size_[0]) * ((inputs[0]->dimension() / kernel_size_[0])), out_channel_);
// auto end = std::chrono::high_resolution_clock::now();
// auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
// std::cout << duration.count() << std::endl;
// return Op::execute(inputs, outputs);
// }
// #endif

switch (padding_type_) {
case SAME:{
case SAME: {
conv2d_fp32_SAME(inputs[0].get(), outputs[0].get(), kernal_, kernel_size_[0], kernel_size_[1], support_bias_, &bias_, stride_[0], stride_[1], padding_h_, padding_w_, thread_count);
break;
}
case VALID: {
conv2d_fp32_VALID(inputs[0].get(), outputs[0].get(), kernal_, kernel_size_[0], kernel_size_[1], support_bias_, &bias_,stride_[0], stride_[1], thread_count);
conv2d_fp32_VALID(inputs[0].get(), outputs[0].get(), kernal_, kernel_size_[0], kernel_size_[1], support_bias_, &bias_, stride_[0], stride_[1], thread_count);
break;
}
}
return Op::execute(inputs, outputs);
}
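The disabled blocks above show the commit's im2col route: when padding is 0 and stride equals the kernel size (the typical ViT patch-embedding shape), every k x k x C patch can be flattened into one row of a [num_patches, k*k*C] matrix, turning the convolution into a single GEMM against the reshaped [out_channel, k*k*C] weight, followed by a transpose back to the output layout (the `mat_mul` plus `transpose_fp32` sequence above). A simplified reference of what `im2col_fp32_src_knxn_sn_p0_to` computes, assuming an HWC-contiguous input for clarity (the committed kernel operates on mllm's tensor layout and is NEON-optimized):

```cpp
#include <cstddef>

// Reference im2col for kernel n x n, stride n, padding 0. Input is assumed
// HWC-contiguous [H, W, C]; each n x n patch becomes one row of dst, which
// then has shape [(H/n) * (W/n), n * n * C].
void im2col_knxn_sn_p0_ref(const float *src, float *dst,
                           int H, int W, int C, int n) {
    const int patches_w = W / n;
    for (int ph = 0; ph < H / n; ++ph) {
        for (int pw = 0; pw < patches_w; ++pw) {
            float *row = dst + (size_t)(ph * patches_w + pw) * n * n * C;
            for (int kh = 0; kh < n; ++kh) {
                for (int kw = 0; kw < n; ++kw) {
                    const float *pix =
                        src + ((size_t)(ph * n + kh) * W + (pw * n + kw)) * C;
                    for (int c = 0; c < C; ++c) *row++ = pix[c];
                }
            }
        }
    }
}
```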

ErrorCode CPUConvolution2D::free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {

weight_.free();
return Op::free(inputs, outputs);
}

ErrorCode CPUConvolution2D::setUp(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {

return Op::setUp(inputs, outputs);
}
} // namespace mllm

30 changes: 17 additions & 13 deletions src/backends/cpu/CPUConvolution2D.hpp
@@ -9,13 +9,13 @@ namespace mllm {

class CPUConvolution2D final : public Op {
public:
CPUConvolution2D(Backend *bn, string opName, int in_channel, int out_channel, vector<int> kernal_size, vector<int> stride, PaddingType padding_type, bool bias, int threadCount);
virtual ~CPUConvolution2D() = default;
virtual ErrorCode reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
virtual ErrorCode load(AbstructLoader &loader) override;
virtual ErrorCode execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
virtual ErrorCode free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
virtual ErrorCode setUp(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
CPUConvolution2D(Backend *bn, string opName, int in_channel, int out_channel, vector<int> kernal_size, vector<int> stride, PaddingType padding_type, bool bias, int threadCount);
~CPUConvolution2D() override = default;
ErrorCode reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
ErrorCode load(AbstructLoader &loader) override;
ErrorCode execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
ErrorCode free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
ErrorCode setUp(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;

Tensor &weight() {
return weight_;
@@ -34,21 +34,25 @@ class CPUConvolution2D final : public Op {
Tensor weight_;
Tensor bias_;

float ** kernal_;
bool support_bias_;
#ifdef __ARM_NEON
Tensor im2col_layout_;
Tensor output_not_transposed_;
#endif //! __ARM_NEON

float **kernal_;
bool support_bias_;
};

class CPUConvolution2DCreator : public CPUBackend::Creator {
public:
virtual Op *create(OpParam op_param, Backend *bn, string name, int threadCount) const {
vector<int> kernal_size = {(int)op_param["kernal_h"],(int)op_param["kernal_w"]};
vector<int> stride = {(int)op_param["stride_h"],(int)op_param["stride_w"]};
Op *create(OpParam op_param, Backend *bn, string name, int threadCount) const override {
vector<int> kernal_size = {(int)op_param["kernal_h"], (int)op_param["kernal_w"]};
vector<int> stride = {(int)op_param["stride_h"], (int)op_param["stride_w"]};
int in_channel = op_param["in_channel"];
int out_channel = op_param["out_channel"];
PaddingType padding_type = (PaddingType)op_param["padding"];
bool bias = (bool)op_param["bias"];
return new CPUConvolution2D(bn, name, in_channel, out_channel, kernal_size, stride, padding_type, bias, threadCount);
return new CPUConvolution2D(bn, name, in_channel, out_channel, kernal_size, stride, padding_type, bias, threadCount);
}
};

13 changes: 13 additions & 0 deletions src/backends/cpu/compute/GEMM_AArch64.cpp
@@ -1,6 +1,7 @@
#include "GEMM_AArch64.hpp"
#include "Types.hpp"
#include <assert.h>
#include <cstdlib>
#include <iostream> // needed for the std::cout diagnostics added below
#include <float.h>
#include <math.h>
#include <stdio.h> // for assert
@@ -1212,6 +1213,10 @@ void mllm_gemm_q4_0_4x4_q8_0(int n, float *__restrict s, size_t bs, const void *
const void *__restrict bias) {
if (bias != nullptr) {
#if defined(__ARM_NEON)
std::cout << "_mllm_gemm_q4_0_4x4_q8_0_bias not implemented";
abort();
#endif
_mllm_gemm_q4_0_4x4_q8_0_bias(n, s, bs, vx, vy, nr, nc, bias);
return;
}

@@ -2297,6 +2302,10 @@ void mllm_gemm_q4_0_4x8_q8_0(int n, float *__restrict s, size_t bs, const void *
const void *__restrict vy, int nr, int nc,
const void *__restrict bias) {
if (bias != nullptr) {
#if defined(__ARM_NEON)
std::cout << "_mllm_gemm_q4_0_4x8_q8_0_bias not implemented";
abort();
#endif
_mllm_gemm_q4_0_4x8_q8_0_bias(n, s, bs, vx, vy, nr, nc, bias);
return;
}
@@ -3258,6 +3267,10 @@ void mllm_gemm_q4_0_8x8_q8_0(int n, float *__restrict s, size_t bs, const void *
const void *__restrict vy, int nr, int nc,
const void *__restrict bias) {
if (bias != nullptr) {
#if defined(__ARM_NEON)
std::cout << "_mllm_gemm_q4_0_8x8_q8_0_bias not implemented";
abort();
#endif
_mllm_gemm_q4_0_8x8_q8_0_bias(n, s, bs, vx, vy, nr, nc, bias);
return;
}
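For context on the kernels being guarded: the q4_0/q8_0 names follow the llama.cpp-style block formats, where q4_0 packs 32 weights as 4-bit values offset by 8 under one scale and q8_0 stores 32 int8 values under one scale. A scalar sketch of a single block dot product, assuming those layouts (the structs here are illustrative; the repo's real definitions live in Types.hpp and store the scales as fp16):

```cpp
#include <cstdint>

// Illustrative block layouts; float scales keep the sketch simple, the real
// formats store them as fp16.
struct BlockQ4_0 { float d; uint8_t qs[16]; }; // scale + 32 x 4-bit weights
struct BlockQ8_0 { float d; int8_t qs[32]; };  // scale + 32 x int8 values

// One 32-element block: the low nibble of qs[i] is element i, the high
// nibble is element i + 16, both stored with an offset of 8.
float block_dot_q4_0_q8_0(const BlockQ4_0 *x, const BlockQ8_0 *y) {
    int32_t acc = 0;
    for (int i = 0; i < 16; ++i) {
        const int lo = (int)(x->qs[i] & 0x0F) - 8;
        const int hi = (int)(x->qs[i] >> 4) - 8;
        acc += lo * y->qs[i] + hi * y->qs[i + 16];
    }
    return x->d * y->d * (float)acc;
}
```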