
perf: i8 * i8 -> fp32 GEMM Boost #187

Merged: 11 commits, Nov 22, 2024
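
This PR's core idea, keeping the multiply-accumulate in int8/int32 registers and converting to fp32 only at the end, maps directly onto the ARMv8.2 SDOT instruction that the CMake change below enables. As a minimal sketch (not the PR's actual kernel; the function name and the per-tensor-scale dequantization are assumptions for illustration):

    #include <arm_neon.h>
    #include <cstddef>
    #include <cstdint>

    // Dot product of two i8 vectors of length k (k % 16 == 0), dequantized to fp32.
    // Needs a dot-product capable target, e.g. -march=armv8.2-a+dotprod.
    static float i8_dot_to_fp32(const int8_t *lhs, const int8_t *rhs, size_t k,
                                float lhs_scale, float rhs_scale) {
        int32x4_t acc = vdupq_n_s32(0);
        for (size_t i = 0; i < k; i += 16) {
            int8x16_t a = vld1q_s8(lhs + i);
            int8x16_t b = vld1q_s8(rhs + i);
            acc = vdotq_s32(acc, a, b); // four i8*i8 products summed into each i32 lane
        }
        return (float)vaddvq_s32(acc) * lhs_scale * rhs_scale; // i32 accumulator -> fp32 once
    }

A real GEMM kernel tiles this over rows and columns of the output, but the key property is the same: int8 products accumulate in int32 lanes, and the fp32 conversion happens once per output element.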
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -83,6 +83,7 @@ func_vlm_add_executable(demo_imagebind_1mod)
func_vlm_add_executable(demo_phi3v)
# func_vlm_add_executable(demo)

# QNN demo

if(QNN)
func_llm_add_executable(demo_qwen_npu)
3 changes: 3 additions & 0 deletions src/Tensor.hpp
@@ -1760,6 +1760,9 @@ class Tensor {
TensorType &xnnTensorType();

void forceResetHostPointer(void *ptr);

public:
float i8_scale = 1.f;
};
} // namespace mllm
#endif // MLLM_TENSOR_H
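
The new public i8_scale member gives every Tensor a per-tensor dequantization factor for the i8 path. A hedged sketch of how such a scale is commonly produced, assuming symmetric per-tensor quantization (the helper below is hypothetical, not an mllm API):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Symmetric per-tensor quantization: returns the scale to store in Tensor::i8_scale.
    static float quantize_to_i8(const std::vector<float> &src, std::vector<int8_t> &dst) {
        float max_abs = 0.f;
        for (float v : src) max_abs = std::max(max_abs, std::fabs(v));
        const float scale = max_abs > 0.f ? max_abs / 127.f : 1.f; // maps [-max|x|, max|x|] to [-127, 127]
        dst.resize(src.size());
        for (size_t i = 0; i < src.size(); ++i)
            dst[i] = (int8_t)std::lround(std::clamp(src[i] / scale, -127.f, 127.f));
        return scale; // after an i8 GEMM: out_fp32 = acc_i32 * A.i8_scale * B.i8_scale
    }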
1 change: 1 addition & 0 deletions src/backends/cpu/CMakeLists.txt
@@ -24,6 +24,7 @@ endif()
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
message(STATUS "ARM detected")
add_compile_options(-march=armv8.2-a+dotprod+fp16+fp16fml)
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
message(STATUS "x86_64 detected")
add_compile_options(-mavx2)
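
The -march=armv8.2-a+dotprod+fp16+fp16fml flag added above is what exposes the SDOT/UDOT and fp16 intrinsics the new kernels rely on. A quick stand-alone check that the compiler actually advertises those features (illustrative only, not part of the build):

    #include <cstdio>

    int main() {
    #if defined(__ARM_FEATURE_DOTPROD) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
        std::puts("dotprod + fp16 available: the i8 GEMM fast path can be compiled");
    #else
        std::puts("dotprod/fp16 not advertised: a generic fallback kernel would be needed");
    #endif
        return 0;
    }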
97 changes: 78 additions & 19 deletions src/backends/cpu/CPUConvolution2D.cpp
@@ -2,10 +2,14 @@
#include "CPUConvolution2D.hpp"
#include "compute/Convolution.hpp"

#include "compute/Matmul.hpp"
#include "compute/Im2Col.hpp"

namespace mllm {

CPUConvolution2D::CPUConvolution2D(Backend *bn, string opName, int in_channel, int out_channel, vector<int> kernal_size, vector<int> stride, PaddingType padding_type, bool bias, int threadCount) : thread_count(threadCount),
Op(bn, opName) {
CPUConvolution2D::CPUConvolution2D(Backend *bn, string opName, int in_channel, int out_channel, vector<int> kernal_size, vector<int> stride, PaddingType padding_type, bool bias, int threadCount) :
thread_count(threadCount),
Op(bn, opName) {
kernel_size_[0] = kernal_size[0];
kernel_size_[1] = kernal_size[1];
stride_[0] = stride[0];
@@ -16,48 +16,81 @@ Op(bn, opName) {
support_bias_ = bias;
weight_.setBackend(bn);
bias_.setBackend(bn);

#ifdef __ARM_NEON
im2col_layout_.setBackend(bn);
output_not_transposed_.setBackend(bn);
#endif //! __ARM_NEON
}

ErrorCode CPUConvolution2D::reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
//batch = batch
//sequence = out_channel
//head = height
//dimension = width
// batch = batch
// sequence = out_channel
// head = height
// dimension = width
assert(in_channel_ == inputs[0]->sequence());

// #ifdef __ARM_NEON
// if (kernel_size_[0] == 16 && kernel_size_[1] == 16 && padding_h_ == 0 && padding_w_ == 0 && stride_[0] == 16 && stride_[1] == 16) {
// im2col_layout_.setDtype(inputs[0]->dtype());
// im2col_layout_.reshape(inputs[0]->batch(), 1, (inputs[0]->head() / 16) * (inputs[0]->dimension() / 16), 16 * 16 * in_channel_);
// im2col_layout_.alloc();
// output_not_transposed_.setDtype(inputs[0]->dtype());
// output_not_transposed_.reshape(inputs[0]->batch(), 1, (inputs[0]->head() / 16) * (inputs[0]->dimension() / 16), out_channel_);
// output_not_transposed_.alloc();
// outputs[0]->reshape(inputs[0]->batch(), (inputs[0]->head() / 16), out_channel_, (inputs[0]->dimension() / 16));
// return Op::reshape(inputs, outputs);
// }

// if (kernel_size_[0] == kernel_size_[1] && kernel_size_[0] == stride_[0] && kernel_size_[1] == stride_[1] && padding_h_ == 0 && padding_w_ == 0) {
// im2col_layout_.setDtype(inputs[0]->dtype());
// im2col_layout_.reshape(inputs[0]->batch(), 1, (inputs[0]->head() / kernel_size_[0]) * (inputs[0]->dimension() / kernel_size_[0]), kernel_size_[0] * kernel_size_[0] * in_channel_);
// im2col_layout_.alloc();
// output_not_transposed_.setDtype(inputs[0]->dtype());
// output_not_transposed_.reshape(inputs[0]->batch(), 1, (inputs[0]->head() / kernel_size_[0]) * (inputs[0]->dimension() / kernel_size_[0]), out_channel_);
// output_not_transposed_.alloc();
// outputs[0]->reshape(inputs[0]->batch(), (inputs[0]->head() / kernel_size_[0]), out_channel_, (inputs[0]->dimension() / kernel_size_[0]));
// return Op::reshape(inputs, outputs);
// }
// #endif

switch (padding_type_) {
case SAME:{
case SAME: {
padding_h_ = (kernel_size_[0] - 1) / 2;
padding_w_ = (kernel_size_[1] - 1) / 2;
const int out_height = (inputs[0]->head() + 2 * padding_h_ - kernel_size_[0]) / stride_[0] + 1;
const int out_width = (inputs[0]->dimension() + 2 * padding_w_ - kernel_size_[1]) / stride_[1] + 1;
outputs[0]->reshape(inputs[0]->batch(),out_height, out_channel_, out_width);
outputs[0]->reshape(inputs[0]->batch(), out_height, out_channel_, out_width);
break;
}
case VALID:{
}
case VALID: {
padding_h_ = 0;
padding_w_ = 0;
const int out_height = (inputs[0]->head() - kernel_size_[0]) / stride_[0] + 1;
const int out_width = (inputs[0]->dimension()- kernel_size_[1]) / stride_[1] + 1;
outputs[0]->reshape(inputs[0]->batch(),out_height, out_channel_, out_width);
const int out_width = (inputs[0]->dimension() - kernel_size_[1]) / stride_[1] + 1;
outputs[0]->reshape(inputs[0]->batch(), out_height, out_channel_, out_width);
break;
}
}
}
return Op::reshape(inputs, outputs);
}

ErrorCode CPUConvolution2D::load(AbstructLoader &loader) {

weight_.setName(name() + ".weight");
weight_.reshape(out_channel_, kernel_size_[0], in_channel_, kernel_size_[1]);
if (loader.getDataType(weight_.name()) != MLLM_TYPE_COUNT) {
weight_.setDtype(loader.getDataType(weight_.name()));
weight_.alloc();
loader.load(&weight_);
// #ifndef __ARM_NEON
kernal_ = reshape_conv2d_kernal_fp32(&weight_);
// #endif
} else {
weight_.setDtype(MLLM_TYPE_F32);
weight_.alloc();
// #ifndef __ARM_NEON
kernal_ = reshape_conv2d_kernal_fp32(&weight_);
// #endif
}
if (support_bias_) {
bias_.setName(name() + ".bias");
@@ -75,29 +75,51 @@ ErrorCode CPUConvolution2D::load(AbstructLoader &loader) {
}

ErrorCode CPUConvolution2D::execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
// #ifdef __ARM_NEON
// if (kernel_size_[0] == 16 && kernel_size_[1] == 16 && padding_h_ == 0 && padding_w_ == 0 && stride_[0] == 16 && stride_[1] == 16) {
// auto start = std::chrono::high_resolution_clock::now();
// im2col_fp32_src_k16x16_s16_p0_to(inputs[0]->rawHostPtr(), im2col_layout_.rawHostPtr(), inputs[0]->head(), inputs[0]->dimension(), in_channel_);
// weight_.reshape(1, 1, out_channel_, 16 * 16 * in_channel_);
// mat_mul(&im2col_layout_, &weight_, &output_not_transposed_, true, &bias_, false, true, thread_count);
// transpose_fp32(output_not_transposed_.rawHostPtr(), outputs[0]->rawHostPtr(), (inputs[0]->head() / 16) * ((inputs[0]->dimension() / 16)), out_channel_);
// auto end = std::chrono::high_resolution_clock::now();
// auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
// std::cout << duration.count() << std::endl;
// return Op::execute(inputs, outputs);
// }

// if (kernel_size_[0] == kernel_size_[1] && kernel_size_[0] == stride_[0] && kernel_size_[1] == stride_[1] && padding_h_ == 0 && padding_w_ == 0) {
// auto start = std::chrono::high_resolution_clock::now();
// im2col_fp32_src_knxn_sn_p0_to(inputs[0]->rawHostPtr(), im2col_layout_.rawHostPtr(), inputs[0]->head(), inputs[0]->dimension(), in_channel_, kernel_size_[0]);
// weight_.reshape(1, 1, out_channel_, kernel_size_[0] * kernel_size_[0] * in_channel_);
// mat_mul(&im2col_layout_, &weight_, &output_not_transposed_, true, &bias_, false, true, thread_count);
// transpose_fp32(output_not_transposed_.rawHostPtr(), outputs[0]->rawHostPtr(), (inputs[0]->head() / kernel_size_[0]) * ((inputs[0]->dimension() / kernel_size_[0])), out_channel_);
// auto end = std::chrono::high_resolution_clock::now();
// auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
// std::cout << duration.count() << std::endl;
// return Op::execute(inputs, outputs);
// }
// #endif

switch (padding_type_) {
case SAME:{
case SAME: {
conv2d_fp32_SAME(inputs[0].get(), outputs[0].get(), kernal_, kernel_size_[0], kernel_size_[1], support_bias_, &bias_, stride_[0], stride_[1], padding_h_, padding_w_, thread_count);
break;
}
case VALID: {
conv2d_fp32_VALID(inputs[0].get(), outputs[0].get(), kernal_, kernel_size_[0], kernel_size_[1], support_bias_, &bias_,stride_[0], stride_[1], thread_count);
conv2d_fp32_VALID(inputs[0].get(), outputs[0].get(), kernal_, kernel_size_[0], kernel_size_[1], support_bias_, &bias_, stride_[0], stride_[1], thread_count);
break;
}
}
return Op::execute(inputs, outputs);
}

ErrorCode CPUConvolution2D::free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {

weight_.free();
return Op::free(inputs, outputs);
}

ErrorCode CPUConvolution2D::setUp(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {

return Op::setUp(inputs, outputs);
}
} // namespace mllm
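
The commented-out blocks in reshape() and execute() above sketch an alternative fast path: when the kernel size equals the stride and there is no padding, the patches do not overlap, so im2col turns the convolution into a single [patches x k*k*C] by [k*k*C x out_channel] GEMM plus a transpose, using the helpers from compute/Im2Col.hpp and compute/Matmul.hpp. A minimal sketch of that im2col step, with an HWC layout assumed for illustration (the real mllm layout may differ):

    #include <cstddef>

    // Non-overlapping im2col (kernel == stride, padding 0): each k x k patch becomes
    // one row of length k*k*C, ready to be multiplied against the reshaped weights.
    static void im2col_knxn_sn_p0(const float *src, float *dst,
                                  size_t H, size_t W, size_t C, size_t k) {
        const size_t out_h = H / k, out_w = W / k;
        for (size_t oy = 0; oy < out_h; ++oy)
            for (size_t ox = 0; ox < out_w; ++ox) {
                float *row = dst + (oy * out_w + ox) * k * k * C;
                for (size_t ky = 0; ky < k; ++ky)
                    for (size_t kx = 0; kx < k; ++kx)
                        for (size_t c = 0; c < C; ++c)
                            // source pixel (oy*k + ky, ox*k + kx), channel c
                            row[(ky * k + kx) * C + c] =
                                src[((oy * k + ky) * W + (ox * k + kx)) * C + c];
            }
    }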

30 changes: 17 additions & 13 deletions src/backends/cpu/CPUConvolution2D.hpp
@@ -9,13 +9,13 @@ namespace mllm {

class CPUConvolution2D final : public Op {
public:
CPUConvolution2D(Backend *bn, string opName, int in_channel, int out_channel, vector<int> kernal_size, vector<int> stride, PaddingType padding_type, bool bias, int threadCount);
virtual ~CPUConvolution2D() = default;
virtual ErrorCode reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
virtual ErrorCode load(AbstructLoader &loader) override;
virtual ErrorCode execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
virtual ErrorCode free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
virtual ErrorCode setUp(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
CPUConvolution2D(Backend *bn, string opName, int in_channel, int out_channel, vector<int> kernal_size, vector<int> stride, PaddingType padding_type, bool bias, int threadCount);
~CPUConvolution2D() override = default;
ErrorCode reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
ErrorCode load(AbstructLoader &loader) override;
ErrorCode execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
ErrorCode free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
ErrorCode setUp(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;

Tensor &weight() {
return weight_;
@@ -34,21 +34,25 @@ class CPUConvolution2D final : public Op {
Tensor weight_;
Tensor bias_;

float ** kernal_;
bool support_bias_;
#ifdef __ARM_NEON
Tensor im2col_layout_;
Tensor output_not_transposed_;
#endif //! __ARM_NEON

float **kernal_;
bool support_bias_;
};

class CPUConvolution2DCreator : public CPUBackend::Creator {
public:
virtual Op *create(OpParam op_param, Backend *bn, string name, int threadCount) const {
vector<int> kernal_size = {(int)op_param["kernal_h"],(int)op_param["kernal_w"]};
vector<int> stride = {(int)op_param["stride_h"],(int)op_param["stride_w"]};
Op *create(OpParam op_param, Backend *bn, string name, int threadCount) const override {
vector<int> kernal_size = {(int)op_param["kernal_h"], (int)op_param["kernal_w"]};
vector<int> stride = {(int)op_param["stride_h"], (int)op_param["stride_w"]};
int in_channel = op_param["in_channel"];
int out_channel = op_param["out_channel"];
PaddingType padding_type = (PaddingType)op_param["padding"];
bool bias = (bool)op_param["bias"];
return new CPUConvolution2D(bn, name, in_channel, out_channel, kernal_size, stride, padding_type, bias, threadCount);
return new CPUConvolution2D(bn, name, in_channel, out_channel, kernal_size, stride, padding_type, bias, threadCount);
}
};

13 changes: 13 additions & 0 deletions src/backends/cpu/compute/GEMM_AArch64.cpp
@@ -1,6 +1,7 @@
#include "GEMM_AArch64.hpp"
#include "Types.hpp"
#include <assert.h>
#include <cstdlib>
#include <iostream> // std::cout is used by the not-implemented guards added below
#include <float.h>
#include <math.h>
#include <stdio.h> // for assert
@@ -1212,6 +1213,10 @@ void mllm_gemm_q4_0_4x4_q8_0(int n, float *__restrict s, size_t bs, const void *
const void *__restrict bias) {
if (bias != nullptr) {
_mllm_gemm_q4_0_4x4_q8_0_bias(n, s, bs, vx, vy, nr, nc, bias);
#if defined(__ARM_NEON)
std::cout << "_mllm_gemm_q4_0_4x4_q8_0_bias not implemented";
abort();
#endif
return;
}

@@ -2297,6 +2302,10 @@ void mllm_gemm_q4_0_4x8_q8_0(int n, float *__restrict s, size_t bs, const void *
const void *__restrict vy, int nr, int nc,
const void *__restrict bias) {
if (bias != nullptr) {
#if defined(__ARM_NEON)
std::cout << "_mllm_gemm_q4_0_4x4_q8_0_bias not implemented";
abort();
#endif
_mllm_gemm_q4_0_4x8_q8_0_bias(n, s, bs, vx, vy, nr, nc, bias);
return;
}
@@ -3258,6 +3267,10 @@ void mllm_gemm_q4_0_8x8_q8_0(int n, float *__restrict s, size_t bs, const void *
const void *__restrict vy, int nr, int nc,
const void *__restrict bias) {
if (bias != nullptr) {
#if defined(__ARM_NEON)
std::cout << "_mllm_gemm_q4_0_4x4_q8_0_bias not implemented";
abort();
#endif
_mllm_gemm_q4_0_8x8_q8_0_bias(n, s, bs, vx, vy, nr, nc, bias);
return;
}
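
The guards added above abort when a bias pointer reaches these q4_0 x q8_0 kernels on NEON, since the fused-bias variants are not implemented on that path. A hedged sketch of the usual workaround, not code from this PR: call the no-bias GEMM and add the bias to the fp32 result afterwards.

    #include <cstddef>

    // Add one bias value per output column to a rows x cols fp32 result.
    static void add_bias_rows(float *s, size_t rows, size_t cols, const float *bias) {
        for (size_t r = 0; r < rows; ++r)
            for (size_t c = 0; c < cols; ++c)
                s[r * cols + c] += bias[c];
    }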