From 7324f00bdb32907aee184995f7f6fd5e95947447 Mon Sep 17 00:00:00 2001 From: yirongjie Date: Tue, 16 Jul 2024 08:58:54 +0000 Subject: [PATCH 1/7] fix: remove type/type -> VecDotType --- src/backends/cpu/CPUPredictor.cpp | 2 +- src/backends/cpu/compute/Matmul.cpp | 2 +- src/backends/cpu/compute/MatmulElastic.cpp | 320 ++++++++++++++++++ src/backends/cpu/compute/MatmulElastic.hpp | 18 + src/backends/cpu/compute/VecDot.cpp | 71 +++- src/backends/cpu/compute/VecDot.hpp | 20 +- .../{type/type.cpp => compute/VecDotType.cpp} | 194 +---------- .../{type/type.hpp => compute/VecDotType.hpp} | 0 8 files changed, 440 insertions(+), 187 deletions(-) create mode 100644 src/backends/cpu/compute/MatmulElastic.cpp create mode 100644 src/backends/cpu/compute/MatmulElastic.hpp rename src/backends/cpu/{type/type.cpp => compute/VecDotType.cpp} (64%) rename src/backends/cpu/{type/type.hpp => compute/VecDotType.hpp} (100%) diff --git a/src/backends/cpu/CPUPredictor.cpp b/src/backends/cpu/CPUPredictor.cpp index c84168e1..c852fe1a 100644 --- a/src/backends/cpu/CPUPredictor.cpp +++ b/src/backends/cpu/CPUPredictor.cpp @@ -1,6 +1,6 @@ #include "CPUPredictor.hpp" -#include "type/type.hpp" +#include "compute/VecDotType.hpp" #include "compute/Matmul.hpp" #include diff --git a/src/backends/cpu/compute/Matmul.cpp b/src/backends/cpu/compute/Matmul.cpp index 09387bc6..3382721f 100644 --- a/src/backends/cpu/compute/Matmul.cpp +++ b/src/backends/cpu/compute/Matmul.cpp @@ -3,7 +3,7 @@ // #include "Matmul.hpp" -#include "type/type.hpp" +#include "VecDotType.hpp" #include #define ASSERT(x) \ diff --git a/src/backends/cpu/compute/MatmulElastic.cpp b/src/backends/cpu/compute/MatmulElastic.cpp new file mode 100644 index 00000000..70049cb4 --- /dev/null +++ b/src/backends/cpu/compute/MatmulElastic.cpp @@ -0,0 +1,320 @@ +// +// Created by Rongjie Yi on 23-10-24. +// + +#include "MatmulElastic.hpp" +#include + + +ErrorCode mat_mul_elastic_fp32(Tensor *src0, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, bool transpose0, bool transpose1, int thread_count) { + const int M = transpose0 ? src0->dimension() : src0->sequence(); + const int K = transpose0 ? src0->sequence() : src0->dimension(); + const int N = transpose1 ? src1->sequence() : src1->dimension(); + Tensor *src0_cal = src0; + Tensor *src1_cal = src1; + const int64_t blck_0 = 16; + for (int b = 0; b < src0->batch(); b++) { + for (int h = 0; h < src0->head(); h++) { + const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : b; + const int h_1 = (src1->batch() == 1 && src1->head() == 1) ? 
0 : h; + for (int m = 0; m < M; m++) { + int use_N = N; + const int num_blocks = use_N / blck_0; + const int remainder = use_N % blck_0; +#pragma omp parallel for num_threads(thread_count) + for (int block = 0; block < num_blocks + 1; block++) { + for (int n = block * blck_0; n < (block + 1) * blck_0 & n < num_blocks * blck_0 + remainder; n++) { + int s_1, d_1; + int s_0, d_0; + if (!transpose0 && transpose1) { + s_1 = n; d_1 = 0; s_0 = m; d_0 = 0; + } else if (!transpose0 && !transpose1) { + s_1 = 0; d_1 = n; s_0 = m; d_0 = 0; + } else { + s_1 = 0; d_1 = n; s_0 = 0; d_0 = m; + } + if(dst->dtypeAt(b,h,m,n) == MLLM_TYPE_F32) { + vec_dot_fp32(K, dst->ptrAt(b, h, m, n), + src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, s_1, d_1), + src0_cal->hostPtr() + src0_cal->offset(b, h, s_0, d_0)); + if (support_bias) { + *dst->ptrAt(b, h, m, n) += bias->dataAt(0, 0, 0, n); + } + }else if (dst->dtypeAt(b,h,m,n) == MLLM_TYPE_F16) { + float tmp = 0; + vec_dot_fp32(K, &tmp, + src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, s_1, d_1), + src0_cal->hostPtr() + src0_cal->offset(b, h, s_0, d_0)); + if (support_bias) { + *dst->ptrAt(b, h, m, n) = MLLM_FP32_TO_FP16(tmp + bias->dataAt(0, 0, 0, n)); + } else { + *dst->ptrAt(b, h, m, n) = MLLM_FP32_TO_FP16(tmp); + } + }else{std::cout<<"Not support type [Matmul]"<dtype() == MLLM_TYPE_F16); + assert(src0_->dtype() == MLLM_TYPE_F32); + Tensor src0_qf16(src0_->shape()); + src0_qf16.setBackend(src0_->backend()); + src0_qf16.setDtype(MLLM_TYPE_F16); + src0_qf16.alloc(); + for (int b = 0; b < src0_->batch(); b++) { + for (int h = 0; h < src0_->head(); h++) { +#pragma omp parallel for num_threads(thread_count) + for (int s = 0; s < src0_->sequence(); s++) { + mllm_fp32_to_fp16_row(src0_->hostPtr() + src0_->offset(b, h, s, 0), + src0_qf16.hostPtr() + src0_qf16.offset(b, h, s, 0), + src0_->dimension()); + } + } + } + auto *src0 = &src0_qf16; + // for(int b=0; bdimension(); b++) { + // std::cout<ptrAt(0, 0, 0, b))<<" "; + // } + // std::cout<dimension(); b++) { + // std::cout<ptrAt(0, 0, 0, b))<<" "; + // } + // std::cout<dimension() : src0->sequence(); + const int K = transpose0 ? src0->sequence() : src0->dimension(); + const int N = transpose1 ? src1->sequence() : src1->dimension(); + Tensor *src0_cal = src0; + Tensor *src1_cal = src1; + const int64_t blck_0 = 16; + for (int b = 0; b < src0->batch(); b++) { + for (int h = 0; h < src0->head(); h++) { + const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : b; + const int h_1 = (src1->batch() == 1 && src1->head() == 1) ? 
0 : h; + for (int m = 0; m < M; m++) { + int use_N = N; + const int num_blocks = use_N / blck_0; + const int remainder = use_N % blck_0; +#pragma omp parallel for num_threads(thread_count) + for (int block = 0; block < num_blocks + 1; block++) { + for (int n = block * blck_0; n < (block + 1) * blck_0 & n < num_blocks * blck_0 + remainder; n++) { + int s_1, d_1; + int s_0, d_0; + if (!transpose0 && transpose1) { + s_1 = n; d_1 = 0; s_0 = m; d_0 = 0; + } else if (!transpose0 && !transpose1) { + s_1 = 0; d_1 = n; s_0 = m; d_0 = 0; + } else { + s_1 = 0; d_1 = n; s_0 = 0; d_0 = m; + } + vec_dot_fp16(K, dst->ptrAt(b, h, m, n), + src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, s_1, d_1), + src0_cal->hostPtr() + src0_cal->offset(b, h, s_0, d_0)); + if (support_bias) { + *dst->ptrAt(b, h, m, n) += bias->dataAt(0, 0, 0, n); + } + } + } + } + } + } + return MLLM_NO_ERROR; +} + +ErrorCode mat_mul_elastic_fp32_q4_0(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, int thread_count) { + assert(src1->dtype() == MLLM_TYPE_Q4_0); + assert(src0_->dtype() == MLLM_TYPE_F32); + Tensor src0_q8(src0_->shape()); + src0_q8.setBackend(src0_->backend()); + src0_q8.setDtype(MLLM_TYPE_Q8_0); + src0_q8.alloc(); + if (src0_->dimension() % QK8_0 == 0) { + for (int b = 0; b < src0_->batch(); b++) { + for (int h = 0; h < src0_->head(); h++) { +#pragma omp parallel for num_threads(thread_count) + for (int s = 0; s < src0_->sequence(); s++) { + quantize_row_q8_0(src0_->hostPtr() + src0_->offset(b, h, s, 0), + src0_q8.hostPtr() + src0_q8.offset(b, h, s, 0) / QK8_0, + src0_->dimension()); + } + } + } + } else { + std::cout << "[ERROR]: " << src0_->dimension() << "%" << QK8_0 << "!=0" << std::endl; + assert(src0_->dimension() % QK8_0 == 0); + } + auto *src0 = &src0_q8; + assert(src0->dtype() == MLLM_TYPE_Q8_0); + int M = src0->sequence(); + int K = src0->dimension(); + int N = src1->sequence(); + Tensor *src0_cal = src0; + Tensor *src1_cal = src1; + const int64_t blck_0 = 16; + for (int b = 0; b < src0->batch(); b++) { + for (int h = 0; h < src0->head(); h++) { + const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : b; + const int h_1 = (src1->batch() == 1 && src1->head() == 1) ? 
0 : h; + for (int m = 0; m < M; m++) { + int use_N = N; + const int num_blocks = use_N / blck_0; + const int remainder = use_N % blck_0; +#pragma omp parallel for num_threads(thread_count) + for (int block = 0; block < num_blocks + 1; block++) { + for (int n = block * blck_0; n < (block + 1) * blck_0 & n < num_blocks * blck_0 + remainder; n++) { + vec_dot_q4_0_q8_0(K, dst->ptrAt(b, h, m, n), + src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, n, 0) / QK4_0, + src0_cal->hostPtr() + src0_cal->offset(b, h, m, 0) / QK8_0); + if (support_bias) { + *dst->ptrAt(b, h, m, n) += bias->dataAt(0, 0, 0, n); + } + } + } + } + } + } + return MLLM_NO_ERROR; +} + +ErrorCode mat_mul_elastic_fp32_q4_K(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, int thread_count) { + assert(src1->dtype() == MLLM_TYPE_Q4_K); + assert(src0_->dtype() == MLLM_TYPE_F32); + Tensor src0_q8(src0_->shape()); + src0_q8.setBackend(src0_->backend()); + src0_q8.setDtype(MLLM_TYPE_Q8_K); + src0_q8.alloc(); + if (src0_->dimension() % QK_K == 0) { + for (int b = 0; b < src0_->batch(); b++) { + for (int h = 0; h < src0_->head(); h++) { +#pragma omp parallel for num_threads(thread_count) + for (int s = 0; s < src0_->sequence(); s++) { + quantize_row_q8_K(src0_->hostPtr() + src0_->offset(b, h, s, 0), + src0_q8.hostPtr() + src0_q8.offset(b, h, s, 0) / QK_K, + src0_->dimension()); + } + } + } + } else { + std::cout << "[ERROR]: " << src0_->dimension() << "%" << QK_K << "!=0" << std::endl; + assert(src0_->dimension() % QK_K == 0); + } + auto *src0 = &src0_q8; + assert(src0->dtype() == MLLM_TYPE_Q8_K); + int M = src0->sequence(); + int K = src0->dimension(); + int N = src1->sequence(); + Tensor *src0_cal = src0; + Tensor *src1_cal = src1; + const int64_t blck_0 = 16; + + for (int b = 0; b < src0->batch(); b++) { + for (int h = 0; h < src0->head(); h++) { + const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : b; + const int h_1 = (src1->batch() == 1 && src1->head() == 1) ? 
0 : h; + for (int m = 0; m < M; m++) { + int use_N = N; + const int num_blocks = use_N / blck_0; + const int remainder = use_N % blck_0; +#pragma omp parallel for num_threads(thread_count) + for (int block = 0; block < num_blocks + 1; block++) { + for (int n = block * blck_0; n < (block + 1) * blck_0 & n < num_blocks * blck_0 + remainder; n++) { + if(dst->dtypeAt(b,h,m,n) == MLLM_TYPE_F32) { + vec_dot_q4_K_q8_K(K, dst->ptrAt(b, h, m, n), + src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, n, 0) / QK_K, + src0_cal->hostPtr() + src0_cal->offset(b, h, m, 0) / QK_K); + if (support_bias) { + *dst->ptrAt(b, h, m, n) += bias->dataAt(0, 0, 0, n); + } + } else if (dst->dtypeAt(b,h,m,n) == MLLM_TYPE_F16) { + float tmp = 0; + vec_dot_q4_K_q8_K(K, &tmp, + src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, n, 0) / QK_K, + src0_cal->hostPtr() + src0_cal->offset(b, h, m, 0) / QK_K); + if (support_bias) { + *dst->ptrAt(b, h, m, n) = MLLM_FP32_TO_FP16(tmp + bias->dataAt(0, 0, 0, n)); + } else { + *dst->ptrAt(b, h, m, n) = MLLM_FP32_TO_FP16(tmp); + } + }else{std::cout<<"Not support type [Matmul]"<dtype() == MLLM_TYPE_Q6_K); + assert(src0_->dtype() == MLLM_TYPE_F32); + Tensor src0_q8(src0_->shape()); + src0_q8.setBackend(src0_->backend()); + src0_q8.setDtype(MLLM_TYPE_Q8_K); + src0_q8.alloc(); + if (src0_->dimension() % QK_K == 0) { + for (int b = 0; b < src0_->batch(); b++) { + for (int h = 0; h < src0_->head(); h++) { +#pragma omp parallel for num_threads(thread_count) + for (int s = 0; s < src0_->sequence(); s++) { + quantize_row_q8_K(src0_->hostPtr() + src0_->offset(b, h, s, 0), + src0_q8.hostPtr() + src0_q8.offset(b, h, s, 0) / QK_K, + src0_->dimension()); + } + } + } + } else { + std::cout << "[ERROR]: " << src0_->dimension() << "%" << QK_K << "!=0" << std::endl; + assert(src0_->dimension() % QK_K == 0); + } + auto *src0 = &src0_q8; + assert(src0->dtype() == MLLM_TYPE_Q8_K); + int M = src0->sequence(); + int K = src0->dimension(); + int N = src1->sequence(); + Tensor *src0_cal = src0; + Tensor *src1_cal = src1; + const int64_t blck_0 = 16; + for (int b = 0; b < src0->batch(); b++) { + for (int h = 0; h < src0->head(); h++) { + const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : b; + const int h_1 = (src1->batch() == 1 && src1->head() == 1) ? 
0 : h; + for (int m = 0; m < M; m++) { + int use_N = N; + const int num_blocks = use_N / blck_0; + const int remainder = use_N % blck_0; +#pragma omp parallel for num_threads(thread_count) + for (int block = 0; block < num_blocks + 1; block++) { + for (int n = block * blck_0; n < (block + 1) * blck_0 & n < num_blocks * blck_0 + remainder; n++) { + if (dst->dtypeAt(n, h, m, n) == MLLM_TYPE_F32) { + vec_dot_q6_K_q8_K(K, dst->ptrAt(b, h, m, n), + src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, n, 0) / QK_K, + src0_cal->hostPtr() + src0_cal->offset(b, h, m, 0) / QK_K); + if (support_bias) { + *dst->ptrAt(b, h, m, n) += bias->dataAt(0, 0, 0, n); + } + } else if (dst->dtypeAt(n, h, m, n) == MLLM_TYPE_F16) { + float tmp = 0; + vec_dot_q6_K_q8_K(K, &tmp, + src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, n, 0) / QK_K, + src0_cal->hostPtr() + src0_cal->offset(b, h, m, 0) / QK_K); + + if (support_bias) { + *dst->ptrAt(b, h, m, n) = MLLM_FP32_TO_FP16(tmp + bias->dataAt(0, 0, 0, n)); + } else { + *dst->ptrAt(b, h, m, n) = MLLM_FP32_TO_FP16(tmp); + } + } else { + std::cout << "Not support tupe [Matmul]" << std::endl; + } + } + } + } + } + } + return MLLM_NO_ERROR; +} + diff --git a/src/backends/cpu/compute/MatmulElastic.hpp b/src/backends/cpu/compute/MatmulElastic.hpp new file mode 100644 index 00000000..828fe16e --- /dev/null +++ b/src/backends/cpu/compute/MatmulElastic.hpp @@ -0,0 +1,18 @@ +// +// Created by Rongjie Yi on 23-10-24. +// + +#ifndef MLLM_MATMUL_HPP +#define MLLM_MATMUL_HPP + + +#include "VecDot.hpp" +using namespace mllm; + +ErrorCode mat_mul_elastic_fp32(Tensor *src0, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, bool transpose0 = false, bool transpose1 = false, int thread_count=4); +ErrorCode mat_mul_elastic_fp32_fp16(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, bool transpose0 = false, bool transpose1 = false, int thread_count=4); +ErrorCode mat_mul_elastic_fp32_q4_0(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int thread_count=4); +ErrorCode mat_mul_elastic_fp32_q4_K(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int thread_count=4); +ErrorCode mat_mul_elastic_fp32_q6_K(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int thread_count=4); + +#endif // MLLM_MATMUL_HPP diff --git a/src/backends/cpu/compute/VecDot.cpp b/src/backends/cpu/compute/VecDot.cpp index d4210d68..b5270f7b 100644 --- a/src/backends/cpu/compute/VecDot.cpp +++ b/src/backends/cpu/compute/VecDot.cpp @@ -97,6 +97,7 @@ void vec_dot_fp32(const int n, float *__restrict s, const float *__restrict vx, vec_dot_fp32_arm(n, s, vx, vy); #endif } +/* void vec_dot_fp32(const float * __restrict src0, const float * __restrict src1, Tensor *dst, bool support_bias, Tensor *bias, int hid_len, int batch, int head, int src0_inf, int sec1_outf) { float value = 0; #ifdef __AVX2__ @@ -113,7 +114,7 @@ void vec_dot_fp32(const float * __restrict src0, const float * __restrict src1, } dst->setDataAt({batch, head, src0_inf, sec1_outf}, value); } - +*/ void vec_dot_fp16(const int n, float * __restrict s, const mllm_fp16_t * __restrict vx, const mllm_fp16_t * __restrict vy) { float sumf = 0.0; @@ -1344,3 +1345,71 @@ void vec_dot_q6_K_q8_K(const void * __restrict src0, const void * __restrict src dst->setDataAt({batch, head, src0_inf, sec1_outf}, value); } + + +void vec_dot_q8_0_q8_0(int n, float * __restrict s, const void * __restrict vx, const void * __restrict vy) { 
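+    // Dot product of two rows stored in q8_0 format: each block holds QK8_0
+    // int8 values plus an fp16 scale d, and the result accumulates
+    // (sum_j x.qs[j] * y.qs[j]) * d_x * d_y over all blocks.
+    // n must be a multiple of QK8_0; the NEON path below additionally
+    // assumes an even number of blocks (see the assert on nb).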
+ const int qk = QK8_0; + const int nb = n / qk; // number of blocks + + assert(n % qk == 0); + + const auto * __restrict x = static_cast(vx); + const auto * __restrict y = static_cast(vy); + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q8_0 * x0 = &x[i + 0]; + const block_q8_0 * x1 = &x[i + 1]; + const block_q8_0 * y0 = &y[i + 0]; + const block_q8_0 * y1 = &y[i + 1]; + + const int8x16_t x0_0 = vld1q_s8(x0->qs); + const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); + const int8x16_t x1_0 = vld1q_s8(x1->qs); + const int8x16_t x1_1 = vld1q_s8(x1->qs + 16); + + // load y + const int8x16_t y0_0 = vld1q_s8(y0->qs); + const int8x16_t y0_1 = vld1q_s8(y0->qs + 16); + const int8x16_t y1_0 = vld1q_s8(y1->qs); + const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + mllm_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), + mllm_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + mllm_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), + mllm_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__AVX2__) || defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + // Compute combined scale for the block + const __m256 d = _mm256_set1_ps(MLLM_FP16_TO_FP32(x[i].d) * MLLM_FP16_TO_FP32(y[i].d)); + __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs); + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + // Multiply q with scale and accumulate +#if defined(__AVX2__) + acc = _mm256_fmadd_ps( d, q, acc ); +#else + acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc ); +#endif + } + + *s = hsum_float_8(acc); +#endif +} diff --git a/src/backends/cpu/compute/VecDot.hpp b/src/backends/cpu/compute/VecDot.hpp index a8f7706c..1dad4968 100644 --- a/src/backends/cpu/compute/VecDot.hpp +++ b/src/backends/cpu/compute/VecDot.hpp @@ -335,11 +335,28 @@ static inline float hsum_float_8(const __m256 x) { } \ res = vaddvq_f32(x[0]); \ } + +#if !defined(__ARM_FEATURE_DOTPROD) + +inline static int32x4_t mllm_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) { + const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b)); + const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b)); + + return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))); +} + +#else + +#define mllm_vdotq_s32(a, b, c) vdotq_s32(a, b, c) + +#endif // !defined(__ARM_FEATURE_DOTPROD) + + #endif using namespace mllm; -void vec_dot_fp32(const float * __restrict src0, const float * __restrict src1, Tensor *dst, bool support_bias, Tensor *bias, int hid_len, int batch, int head, int src0_inf, int sec1_outf); +// void vec_dot_fp32(const float * __restrict src0, const float * __restrict src1, Tensor *dst, bool support_bias, Tensor *bias, int hid_len, int batch, int head, int src0_inf, int sec1_outf); void vec_dot_q4_0_q8_0(const void * __restrict src0, const void * __restrict src1, Tensor *dst, bool support_bias, Tensor *bias, int hid_len, int batch, int head, int src0_inf, int sec1_outf); void vec_dot_q4_K_q8_K(const void * __restrict src0, const void * __restrict src1, Tensor *dst, bool support_bias, Tensor *bias, int 
hid_len, int batch, int head, int src0_inf, int sec1_outf); void vec_dot_q6_K_q8_K(const void * __restrict src0, const void * __restrict src1, Tensor *dst, bool support_bias, Tensor *bias, int hid_len, int batch, int head, int src0_inf, int sec1_outf); @@ -350,5 +367,6 @@ void vec_dot_q6_K_q8_K(const int n, float * __restrict s, const void * __restric void vec_dot_q4_0_q8_0(const int n, float * __restrict s, const void * __restrict vx, const void * __restrict vy); void vec_dot_fp32(const int n, float * __restrict s, const float * __restrict vx, const float * __restrict vy); void vec_dot_fp16(const int n, float * __restrict s, const mllm_fp16_t * __restrict vx, const mllm_fp16_t * __restrict vy); +void vec_dot_q8_0_q8_0(int n, float * __restrict s, const void * __restrict vx, const void * __restrict vy); #endif // MLLM_VECDOT_HPP diff --git a/src/backends/cpu/type/type.cpp b/src/backends/cpu/compute/VecDotType.cpp similarity index 64% rename from src/backends/cpu/type/type.cpp rename to src/backends/cpu/compute/VecDotType.cpp index 77e30691..843c1a0f 100644 --- a/src/backends/cpu/type/type.cpp +++ b/src/backends/cpu/compute/VecDotType.cpp @@ -27,185 +27,13 @@ #include #include -#include "type.hpp" +#include "VecDotType.hpp" #include "Types.hpp" #include "quantize/Quantize.hpp" #include "quantize/QuantizeQ6.hpp" #include "compute/VecDot.hpp" -#ifdef __AVX2__ -static void vec_dot_fp32_avx2(const int n, float *__restrict s, const float *__restrict x, const float *__restrict y) { - float sumf = 0.0F; - const int np = (n & ~(MLLM_F32_STEP - 1)); - - MLLM_F32_VEC sum[MLLM_F32_ARR] = {MLLM_F32_VEC_ZERO}; - - MLLM_F32_VEC ax[MLLM_F32_ARR]; - MLLM_F32_VEC ay[MLLM_F32_ARR]; - - for (int i = 0; i < np; i += MLLM_F32_STEP) { - for (int j = 0; j < MLLM_F32_ARR; j++) { - ax[j] = MLLM_F32_VEC_LOAD(x + i + j * MLLM_F32_EPR); - ay[j] = MLLM_F32_VEC_LOAD(y + i + j * MLLM_F32_EPR); - - sum[j] = MLLM_F32_VEC_FMA(sum[j], ax[j], ay[j]); - } - } - - // reduce sum0..sum3 to sum0 - MLLM_F32_VEC_REDUCE(sumf, sum); - - // leftovers - for (int i = np; i < n; ++i) { - sumf += x[i] * y[i]; - } - - *s = sumf; -} -#endif - -#ifdef __ARM_NEON -static void vec_dot_fp32_arm(const int n, float *__restrict s, const float *__restrict x, const float *__restrict y) { - float sumf = 0.0F; - const int np = (n & ~(16 - 1)); - - F32_VEC sum[4] = {vdupq_n_f32(0.0F)}; - - F32_VEC ax[F32_ARR]; - F32_VEC ay[F32_ARR]; - - for (int i = 0; i < np; i += F32_STEP) { - for (int j = 0; j < F32_ARR; j++) { - ax[j] = vld1q_f32(x + i + j * F32_REG); - ay[j] = vld1q_f32(y + i + j * F32_REG); - sum[j] = vfmaq_f32(sum[j], ax[j], ay[j]); - // sum[j] = vmlaq_lane_f32(sum[j], ax[j], ay[0], - } - - } - - // reduce sum0..sum3 to sum0 - F32_VEC_REDUCE(sumf, sum); - - // leftovers - for (int i = np; i < n; ++i) { - sumf += x[i] * y[i]; - } - - *s = sumf; -} -#endif - -static void vec_dot_fp32_local(const int n, float *__restrict s, const float *__restrict vx, const float *__restrict vy) { -#ifdef __AVX2__ - vec_dot_fp32_avx2(n, s, vx, vy); -#elif defined(__ARM_NEON) - vec_dot_fp32_arm(n, s, vx, vy); -#endif -} -/* -void vec_dot_q8_0_q8_0(int n, float * __restrict s, const void * __restrict vx, const void * __restrict vy) { - const int qk = QK8_0; - const int nb = n / qk; // number of blocks - - assert(n % qk == 0); - - const auto * __restrict x = static_cast(vx); - const auto * __restrict y = static_cast(vy); -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - assert(nb % 2 == 0); // TODO: handle 
odd nb - - for (int i = 0; i < nb; i += 2) { - const block_q8_0 * x0 = &x[i + 0]; - const block_q8_0 * x1 = &x[i + 1]; - const block_q8_0 * y0 = &y[i + 0]; - const block_q8_0 * y1 = &y[i + 1]; - - const int8x16_t x0_0 = vld1q_s8(x0->qs); - const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); - const int8x16_t x1_0 = vld1q_s8(x1->qs); - const int8x16_t x1_1 = vld1q_s8(x1->qs + 16); - - // load y - const int8x16_t y0_0 = vld1q_s8(y0->qs); - const int8x16_t y0_1 = vld1q_s8(y0->qs + 16); - const int8x16_t y1_0 = vld1q_s8(y1->qs); - const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), - ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), - ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } - - *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); -#elif defined(__AVX2__) || defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (int i = 0; i < nb; ++i) { - // Compute combined scale for the block - const __m256 d = _mm256_set1_ps(MLLM_FP16_TO_FP32(x[i].d) * MLLM_FP16_TO_FP32(y[i].d)); - __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs); - __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_i8_pairs_float(bx, by); - - // Multiply q with scale and accumulate -#if defined(__AVX2__) - acc = _mm256_fmadd_ps( d, q, acc ); -#else - acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc ); -#endif - } - - *s = hsum_float_8(acc); -#elif defined(__riscv_v_intrinsic) - float sumf = 0.0; - size_t vl = __riscv_vsetvl_e8m1(qk); - - for (int i = 0; i < nb; i++) { - // load elements - vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl); - vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl); - - vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl); - - vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl); - vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl); - - int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); - - sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); - } - - *s = sumf; -#else - // scalar - float sumf = 0.0; - - for (int i = 0; i < nb; i++) { - int sumi = 0; - - for (int j = 0; j < qk; j++) { - sumi += x[i].qs[j]*y[i].qs[j]; - } - - sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); - } - - *s = sumf; -#endif -} -*/ void fp32_add_row_to(int n, const float * MLLM_RESTRICT src, float * MLLM_RESTRICT dst, float alpha){ int i = 0; #ifdef __AVX2__ @@ -396,7 +224,7 @@ type_traits_t type_traits[] = { .blck_size = 1, .to_float = nullptr, .from_float = nullptr, - .vec_dot = (mllm_vec_dot_func)vec_dot_fp32_local, + .vec_dot = (mllm_vec_dot_func)vec_dot_fp32, .vec_dot_type = MLLM_TYPE_F32, .add_row_to = (mllm_vec_add_row_func)fp32_add_row_to, }, @@ -425,15 +253,15 @@ type_traits_t type_traits[] = { {}, {}, {}, - // /*[MLLM_TYPE_Q8_0] = */{ - // .size = sizeof(block_q8_0), - // .blck_size = QK8_0, - // .to_float = (mllm_to_float_func) dequantize_row_q8_0, - // .from_float = (mllm_from_float_func) quantize_row_q8_0, - // .vec_dot = (mllm_vec_dot_func) vec_dot_q8_0_q8_0, - // .vec_dot_type = MLLM_TYPE_Q8_0, - // .add_row_to = (mllm_vec_add_row_func)q8_0_add_row_to, - // }, + /*[MLLM_TYPE_Q8_0] = */{ + .size = sizeof(block_q8_0), + .blck_size = QK8_0, + .to_float = (mllm_to_float_func) dequantize_row_q8_0, + 
.from_float = (mllm_from_float_func) quantize_row_q8_0, + .vec_dot = (mllm_vec_dot_func) vec_dot_q8_0_q8_0, + .vec_dot_type = MLLM_TYPE_Q8_0, + .add_row_to = (mllm_vec_add_row_func)q8_0_add_row_to, + }, /*[MLLM_TYPE_Q8_1] = */{}, {}, {}, diff --git a/src/backends/cpu/type/type.hpp b/src/backends/cpu/compute/VecDotType.hpp similarity index 100% rename from src/backends/cpu/type/type.hpp rename to src/backends/cpu/compute/VecDotType.hpp From 689d30e0f2a5104f64b2980fd5e63c407f6cc17f Mon Sep 17 00:00:00 2001 From: yirongjie Date: Tue, 16 Jul 2024 17:15:14 +0800 Subject: [PATCH 2/7] fix: Typo --- src/backends/cpu/compute/VecDot.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backends/cpu/compute/VecDot.cpp b/src/backends/cpu/compute/VecDot.cpp index b5270f7b..2fcca52f 100644 --- a/src/backends/cpu/compute/VecDot.cpp +++ b/src/backends/cpu/compute/VecDot.cpp @@ -1381,11 +1381,11 @@ void vec_dot_q8_0_q8_0(int n, float * __restrict s, const void * __restrict vx, sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( mllm_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), - mllm_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + mllm_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), MLLM_FP16_TO_FP32(x0->d)*MLLM_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( mllm_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), - mllm_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + mllm_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), MLLM_FP16_TO_FP32(x1->d)*MLLM_FP16_TO_FP32(y1->d)); } *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); From a3eb979f7deb604f8ada8880d3aa5a32c54d9023 Mon Sep 17 00:00:00 2001 From: yirongjie Date: Wed, 17 Jul 2024 02:25:59 +0000 Subject: [PATCH 3/7] feat: add demo eladtic_llama --- CMakeLists.txt | 14 +++ examples/demo_elastic_llama.cpp | 58 ++++++++++ include/OpDefined.hpp | 4 + src/Layer.hpp | 77 ++++++++++++++ src/backends/cpu/CPUBackend.cpp | 2 + src/backends/cpu/CPUElasticLinear.cpp | 111 ++++++++++++++++++++ src/backends/cpu/CPUElasticLinear.hpp | 47 +++++++++ src/backends/cpu/compute/MatmulElastic.cpp | 58 +++++----- src/backends/cpu/compute/MatmulElastic.hpp | 10 +- src/models/llama/modeling_elastic_llama.hpp | 94 +++++++++++++++++ 10 files changed, 443 insertions(+), 32 deletions(-) create mode 100644 examples/demo_elastic_llama.cpp create mode 100644 src/backends/cpu/CPUElasticLinear.cpp create mode 100644 src/backends/cpu/CPUElasticLinear.hpp create mode 100644 src/models/llama/modeling_elastic_llama.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 22d2b686..739bf75d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -424,6 +424,20 @@ else () target_link_libraries(demo_sparse_llama MLLM_CPU) endif () +add_executable(demo_elastic_llama ${PROJECT_SOURCE_DIR}/examples/demo_elastic_llama.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC} # ${DIR_SRC_QUANT} + src/tokenizers/Tokenizer.cpp + src/tokenizers/Tokenizer.hpp + src/tokenizers/BPE/Bpe.cpp + src/tokenizers/BPE/Bpe.hpp +) +# target_compile_definitions(demo_elastic_llama PRIVATE MLLM_QKK_64) +if (ARM AND NOT APK) + target_compile_options(demo_elastic_llama PRIVATE -fopenmp) + target_link_libraries(demo_elastic_llama PUBLIC MLLM_CPU -fopenmp -static-openmp) +else () + target_link_libraries(demo_elastic_llama MLLM_CPU) +endif () + add_executable(demo_llava ${PROJECT_SOURCE_DIR}/examples/demo_llava.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC} src/tokenizers/Tokenizer.cpp 
src/tokenizers/BPE/Bpe.cpp diff --git a/examples/demo_elastic_llama.cpp b/examples/demo_elastic_llama.cpp new file mode 100644 index 00000000..94e90d07 --- /dev/null +++ b/examples/demo_elastic_llama.cpp @@ -0,0 +1,58 @@ +// +// Created by Rongjie Yi on 2024/1/26 0026. +// + +#include +#include "cmdline.h" +#include "models/llama/modeling_elastic_llama.hpp" +#include "models/llama/tokenization_llama.hpp" +#include "processor/PostProcess.hpp" + + +using namespace mllm; + +int main(int argc, char **argv) { + cmdline::parser cmdParser; + cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama_vocab.mllm"); + cmdParser.add("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_k.mllm"); + cmdParser.add("limits", 'l', "max KV cache size", false, 400); + cmdParser.add("thread", 't', "num of threads", false, 4); + cmdParser.parse_check(argc, argv); + + string vocab_path = cmdParser.get("vocab"); + string model_path = cmdParser.get("model"); + int tokens_limit = cmdParser.get("limits"); + CPUBackend::cpu_threads = cmdParser.get("thread"); + + auto tokenizer = LLaMATokenizer(vocab_path); + + LLaMAConfig config(tokens_limit, "7B", LLAMAROPE); + auto model = ElasticLLaMAModel(config); + model.load(model_path); + + vector in_strs = { + " Hello, who are you?", + " What can you do?", + "Please introduce Beijing University of Posts and Telecommunications."}; + + for (int i = 0; i < in_strs.size(); ++i) { + auto in_str = in_strs[i]; + auto input_tensor = tokenizer.tokenize(in_str, i); + std::cout << "[Q] " << in_str << std::endl; + std::cout << "[A] " << std::flush; + for (int step = 0; step < 1; step++) { + auto result = model({input_tensor}); + auto outputs = tokenizer.detokenize(result[0]); + auto out_string = outputs.first; + auto out_token = outputs.second; + if (out_token == 2) { + break; + } + std::cout << out_string << std::flush; + chatPostProcessing(out_token, input_tensor, {}); + } + printf("\n"); + } + + return 0; +} \ No newline at end of file diff --git a/include/OpDefined.hpp b/include/OpDefined.hpp index 0e36cf2c..a1ebdb4c 100644 --- a/include/OpDefined.hpp +++ b/include/OpDefined.hpp @@ -48,6 +48,7 @@ enum OpType { PREDICTOR, SPARSELINEAR, SPARSEIDLINEAR, + ELASTICLINEAR, OP_NUM }; @@ -89,6 +90,9 @@ static const vector OpNames = { "Range", "Where", "Replace", + "SparseLinear", + "SparseIdLinear", + "ElasticLinear", "OP_NUM"}; enum TensorFuncType { diff --git a/src/Layer.hpp b/src/Layer.hpp index 959f6b8e..956f74e5 100644 --- a/src/Layer.hpp +++ b/src/Layer.hpp @@ -5,6 +5,8 @@ #ifndef OPERATION_H #define OPERATION_H +#include +#include #include #include "Tensor.hpp" @@ -49,6 +51,13 @@ class Layer { return _3I1O_OP(input0, input1, input2); } + Tensor &operator()(Tensor &input0, int activate_input_dim, int activate_output_dim) { + auto activate_input_dim_tensor = Tensor(1, 1, 1, 1, backend_, true); + activate_input_dim_tensor.setDataAt(0,0,0,0,(float)activate_input_dim); + auto activate_output_dim_tensor = Tensor(1, 1, 1, 1, backend_, true); + activate_output_dim_tensor.setDataAt(0,0,0,0,(float)activate_output_dim); + return _3I1O_only1map_OP(input0, activate_input_dim_tensor, activate_output_dim_tensor); + } private: std::string name_num_to_X(const std::string &input_string) { @@ -344,6 +353,62 @@ class Layer { return Tensor::gph_[next_name]; } } + Tensor &_3I1O_only1map_OP(Tensor &input0, Tensor &input1, Tensor &input2) { + Module::runlistIdx = saved_list_idx; + if (INIT_OP()) { + return input0; + } else { + string layer_next_name = 
"out-" + op_->name(); + if (Tensor::gph_.find(input0.name()) != Tensor::gph_.end()) { + Tensor::gph_[input0.name()].status() = input0.status(); + } + switch (input0.status()) { + case TENSOR_STATIC_INIT: { + if (Tensor::gph_.find(input0.name()) == Tensor::gph_.end() || input0.count() != Tensor::gph_[input0.name()].count()) { + Tensor::gph_[input0.name()] = input0; + Tensor::gph_[input0.name()].setName(input0.name()); + } + if (layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) { + layername_2_tensorname[layer_next_name] = name_num_to_X(layer_next_name); + } + auto next_name = layername_2_tensorname[layer_next_name]; + if (Tensor::gph_.find(next_name) == Tensor::gph_.end()) { + Tensor::gph_[next_name] = Tensor(backend_); + Tensor::gph_[next_name].setName(next_name); + } + vector> shared_inputs{ + std::shared_ptr(&Tensor::gph_[input0.name()], [](Tensor *) {}), + std::shared_ptr(&input1, [](Tensor *) {}), + std::shared_ptr(&input2, [](Tensor *) {})}; + vector> shared_outputs{std::shared_ptr(&Tensor::gph_[next_name], [](Tensor *) {})}; + op_->reshape(shared_inputs, shared_outputs); + op_->setUp(shared_inputs, shared_outputs); + assert(Tensor::gph_[next_name].hostPtr() != nullptr); + break; + } + case TENSOR_STATIC_READY: { + auto next_name = layername_2_tensorname[layer_next_name]; + vector> shared_inputs{ + std::shared_ptr(&Tensor::gph_[input0.name()], [](Tensor *) {}), + std::shared_ptr(&input1, [](Tensor *) {}), + std::shared_ptr(&input2, [](Tensor *) {})}; + vector> shared_outputs{std::shared_ptr(&Tensor::gph_[next_name], [](Tensor *) {})}; + op_->execute(shared_inputs, shared_outputs); + assert(Tensor::gph_[next_name].hostPtr() != nullptr); + break; + } + default: { + break; + } + } + auto next_name = layername_2_tensorname[layer_next_name]; + Tensor::gph_[next_name].status() = Tensor::gph_[input0.name()].status(); + if(saveNDataFlag){ + Tensor::gph_[next_name].saveNData(layer_next_name); + } + return Tensor::gph_[next_name]; + } + } Tensor &_0I1O_OP() { Module::runlistIdx = saved_list_idx; if (INIT_OP()) { @@ -525,6 +590,18 @@ class Predictor final : public Layer { // no need to defined a new operator() function, just use the default one }; +class ElasticLinear final : public Layer { +public: + explicit ElasticLinear(int in_features, int out_features, bool bias, std::string name) { + param_["in_features"] = in_features; + param_["out_features"] = out_features; + param_["bias"] = (float)bias; + init(std::move(name), OpType::ELASTICLINEAR); + } + // Use: Tensor &operator()(Tensor &input0, int activate_input_dim, int activate_output_dim) { +}; + + class SiLU final : public Layer { public: SiLU() = default; diff --git a/src/backends/cpu/CPUBackend.cpp b/src/backends/cpu/CPUBackend.cpp index 6f6f8488..fdad5ad6 100644 --- a/src/backends/cpu/CPUBackend.cpp +++ b/src/backends/cpu/CPUBackend.cpp @@ -40,6 +40,7 @@ #include "CPUPredictor.hpp" #include "CPUSparseIdLinear.hpp" #include "CPUSparseLinear.hpp" +#include "CPUElasticLinear.hpp" #include "CPUTensorFunction.hpp" namespace mllm { @@ -99,6 +100,7 @@ void CPUBackend::registerOps() { addCreator(PREDICTOR, (CPUBackend::Creator *)(new CPUPredictorCreator())); addCreator(SPARSELINEAR, (CPUBackend::Creator *)(new CPUSparseLinearCreator())); addCreator(SPARSEIDLINEAR, (CPUBackend::Creator *)(new CPUSparseIdLinearCreator())); + addCreator(ELASTICLINEAR, (CPUBackend::Creator *)(new CPUElasticLinearCreator())); } TensorFunction *CPUBackend::funcCreate(const TensorFuncType type) { diff --git 
a/src/backends/cpu/CPUElasticLinear.cpp b/src/backends/cpu/CPUElasticLinear.cpp new file mode 100644 index 00000000..85a8c603 --- /dev/null +++ b/src/backends/cpu/CPUElasticLinear.cpp @@ -0,0 +1,111 @@ + +#include "CPUElasticLinear.hpp" +#include "compute/MatmulElastic.hpp" + +namespace mllm { + +CPUElasticLinear::CPUElasticLinear(Backend *bn, string opName, int in_features, int out_features, bool bias, int threadCount) : thread_count(threadCount), + Op(bn, opName) { + in_features_ = in_features; + out_features_ = out_features; + support_bias_ = bias; + thread_count = threadCount; + weight_.setBackend(bn); + bias_.setBackend(bn); +} + +ErrorCode CPUElasticLinear::reshape(vector> inputs, vector> outputs) { + //std::cout << name() << " CPUElasticLinear reshape" << std::endl; + assert(inputs.size() == 3); + assert(outputs.size() == 1); + if(inputs[0]->count() == 0) { + outputs[0]->reshape(0,0,0,0); + return Op::reshape(inputs, outputs); + } + // N | C | H | W + // ----------------------------------------------- + // 1 |out_channel | in_channel | 1 + // |out_features| in_features | + // ----------------------------------------------- + // batch |in_channel | seq_len | 1 + // |in_features | inputs[0]->sequence() | + // ----------------------------------------------- + // batch |out_channel | seq_len | 1 + // |out_features| inputs[0]->sequence() | + assert(inputs[0]->head() == 1); + assert(in_features_ == inputs[0]->dimension()); + outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence(), out_features_); + //outputs[0]->setDtype(activationDtype()); + return Op::reshape(inputs, outputs); +} + +ErrorCode CPUElasticLinear::load(AbstructLoader &loader) { + //std::cout << name() << " CPUElasticLinear load" << std::endl; + weight_.setName(name() + ".weight"); + weight_.reshape(1, 1, out_features_, in_features_); + if (loader.getDataType(weight_.name()) != MLLM_TYPE_COUNT) { + weight_.setDtype(loader.getDataType(weight_.name())); + weight_.alloc(); + loader.load(&weight_); + } else { + weight_.setDtype(MLLM_TYPE_F32); + weight_.alloc(); + } + if (support_bias_) { + bias_.setName(name() + ".bias"); + bias_.reshape(1, 1, 1, out_features_); + if (loader.getDataType(bias_.name()) != MLLM_TYPE_COUNT) { + bias_.setDtype(loader.getDataType(bias_.name())); + bias_.alloc(); + loader.load(&bias_); + } else { + bias_.setDtype(MLLM_TYPE_F32); + bias_.alloc(); + } + } + return Op::load(loader); +} + +ErrorCode CPUElasticLinear::execute(vector> inputs, vector> outputs) { + int activate_input_dim = (int)inputs[1]->dataAt(0,0,0,0); + int activate_output_dim = (int)inputs[2]->dataAt(0,0,0,0); + +// auto start = mllm::mllm_time_us(); + if(inputs[0]->count() == 0) { + return Op::execute(inputs, outputs); + } + // std::cout << name() << " CPUElasticLinear()" << std::endl; + switch (weight_.dtype()) { + case MLLM_TYPE_F32: { + mat_mul_elastic_fp32(inputs[0].get(), &weight_, outputs[0].get(), support_bias_, &bias_, activate_input_dim,activate_output_dim, false, true, thread_count); + break; + } + case MLLM_TYPE_F16: break; + case MLLM_TYPE_Q4_0: { + mat_mul_elastic_fp32_q4_0(inputs[0].get(), &weight_, outputs[0].get(), support_bias_, &bias_, activate_input_dim,activate_output_dim, thread_count); + break; + } + case MLLM_TYPE_Q4_K: { + mat_mul_elastic_fp32_q4_K(inputs[0].get(), &weight_, outputs[0].get(), support_bias_, &bias_, activate_input_dim,activate_output_dim, thread_count); + break; + } + case MLLM_TYPE_Q6_K: { + mat_mul_elastic_fp32_q6_K(inputs[0].get(), &weight_, outputs[0].get(), 
support_bias_, &bias_, activate_input_dim,activate_output_dim, thread_count); + break; + } + default: + break; + } +// auto end = mllm::mllm_time_us(); +// printf("exec time: %ld us\n", end - start); + return Op::execute(inputs, outputs); +} +ErrorCode CPUElasticLinear::free(vector> inputs, vector> outputs) { + weight_.free(); + if (support_bias_) { + bias_.free(); + } + return Op::free(inputs, outputs); +} + +} // namespace mllm diff --git a/src/backends/cpu/CPUElasticLinear.hpp b/src/backends/cpu/CPUElasticLinear.hpp new file mode 100644 index 00000000..926fa64a --- /dev/null +++ b/src/backends/cpu/CPUElasticLinear.hpp @@ -0,0 +1,47 @@ +#ifndef MLLM_CPUELASTICLINEAR_H +#define MLLM_CPUELASTICLINEAR_H + +#include "Op.hpp" +#include "CPUBackend.hpp" + +namespace mllm { + +class Tensor; +class CPUElasticLinear final : public Op { +public: + CPUElasticLinear(Backend *bn, string opName, int in_features, int out_features, bool bias, int threadCount); + virtual ~CPUElasticLinear() = default; + virtual ErrorCode reshape(vector> inputs, vector> outputs) override; + virtual ErrorCode load(AbstructLoader &loader) override; + virtual ErrorCode execute(vector> inputs, vector> outputs) override; + virtual ErrorCode free(vector> inputs, vector> outputs) override; + + Tensor &weight() { + return weight_; + } + Tensor &bias() { + return bias_; + } + +private: + int in_features_; + int out_features_; + bool support_bias_; + int thread_count = 4; + Tensor weight_; + Tensor bias_; +}; + +class CPUElasticLinearCreator : public CPUBackend::Creator { +public: + virtual Op *create(OpParam op_param, Backend *bn, string name, int threadCount) const { + int in_features = op_param["in_features"]; + int out_features = op_param["out_features"]; + int bias = op_param["bias"]; + return new CPUElasticLinear(bn, name, in_features, out_features, (bool)bias, threadCount); + } +}; + +} // namespace mllm + +#endif // MLLM_CPUELASTICLINEAR_H \ No newline at end of file diff --git a/src/backends/cpu/compute/MatmulElastic.cpp b/src/backends/cpu/compute/MatmulElastic.cpp index 70049cb4..884cd5a6 100644 --- a/src/backends/cpu/compute/MatmulElastic.cpp +++ b/src/backends/cpu/compute/MatmulElastic.cpp @@ -6,10 +6,14 @@ #include -ErrorCode mat_mul_elastic_fp32(Tensor *src0, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, bool transpose0, bool transpose1, int thread_count) { +ErrorCode mat_mul_elastic_fp32(Tensor *src0, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, + int activate_input_dim, int activate_output_dim, + bool transpose0, bool transpose1, int thread_count) { const int M = transpose0 ? src0->dimension() : src0->sequence(); const int K = transpose0 ? src0->sequence() : src0->dimension(); const int N = transpose1 ? src1->sequence() : src1->dimension(); + int use_N = (activate_output_dim == -1) ? N : activate_output_dim; + int use_K = (activate_input_dim == -1) ? K : activate_input_dim; Tensor *src0_cal = src0; Tensor *src1_cal = src1; const int64_t blck_0 = 16; @@ -18,7 +22,6 @@ ErrorCode mat_mul_elastic_fp32(Tensor *src0, Tensor *src1, Tensor *dst, bool sup const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : b; const int h_1 = (src1->batch() == 1 && src1->head() == 1) ? 
0 : h; for (int m = 0; m < M; m++) { - int use_N = N; const int num_blocks = use_N / blck_0; const int remainder = use_N % blck_0; #pragma omp parallel for num_threads(thread_count) @@ -34,7 +37,7 @@ ErrorCode mat_mul_elastic_fp32(Tensor *src0, Tensor *src1, Tensor *dst, bool sup s_1 = 0; d_1 = n; s_0 = 0; d_0 = m; } if(dst->dtypeAt(b,h,m,n) == MLLM_TYPE_F32) { - vec_dot_fp32(K, dst->ptrAt(b, h, m, n), + vec_dot_fp32(use_K, dst->ptrAt(b, h, m, n), src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, s_1, d_1), src0_cal->hostPtr() + src0_cal->offset(b, h, s_0, d_0)); if (support_bias) { @@ -42,7 +45,7 @@ ErrorCode mat_mul_elastic_fp32(Tensor *src0, Tensor *src1, Tensor *dst, bool sup } }else if (dst->dtypeAt(b,h,m,n) == MLLM_TYPE_F16) { float tmp = 0; - vec_dot_fp32(K, &tmp, + vec_dot_fp32(use_K, &tmp, src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, s_1, d_1), src0_cal->hostPtr() + src0_cal->offset(b, h, s_0, d_0)); if (support_bias) { @@ -59,7 +62,9 @@ ErrorCode mat_mul_elastic_fp32(Tensor *src0, Tensor *src1, Tensor *dst, bool sup return MLLM_NO_ERROR; } -ErrorCode mat_mul_elastic_fp32_fp16(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, bool transpose0, bool transpose1, int thread_count) { +ErrorCode mat_mul_elastic_fp32_fp16(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, + int activate_input_dim, int activate_output_dim, + bool transpose0, bool transpose1, int thread_count) { assert(src1->dtype() == MLLM_TYPE_F16); assert(src0_->dtype() == MLLM_TYPE_F32); Tensor src0_qf16(src0_->shape()); @@ -77,17 +82,11 @@ ErrorCode mat_mul_elastic_fp32_fp16(Tensor *src0_, Tensor *src1, Tensor *dst, bo } } auto *src0 = &src0_qf16; - // for(int b=0; bdimension(); b++) { - // std::cout<ptrAt(0, 0, 0, b))<<" "; - // } - // std::cout<dimension(); b++) { - // std::cout<ptrAt(0, 0, 0, b))<<" "; - // } - // std::cout<dimension() : src0->sequence(); const int K = transpose0 ? src0->sequence() : src0->dimension(); const int N = transpose1 ? src1->sequence() : src1->dimension(); + int use_N = (activate_output_dim == -1) ? N : activate_output_dim; + int use_K = (activate_input_dim == -1) ? K : activate_input_dim; Tensor *src0_cal = src0; Tensor *src1_cal = src1; const int64_t blck_0 = 16; @@ -96,7 +95,6 @@ ErrorCode mat_mul_elastic_fp32_fp16(Tensor *src0_, Tensor *src1, Tensor *dst, bo const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : b; const int h_1 = (src1->batch() == 1 && src1->head() == 1) ? 
0 : h; for (int m = 0; m < M; m++) { - int use_N = N; const int num_blocks = use_N / blck_0; const int remainder = use_N % blck_0; #pragma omp parallel for num_threads(thread_count) @@ -111,7 +109,7 @@ ErrorCode mat_mul_elastic_fp32_fp16(Tensor *src0_, Tensor *src1, Tensor *dst, bo } else { s_1 = 0; d_1 = n; s_0 = 0; d_0 = m; } - vec_dot_fp16(K, dst->ptrAt(b, h, m, n), + vec_dot_fp16(use_K, dst->ptrAt(b, h, m, n), src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, s_1, d_1), src0_cal->hostPtr() + src0_cal->offset(b, h, s_0, d_0)); if (support_bias) { @@ -125,7 +123,8 @@ ErrorCode mat_mul_elastic_fp32_fp16(Tensor *src0_, Tensor *src1, Tensor *dst, bo return MLLM_NO_ERROR; } -ErrorCode mat_mul_elastic_fp32_q4_0(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, int thread_count) { +ErrorCode mat_mul_elastic_fp32_q4_0(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, + int activate_input_dim, int activate_output_dim, int thread_count) { assert(src1->dtype() == MLLM_TYPE_Q4_0); assert(src0_->dtype() == MLLM_TYPE_F32); Tensor src0_q8(src0_->shape()); @@ -152,6 +151,8 @@ ErrorCode mat_mul_elastic_fp32_q4_0(Tensor *src0_, Tensor *src1, Tensor *dst, bo int M = src0->sequence(); int K = src0->dimension(); int N = src1->sequence(); + int use_N = (activate_output_dim == -1) ? N : activate_output_dim; + int use_K = (activate_input_dim == -1) ? K : activate_input_dim; Tensor *src0_cal = src0; Tensor *src1_cal = src1; const int64_t blck_0 = 16; @@ -160,13 +161,12 @@ ErrorCode mat_mul_elastic_fp32_q4_0(Tensor *src0_, Tensor *src1, Tensor *dst, bo const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : b; const int h_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : h; for (int m = 0; m < M; m++) { - int use_N = N; const int num_blocks = use_N / blck_0; const int remainder = use_N % blck_0; #pragma omp parallel for num_threads(thread_count) for (int block = 0; block < num_blocks + 1; block++) { for (int n = block * blck_0; n < (block + 1) * blck_0 & n < num_blocks * blck_0 + remainder; n++) { - vec_dot_q4_0_q8_0(K, dst->ptrAt(b, h, m, n), + vec_dot_q4_0_q8_0(use_K, dst->ptrAt(b, h, m, n), src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, n, 0) / QK4_0, src0_cal->hostPtr() + src0_cal->offset(b, h, m, 0) / QK8_0); if (support_bias) { @@ -180,7 +180,8 @@ ErrorCode mat_mul_elastic_fp32_q4_0(Tensor *src0_, Tensor *src1, Tensor *dst, bo return MLLM_NO_ERROR; } -ErrorCode mat_mul_elastic_fp32_q4_K(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, int thread_count) { +ErrorCode mat_mul_elastic_fp32_q4_K(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, + int activate_input_dim, int activate_output_dim, int thread_count) { assert(src1->dtype() == MLLM_TYPE_Q4_K); assert(src0_->dtype() == MLLM_TYPE_F32); Tensor src0_q8(src0_->shape()); @@ -206,7 +207,9 @@ ErrorCode mat_mul_elastic_fp32_q4_K(Tensor *src0_, Tensor *src1, Tensor *dst, bo assert(src0->dtype() == MLLM_TYPE_Q8_K); int M = src0->sequence(); int K = src0->dimension(); - int N = src1->sequence(); + int N = src1->sequence(); + int use_N = (activate_output_dim == -1) ? N : activate_output_dim; + int use_K = (activate_input_dim == -1) ? K : activate_input_dim; Tensor *src0_cal = src0; Tensor *src1_cal = src1; const int64_t blck_0 = 16; @@ -216,14 +219,13 @@ ErrorCode mat_mul_elastic_fp32_q4_K(Tensor *src0_, Tensor *src1, Tensor *dst, bo const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 
0 : b; const int h_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : h; for (int m = 0; m < M; m++) { - int use_N = N; const int num_blocks = use_N / blck_0; const int remainder = use_N % blck_0; #pragma omp parallel for num_threads(thread_count) for (int block = 0; block < num_blocks + 1; block++) { for (int n = block * blck_0; n < (block + 1) * blck_0 & n < num_blocks * blck_0 + remainder; n++) { if(dst->dtypeAt(b,h,m,n) == MLLM_TYPE_F32) { - vec_dot_q4_K_q8_K(K, dst->ptrAt(b, h, m, n), + vec_dot_q4_K_q8_K(use_K, dst->ptrAt(b, h, m, n), src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, n, 0) / QK_K, src0_cal->hostPtr() + src0_cal->offset(b, h, m, 0) / QK_K); if (support_bias) { @@ -231,7 +233,7 @@ ErrorCode mat_mul_elastic_fp32_q4_K(Tensor *src0_, Tensor *src1, Tensor *dst, bo } } else if (dst->dtypeAt(b,h,m,n) == MLLM_TYPE_F16) { float tmp = 0; - vec_dot_q4_K_q8_K(K, &tmp, + vec_dot_q4_K_q8_K(use_K, &tmp, src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, n, 0) / QK_K, src0_cal->hostPtr() + src0_cal->offset(b, h, m, 0) / QK_K); if (support_bias) { @@ -248,7 +250,8 @@ ErrorCode mat_mul_elastic_fp32_q4_K(Tensor *src0_, Tensor *src1, Tensor *dst, bo return MLLM_NO_ERROR; } -ErrorCode mat_mul_elastic_fp32_q6_K(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, int thread_count) { +ErrorCode mat_mul_elastic_fp32_q6_K(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, + int activate_input_dim, int activate_output_dim, int thread_count) { assert(src1->dtype() == MLLM_TYPE_Q6_K); assert(src0_->dtype() == MLLM_TYPE_F32); Tensor src0_q8(src0_->shape()); @@ -275,6 +278,8 @@ ErrorCode mat_mul_elastic_fp32_q6_K(Tensor *src0_, Tensor *src1, Tensor *dst, bo int M = src0->sequence(); int K = src0->dimension(); int N = src1->sequence(); + int use_N = (activate_output_dim == -1) ? N : activate_output_dim; + int use_K = (activate_input_dim == -1) ? K : activate_input_dim; Tensor *src0_cal = src0; Tensor *src1_cal = src1; const int64_t blck_0 = 16; @@ -283,14 +288,13 @@ ErrorCode mat_mul_elastic_fp32_q6_K(Tensor *src0_, Tensor *src1, Tensor *dst, bo const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : b; const int h_1 = (src1->batch() == 1 && src1->head() == 1) ? 
0 : h; for (int m = 0; m < M; m++) { - int use_N = N; const int num_blocks = use_N / blck_0; const int remainder = use_N % blck_0; #pragma omp parallel for num_threads(thread_count) for (int block = 0; block < num_blocks + 1; block++) { for (int n = block * blck_0; n < (block + 1) * blck_0 & n < num_blocks * blck_0 + remainder; n++) { if (dst->dtypeAt(n, h, m, n) == MLLM_TYPE_F32) { - vec_dot_q6_K_q8_K(K, dst->ptrAt(b, h, m, n), + vec_dot_q6_K_q8_K(use_K, dst->ptrAt(b, h, m, n), src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, n, 0) / QK_K, src0_cal->hostPtr() + src0_cal->offset(b, h, m, 0) / QK_K); if (support_bias) { @@ -298,7 +302,7 @@ ErrorCode mat_mul_elastic_fp32_q6_K(Tensor *src0_, Tensor *src1, Tensor *dst, bo } } else if (dst->dtypeAt(n, h, m, n) == MLLM_TYPE_F16) { float tmp = 0; - vec_dot_q6_K_q8_K(K, &tmp, + vec_dot_q6_K_q8_K(use_K, &tmp, src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, n, 0) / QK_K, src0_cal->hostPtr() + src0_cal->offset(b, h, m, 0) / QK_K); diff --git a/src/backends/cpu/compute/MatmulElastic.hpp b/src/backends/cpu/compute/MatmulElastic.hpp index 828fe16e..cd4bef9d 100644 --- a/src/backends/cpu/compute/MatmulElastic.hpp +++ b/src/backends/cpu/compute/MatmulElastic.hpp @@ -9,10 +9,10 @@ #include "VecDot.hpp" using namespace mllm; -ErrorCode mat_mul_elastic_fp32(Tensor *src0, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, bool transpose0 = false, bool transpose1 = false, int thread_count=4); -ErrorCode mat_mul_elastic_fp32_fp16(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, bool transpose0 = false, bool transpose1 = false, int thread_count=4); -ErrorCode mat_mul_elastic_fp32_q4_0(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int thread_count=4); -ErrorCode mat_mul_elastic_fp32_q4_K(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int thread_count=4); -ErrorCode mat_mul_elastic_fp32_q6_K(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int thread_count=4); +ErrorCode mat_mul_elastic_fp32(Tensor *src0, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int activate_input_dim=-1, int activate_output_dim=-1, bool transpose0 = false, bool transpose1 = false, int thread_count=4); +ErrorCode mat_mul_elastic_fp32_fp16(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int activate_input_dim=-1, int activate_output_dim=-1,bool transpose0 = false, bool transpose1 = false, int thread_count=4); +ErrorCode mat_mul_elastic_fp32_q4_0(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int activate_input_dim=-1, int activate_output_dim=-1,int thread_count=4); +ErrorCode mat_mul_elastic_fp32_q4_K(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int activate_input_dim=-1, int activate_output_dim=-1,int thread_count=4); +ErrorCode mat_mul_elastic_fp32_q6_K(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int activate_input_dim=-1, int activate_output_dim=-1,int thread_count=4); #endif // MLLM_MATMUL_HPP diff --git a/src/models/llama/modeling_elastic_llama.hpp b/src/models/llama/modeling_elastic_llama.hpp new file mode 100644 index 00000000..a3114ba2 --- /dev/null +++ b/src/models/llama/modeling_elastic_llama.hpp @@ -0,0 +1,94 @@ +// +// Created by Rongjie Yi on 2024/2/4 0004. 
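+//
+// Elastic LLaMA: LLaMA blocks whose feed-forward projections are
+// ElasticLinear layers, so the activated width can be chosen per call
+// (-1 keeps the full dimension).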
+// + +#ifndef MODELING_LLAMA_HPP +#define MODELING_LLAMA_HPP + +#include "Layer.hpp" +#include "Module.hpp" +#include "configuration_llama.hpp" +#include "models/transformer/modeling_transformer.hpp" + +using namespace mllm; + +class ElasticLLaMAMLP final : public Module { + Layer gate_proj; + Layer silu; + Layer up_proj; + Layer down_proj; + +public: + ElasticLLaMAMLP() = default; + ElasticLLaMAMLP(int hidden_dim, int ffn_hidden, const LLaMANameConfig &names, const string &base_name) { + gate_proj = ElasticLinear(hidden_dim, ffn_hidden, false, base_name + names._gate_proj_name); + silu = SiLU(base_name + "act"); + up_proj = ElasticLinear(hidden_dim, ffn_hidden, false, base_name + names._up_proj_name); + down_proj = ElasticLinear(ffn_hidden, hidden_dim, false, base_name + names._down_proj_name); + } + vector Forward(vector inputs, vector args) override { + auto x = gate_proj(inputs[0], 256, -1); + x = silu(x); + auto y = up_proj(inputs[0], 256, -1); + x = x * y; + x = down_proj(x, 256, -1); + return {x}; + } +}; + +class ElasticLLaMABlock final : public Module { + MultiHeadAttention attention; + ElasticLLaMAMLP mlp; + Layer norm1; + Layer norm2; + +public: + ElasticLLaMABlock() = default; + ElasticLLaMABlock(int hidden_dim, int head_size, int ffn_hidden, RoPEType RoPE_type, int cache_limit, const LLaMANameConfig &names, const string &base_name) { + attention = MultiHeadAttention(hidden_dim, head_size, head_size, hidden_dim / head_size, SPLIT_NONE, false, false, + RoPE_type, cache_limit, true, false, names, base_name + names._attn_base_name); + mlp = ElasticLLaMAMLP(hidden_dim, ffn_hidden, names, base_name + names._ffn_base_name); + norm1 = RMSNorm(hidden_dim, 1e-6, base_name + names._attn_norm_name); + norm2 = RMSNorm(hidden_dim, 1e-6, base_name + names._ffn_norm_name); + } + vector Forward(vector inputs, vector args) override { + auto x = norm1(inputs[0]); + x = attention({x, x, x})[0]; + auto tmp = x + inputs[0]; + x = norm2(tmp); + x = mlp({x})[0]; + x = x + tmp; + return {x}; + } +}; + +class ElasticLLaMAModel final : public Module { + Layer embedding; + vector blocks; + Layer norm; + Layer lm_head; + +public: + explicit ElasticLLaMAModel(const LLaMAConfig &config) : + ElasticLLaMAModel(config.vocab_size, config.hidden_dim, config.head_size, config.ffn_hidden, config.block_num, config.RoPE_type, config.cache_limit, + config.names_config, config.names_config.blk_name) { + } + ElasticLLaMAModel(int vocab_size, int hidden_dim, int head_size, int ffn_hidden, int block_num, RoPEType RoPE_type, int cache_limit, + const LLaMANameConfig &names, const string &base_name) { + embedding = Embedding(vocab_size, hidden_dim, names.token_embd_name); + blocks = List(block_num, hidden_dim, head_size, ffn_hidden, RoPE_type, cache_limit, names, base_name); + norm = RMSNorm(hidden_dim, 1e-6, names.post_norm_name); + lm_head = Linear(hidden_dim, vocab_size, false, names.lm_head_name); + } + vector Forward(vector inputs, vector args) override { + auto x = embedding(inputs[0]); + for (auto &block : blocks) { + x = block({x})[0]; + } + x = norm(x); + x = lm_head(x); + return {x}; + } +}; + +#endif // MODELING_LLAMA_HPP \ No newline at end of file From f5acae49f3e8aba586344a7ed3d31a9f4c309fad Mon Sep 17 00:00:00 2001 From: Rongjie Yi <41737961+yirongjie@users.noreply.github.com> Date: Wed, 17 Jul 2024 21:46:10 +0800 Subject: [PATCH 4/7] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 48a29e43..a781cb3f 100644 --- a/README.md +++ 
From f5acae49f3e8aba586344a7ed3d31a9f4c309fad Mon Sep 17 00:00:00 2001
From: Rongjie Yi <41737961+yirongjie@users.noreply.github.com>
Date: Wed, 17 Jul 2024 21:46:10 +0800
Subject: [PATCH 4/7] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 48a29e43..a781cb3f 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ Wait.. why on-device multimodal LLM? - It's a key building block for [intelligen
 
 ## Recent update
 - [🔥🔥Comming soon] Supporting Qualcomm NPU: [>1000 tokens/second prefilling!](https://arxiv.org/pdf/2407.05858v1)
-V1- [2024 July 17] Support new model: StableLM V2 1.6B https://github.com/UbiquitousLearning/mllm/pull/94
+- [2024 July 17] Support new model: StableLM V2 1.6B https://github.com/UbiquitousLearning/mllm/pull/94
 - [2024 July 2] Support new model: Yi V1.5 6B https://github.com/UbiquitousLearning/mllm/pull/88
 - [2024 May 29] Support new model: Mistral V0.2 7B https://github.com/UbiquitousLearning/mllm/pull/83
 - [2024 May 4] Support new model: QWen V1.5 0.5B https://github.com/UbiquitousLearning/mllm/pull/79

From 89118f3e0fd6c8f14dd935875503859bd93cfbff Mon Sep 17 00:00:00 2001
From: yirongjie
Date: Thu, 18 Jul 2024 02:19:05 +0000
Subject: [PATCH 5/7] fix: demo elastic llama with attn elastic

---
 examples/demo_elastic_llama.cpp             |  8 +-
 src/backends/cpu/CPUElasticLinear.cpp       | 19 ++--
 src/models/llama/modeling_elastic_llama.hpp | 98 +++++++++++++++++++--
 3 files changed, 101 insertions(+), 24 deletions(-)

diff --git a/examples/demo_elastic_llama.cpp b/examples/demo_elastic_llama.cpp
index 94e90d07..d4301d0b 100644
--- a/examples/demo_elastic_llama.cpp
+++ b/examples/demo_elastic_llama.cpp
@@ -14,7 +14,7 @@ using namespace mllm;
 int main(int argc, char **argv) {
     cmdline::parser cmdParser;
     cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama_vocab.mllm");
-    cmdParser.add("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_k.mllm");
+    cmdParser.add("model", 'm', "specify mllm model path", false, "../models/elasticllama-2-7b-chat-q4_k.mllm");
     cmdParser.add("limits", 'l', "max KV cache size", false, 400);
     cmdParser.add("thread", 't', "num of threads", false, 4);
     cmdParser.parse_check(argc, argv);
@@ -40,8 +40,10 @@ int main(int argc, char **argv) {
         auto input_tensor = tokenizer.tokenize(in_str, i);
         std::cout << "[Q] " << in_str << std::endl;
         std::cout << "[A] " << std::flush;
-        for (int step = 0; step < 1; step++) {
-            auto result = model({input_tensor});
+        for (int step = 0; step < 100; step++) {
+            // vector activate_dims = {32*256,256}; // 32*256 is attn_head*attn_hidden_dim(e.g. llama:32*128); 256 is ffn_hidden_dim(e.g. llama:11008)
+            vector activate_dims = {-1,-1};
+            auto result = model({input_tensor}, activate_dims);
             auto outputs = tokenizer.detokenize(result[0]);
             auto out_string = outputs.first;
             auto out_token = outputs.second;
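The demo now passes activate_dims as an extra argument to model(...); inside the module tree it travels as a std::any and is recovered with any_cast. A minimal standalone round-trip sketch follows (illustrative function names, not the mllm API):

#include <any>
#include <iostream>
#include <vector>

void forward_block(const std::vector<std::any> &args) {
    // Recover the per-call request packed by the caller.
    auto dims = std::any_cast<std::vector<int>>(args[0]);
    int attn_dim = dims[0]; // -1 => use the full attention width
    int ffn_dim  = dims[1]; // -1 => use the full FFN width
    std::cout << "attn=" << attn_dim << " ffn=" << ffn_dim << "\n";
}

int main() {
    std::vector<int> activate_dims = {-1, -1};   // full-width model
    forward_block({activate_dims});
    activate_dims = {32 * 8, 256};               // shrunk attention + FFN (example values)
    forward_block({activate_dims});
    return 0;
}

If the packed type and the cast type disagree, any_cast throws std::bad_any_cast; the loader changes in patch 7 below appear to lean on exactly that behaviour to probe which argument shape a model expects.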
diff --git a/src/backends/cpu/CPUElasticLinear.cpp b/src/backends/cpu/CPUElasticLinear.cpp
index 85a8c603..cfd877ba 100644
--- a/src/backends/cpu/CPUElasticLinear.cpp
+++ b/src/backends/cpu/CPUElasticLinear.cpp
@@ -18,24 +18,17 @@ ErrorCode CPUElasticLinear::reshape(vector> inputs, vector> outputs) {
+    int activate_input_dim = (int)inputs[1]->dataAt(0,0,0,0);
+    int activate_output_dim = (int)inputs[2]->dataAt(0,0,0,0);
     if(inputs[0]->count() == 0) {
         outputs[0]->reshape(0,0,0,0);
         return Op::reshape(inputs, outputs);
     }
-    // N     |    C       |   H                   |  W
-    // -----------------------------------------------
-    // 1     |out_channel | in_channel            |  1
-    //       |out_features| in_features           |
-    // -----------------------------------------------
-    // batch |in_channel  | seq_len               |  1
-    //       |in_features | inputs[0]->sequence() |
-    // -----------------------------------------------
-    // batch |out_channel | seq_len               |  1
-    //       |out_features| inputs[0]->sequence() |
+    int in_dimension = (activate_input_dim == -1) ? in_features_ : activate_input_dim;
+    int out_dimension = (activate_output_dim == -1) ? out_features_ : activate_output_dim;
     assert(inputs[0]->head() == 1);
-    assert(in_features_ == inputs[0]->dimension());
-    outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence(), out_features_);
-    //outputs[0]->setDtype(activationDtype());
+    assert(in_dimension == inputs[0]->dimension());
+    outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence(), out_dimension);
     return Op::reshape(inputs, outputs);
 }
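The reshape above reads the requested widths from two extra scalar inputs and resolves the -1 sentinel against the layer's full in/out features. A standalone sketch of that rule (illustrative types, not mllm code):

#include <cassert>

struct Shape { int batch, head, sequence, dimension; };

// Mirrors the elastic Linear reshape: keep batch/head/sequence, swap the last
// dimension for the active output width, and require the input to already be
// fed at the active input width.
Shape elastic_linear_out_shape(const Shape &in, int in_features, int out_features,
                               int activate_input_dim, int activate_output_dim) {
    const int in_dim  = (activate_input_dim  == -1) ? in_features  : activate_input_dim;
    const int out_dim = (activate_output_dim == -1) ? out_features : activate_output_dim;
    assert(in.head == 1);
    assert(in.dimension == in_dim);
    return {in.batch, in.head, in.sequence, out_dim};
}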
diff --git a/src/models/llama/modeling_elastic_llama.hpp b/src/models/llama/modeling_elastic_llama.hpp
index a3114ba2..3f4c9dd4 100644
--- a/src/models/llama/modeling_elastic_llama.hpp
+++ b/src/models/llama/modeling_elastic_llama.hpp
@@ -9,9 +9,84 @@
 #include "Module.hpp"
 #include "configuration_llama.hpp"
 #include "models/transformer/modeling_transformer.hpp"
+#include
 
 using namespace mllm;
 
+class ElasticMultiHeadAttention final : public Module {
+    Layer q_proj;
+    Layer k_proj;
+    Layer v_proj;
+    Layer q_rope;
+    Layer k_rope;
+    Layer k_cache;
+    Layer v_cache;
+    Layer mask;
+    Layer softmax;
+    Layer o_proj;
+    int head_size_{};
+    int kv_head_size_{};
+    int attn_hidden_dim_{};
+
+public:
+    ElasticMultiHeadAttention() = default;
+    ElasticMultiHeadAttention(int hidden_dim, int head_size, int kv_head_size, int attn_hidden_dim,
+                              RoPEType RoPE_type, int cache_limit, bool do_mask, bool bias,
+                              const TransformerNameConfig &names, const string &base_name) {
+        attn_hidden_dim_ = attn_hidden_dim;
+        head_size_ = head_size;
+        kv_head_size_ = kv_head_size;
+        q_proj = ElasticLinear(hidden_dim, head_size * attn_hidden_dim, bias, base_name + names._q_proj_name);
+        k_proj = ElasticLinear(hidden_dim, kv_head_size * attn_hidden_dim, bias, base_name + names._k_proj_name);
+        v_proj = ElasticLinear(hidden_dim, kv_head_size * attn_hidden_dim, bias, base_name + names._v_proj_name);
+
+        if (RoPE_type > 0) {
+            q_rope = RoPE(RoPE_type, base_name + "q_rope");
+            k_rope = RoPE(RoPE_type, base_name + "k_rope");
+        }
+        if (cache_limit > 0) {
+            k_cache = KVCache(head_size/kv_head_size, cache_limit, base_name + "k_cache");
+            v_cache = KVCache(head_size/kv_head_size, cache_limit, base_name + "v_cache");
+        }
+        if (do_mask) {
+            mask = Causalmask(base_name + "mask");
+        }
+        softmax = Softmax(DIMENSION, base_name + "softmax");
+        o_proj = ElasticLinear(head_size * attn_hidden_dim, hidden_dim, bias, base_name + names._o_proj_name);
+    }
+    vector Forward(vector inputs, vector args) override {
+        vector activate_dims = std::any_cast>(args[0]);
+        int activate_dim = activate_dims[0];
+        int activate_hidden_dim = (activate_dim == -1) ? attn_hidden_dim_ : (activate_dim / head_size_);
+        Tensor q, k, v;
+        q = q_proj(inputs[0], -1, activate_dim);
+        k = k_proj(inputs[1], -1, activate_dim);
+        v = v_proj(inputs[2], -1, activate_dim);
+        q = q.view(-1, head_size_, -1, activate_hidden_dim);
+        k = k.view(-1, kv_head_size_, -1, activate_hidden_dim);
+        v = v.view(-1, kv_head_size_, -1, activate_hidden_dim);
+        if (q_rope.ready() && k_rope.ready()) {
+            q = q_rope(q);
+            k = k_rope(k);
+        }
+        if (k_cache.ready() && v_cache.ready()) {
+            k = k_cache(k);
+            v = v_cache(v);
+        }
+        k = k.transpose(SEQUENCE, DIMENSION);
+        auto qk = Tensor::mm(q, k);
+        qk = qk / std::sqrt(activate_hidden_dim); //attn_hidden_dim_
+        if (mask.ready()) {
+            qk = mask(qk);
+        }
+        qk = softmax(qk);
+        auto o = Tensor::mm(qk, v);
+        o = o.view(-1, 1, -1, activate_hidden_dim * head_size_);
+        o = o_proj(o, activate_dim, -1);
+        return {o};
+    }
+};
+
 class ElasticLLaMAMLP final : public Module {
     Layer gate_proj;
     Layer silu;
@@ -27,17 +102,19 @@ class ElasticLLaMAMLP final : public Module {
         down_proj = ElasticLinear(ffn_hidden, hidden_dim, false, base_name + names._down_proj_name);
     }
     vector Forward(vector inputs, vector args) override {
-        auto x = gate_proj(inputs[0], 256, -1);
+        vector activate_dims = std::any_cast>(args[0]);
+        int activate_dim = activate_dims[0];
+        auto x = gate_proj(inputs[0], -1, activate_dim);
         x = silu(x);
-        auto y = up_proj(inputs[0], 256, -1);
+        auto y = up_proj(inputs[0], -1, activate_dim);
         x = x * y;
-        x = down_proj(x, 256, -1);
+        x = down_proj(x, activate_dim, -1);
         return {x};
     }
 };
 
 class ElasticLLaMABlock final : public Module {
-    MultiHeadAttention attention;
+    ElasticMultiHeadAttention attention;
     ElasticLLaMAMLP mlp;
     Layer norm1;
     Layer norm2;
@@ -45,18 +122,21 @@ class ElasticLLaMABlock final : public Module {
 public:
     ElasticLLaMABlock() = default;
     ElasticLLaMABlock(int hidden_dim, int head_size, int ffn_hidden, RoPEType RoPE_type, int cache_limit, const LLaMANameConfig &names, const string &base_name) {
-        attention = MultiHeadAttention(hidden_dim, head_size, head_size, hidden_dim / head_size, SPLIT_NONE, false, false,
+        attention = ElasticMultiHeadAttention(hidden_dim, head_size, head_size, hidden_dim / head_size,
                                        RoPE_type, cache_limit, true, false, names, base_name + names._attn_base_name);
         mlp = ElasticLLaMAMLP(hidden_dim, ffn_hidden, names, base_name + names._ffn_base_name);
         norm1 = RMSNorm(hidden_dim, 1e-6, base_name + names._attn_norm_name);
         norm2 = RMSNorm(hidden_dim, 1e-6, base_name + names._ffn_norm_name);
     }
     vector Forward(vector inputs, vector args) override {
+        vector activate_dims = std::any_cast>(args[0]);
+        vector dim_attns = {activate_dims[0]};
+        vector dim_mlps = {activate_dims[1]};
         auto x = norm1(inputs[0]);
-        x = attention({x, x, x})[0];
+        x = attention({x, x, x}, dim_attns)[0];
         auto tmp = x + inputs[0];
         x = norm2(tmp);
-        x = mlp({x})[0];
+        x = mlp({x}, dim_mlps)[0];
         x = x + tmp;
         return {x};
     }
@@ -81,9 +161,11 @@ class ElasticLLaMAModel final : public Module {
         lm_head = Linear(hidden_dim, vocab_size, false, names.lm_head_name);
     }
     vector Forward(vector inputs, vector args) override {
+        vector activate_dims = std::any_cast>(args[0]);
+
         auto x = embedding(inputs[0]);
         for (auto &block : blocks) {
-            x = block({x})[0];
+            x = block({x}, activate_dims)[0];
         }
         x = norm(x);
         x = lm_head(x);
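ElasticMultiHeadAttention interprets its activate_dim as head_size * active_head_dim and rescales the softmax logits by the active head dim rather than the full one. A standalone sketch of that bookkeeping (illustrative, not mllm code):

#include <cassert>
#include <cmath>

struct AttnDims { int head_dim; float scale; };

// activate_dim is head_size * active_head_dim, or -1 for the full width.
AttnDims elastic_attn_dims(int activate_dim, int head_size, int full_head_dim) {
    assert(activate_dim == -1 || activate_dim % head_size == 0);
    const int head_dim = (activate_dim == -1) ? full_head_dim : activate_dim / head_size;
    return {head_dim, 1.0f / std::sqrt(static_cast<float>(head_dim))};
}

Scaling by the active width keeps the logit variance roughly constant as the attention projections are shrunk, which is what the qk / std::sqrt(activate_hidden_dim) line in the diff does.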
From 51855901515d4f4b8822f76c5ee59b86800d1567 Mon Sep 17 00:00:00 2001
From: yirongjie
Date: Thu, 18 Jul 2024 02:20:59 +0000
Subject: [PATCH 6/7] fix

---
 src/models/llama/modeling_elastic_llama.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/models/llama/modeling_elastic_llama.hpp b/src/models/llama/modeling_elastic_llama.hpp
index 3f4c9dd4..b340c5dc 100644
--- a/src/models/llama/modeling_elastic_llama.hpp
+++ b/src/models/llama/modeling_elastic_llama.hpp
@@ -8,7 +8,6 @@
 #include "Layer.hpp"
 #include "Module.hpp"
 #include "configuration_llama.hpp"
-#include "models/transformer/modeling_transformer.hpp"
 #include
 
 using namespace mllm;

From 32f72589e14d0745ee7ea28a365c0c6b8ea5c143 Mon Sep 17 00:00:00 2001
From: yirongjie
Date: Thu, 18 Jul 2024 11:00:48 +0000
Subject: [PATCH 7/7] fix: load

---
 examples/demo_elastic_llama.cpp             | 40 +++++++++++++++++++--
 src/Module.hpp                              | 25 +++++++++++--
 src/ParamLoader.cpp                         |  6 +++-
 src/models/llama/modeling_elastic_llama.hpp | 18 ++++++++--
 4 files changed, 79 insertions(+), 10 deletions(-)

diff --git a/examples/demo_elastic_llama.cpp b/examples/demo_elastic_llama.cpp
index d4301d0b..585df659 100644
--- a/examples/demo_elastic_llama.cpp
+++ b/examples/demo_elastic_llama.cpp
@@ -14,7 +14,7 @@ using namespace mllm;
 int main(int argc, char **argv) {
     cmdline::parser cmdParser;
     cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama_vocab.mllm");
-    cmdParser.add("model", 'm', "specify mllm model path", false, "../models/elasticllama-2-7b-chat-q4_k.mllm");
+    cmdParser.add("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_k.mllm");
     cmdParser.add("limits", 'l', "max KV cache size", false, 400);
     cmdParser.add("thread", 't', "num of threads", false, 4);
     cmdParser.parse_check(argc, argv);
@@ -41,8 +41,42 @@ int main(int argc, char **argv) {
         std::cout << "[Q] " << in_str << std::endl;
         std::cout << "[A] " << std::flush;
         for (int step = 0; step < 100; step++) {
-            // vector activate_dims = {32*256,256}; // 32*256 is attn_head*attn_hidden_dim(e.g. llama:32*128); 256 is ffn_hidden_dim(e.g. llama:11008)
-            vector activate_dims = {-1,-1};
+            // vector> activate_dims = {{32*8,256}};
+            // 32*8 is attn_head*attn_hidden_dim(e.g. llama:32*128); 256 is ffn_hidden_dim(e.g. llama:11008)
+            vector> activate_dims = {
+                {-1,-1}, //0
+                {-1,-1}, //1
+                {-1,-1}, //2
+                {-1,-1}, //3
+                {-1,-1}, //4
+                {-1,-1}, //5
+                {-1,-1}, //6
+                {-1,-1}, //7
+                {-1,-1}, //8
+                {-1,-1}, //9
+                {-1,-1}, //10
+                {-1,-1}, //11
+                {-1,-1}, //12
+                {-1,-1}, //13
+                {-1,-1}, //14
+                {-1,-1}, //15
+                {-1,-1}, //16
+                {-1,-1}, //17
+                {-1,-1}, //18
+                {-1,-1}, //19
+                {-1,-1}, //20
+                {-1,-1}, //21
+                {-1,-1}, //22
+                {-1,-1}, //23
+                {-1,-1}, //24
+                {-1,-1}, //25
+                {-1,-1}, //26
+                {-1,-1}, //27
+                {-1,-1}, //28
+                {-1,-1}, //29
+                {-1,-1}, //30
+                {-1,-1}  //31
+            };
             auto result = model({input_tensor}, activate_dims);
             auto outputs = tokenizer.detokenize(result[0]);
             auto out_string = outputs.first;
             auto out_token = outputs.second;
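The 32-entry table above simply requests full width ({-1,-1}) for every LLaMA-7B block; it can also be built programmatically rather than written out. A small usage sketch (illustrative values, not part of the patch):

#include <vector>

// One {attn_dim, ffn_dim} pair per transformer block; -1 means full width.
std::vector<std::vector<int>> full_width_dims(int num_blocks) {
    return std::vector<std::vector<int>>(num_blocks, {-1, -1});
}

// e.g. shrink only the last 8 of 32 blocks (example widths):
// auto dims = full_width_dims(32);
// for (int i = 24; i < 32; ++i) dims[i] = {32 * 64, 5504};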
diff --git a/src/Module.hpp b/src/Module.hpp
index 12c57d59..c0470e01 100644
--- a/src/Module.hpp
+++ b/src/Module.hpp
@@ -12,6 +12,7 @@
 #include "backends/cpu/CPUBackend.hpp"
 #include
+#include
 #include
 #include
 #include
@@ -70,9 +71,27 @@ class Module {
             Tensor::gph_[std::to_string(i)] = Tensor(Module::backends[MLLM_CPU]);
             tmps.push_back(Tensor::gph_[std::to_string(i)]);
         }
-        vector tmpt = {0, 0};
-        uint64_t time_start = mllm_time_us();
-        operator()(tmps, tmpt);
+        vector alternate_args={
+            {},
+            vector{0, 0},
+            std::vector>(32, std::vector(2))
+        };
+        uint64_t time_start = 0;
+        for (auto args : alternate_args) {
+            time_start = mllm_time_us();
+            try {
+                operator()(tmps, args);
+                break;
+            } catch (const std::exception& e) {
+                if("bad any_cast" != e.what()) {
+                    std::cerr << e.what() << std::endl;
+                    exit(0);
+                }
+            } catch (...) {
+                std::cerr << "load error" << std::endl;
+                exit(0);
+            }
+        }
         uint64_t time_end = mllm_time_us();
         load_time_ = (time_end - time_start) / 1000.0F;//ms
         Module::doLoad = false;
diff --git a/src/ParamLoader.cpp b/src/ParamLoader.cpp
index f9029d8b..1dd51256 100644
--- a/src/ParamLoader.cpp
+++ b/src/ParamLoader.cpp
@@ -2,6 +2,7 @@
 #include "Types.hpp"
 #include
 #include
+#include
 #include
 #include
 #include
@@ -123,7 +124,10 @@ std::tuple ParamLoader::load(string name) {
 }
 DataType ParamLoader::getDataType(string name) {
     if (data_type_.count(name) != 1) {
-        if (this->fp_ != nullptr) {
+        if(this->path_ != "" && this->fp_ == nullptr){
+            std::cerr<path_<<" not found"<
+            exit(0);
+        }else if (this->fp_ != nullptr && this->path_ != "") {
             std::cerr<
diff --git a/src/models/llama/modeling_elastic_llama.hpp b/src/models/llama/modeling_elastic_llama.hpp
--- a/src/models/llama/modeling_elastic_llama.hpp
+++ b/src/models/llama/modeling_elastic_llama.hpp
 #include
 #include
 using namespace mllm;
@@ -146,6 +147,7 @@ class ElasticLLaMAModel final : public Module {
     vector blocks;
     Layer norm;
     Layer lm_head;
+    int num_layer_size;
 
 public:
     explicit ElasticLLaMAModel(const LLaMAConfig &config) :
@@ -158,13 +160,23 @@ class ElasticLLaMAModel final : public Module {
         blocks = List(block_num, hidden_dim, head_size, ffn_hidden, RoPE_type, cache_limit, names, base_name);
         norm = RMSNorm(hidden_dim, 1e-6, names.post_norm_name);
         lm_head = Linear(hidden_dim, vocab_size, false, names.lm_head_name);
+        num_layer_size = block_num;
     }
     vector Forward(vector inputs, vector args) override {
-        vector activate_dims = std::any_cast>(args[0]);
+        // vector activate_dims = std::any_cast>(args[0]);
+        // assert(activate_dims.size() == 2*num_layer_size);
+        // auto x = embedding(inputs[0]);
+        // for (int id = 0; id activate_dims_ = {activate_dims[id], activate_dims[id+1]};
+        //     x = blocks[id]({x}, activate_dims_)[0];
+        // }
+
+        vector> activate_dims = std::any_cast>>(args[0]);
+        assert(activate_dims.size() == num_layer_size);
         auto x = embedding(inputs[0]);
-        for (auto &block : blocks) {
-            x = block({x}, activate_dims)[0];
+        for (int id = 0; id