From 7324f00bdb32907aee184995f7f6fd5e95947447 Mon Sep 17 00:00:00 2001 From: yirongjie Date: Tue, 16 Jul 2024 08:58:54 +0000 Subject: [PATCH 1/7] fix: remove type/type -> VecDotType --- src/backends/cpu/CPUPredictor.cpp | 2 +- src/backends/cpu/compute/Matmul.cpp | 2 +- src/backends/cpu/compute/MatmulElastic.cpp | 320 ++++++++++++++++++ src/backends/cpu/compute/MatmulElastic.hpp | 18 + src/backends/cpu/compute/VecDot.cpp | 71 +++- src/backends/cpu/compute/VecDot.hpp | 20 +- .../{type/type.cpp => compute/VecDotType.cpp} | 194 +---------- .../{type/type.hpp => compute/VecDotType.hpp} | 0 8 files changed, 440 insertions(+), 187 deletions(-) create mode 100644 src/backends/cpu/compute/MatmulElastic.cpp create mode 100644 src/backends/cpu/compute/MatmulElastic.hpp rename src/backends/cpu/{type/type.cpp => compute/VecDotType.cpp} (64%) rename src/backends/cpu/{type/type.hpp => compute/VecDotType.hpp} (100%) diff --git a/src/backends/cpu/CPUPredictor.cpp b/src/backends/cpu/CPUPredictor.cpp index c84168e1..c852fe1a 100644 --- a/src/backends/cpu/CPUPredictor.cpp +++ b/src/backends/cpu/CPUPredictor.cpp @@ -1,6 +1,6 @@ #include "CPUPredictor.hpp" -#include "type/type.hpp" +#include "compute/VecDotType.hpp" #include "compute/Matmul.hpp" #include diff --git a/src/backends/cpu/compute/Matmul.cpp b/src/backends/cpu/compute/Matmul.cpp index 09387bc6..3382721f 100644 --- a/src/backends/cpu/compute/Matmul.cpp +++ b/src/backends/cpu/compute/Matmul.cpp @@ -3,7 +3,7 @@ // #include "Matmul.hpp" -#include "type/type.hpp" +#include "VecDotType.hpp" #include #define ASSERT(x) \ diff --git a/src/backends/cpu/compute/MatmulElastic.cpp b/src/backends/cpu/compute/MatmulElastic.cpp new file mode 100644 index 00000000..70049cb4 --- /dev/null +++ b/src/backends/cpu/compute/MatmulElastic.cpp @@ -0,0 +1,320 @@ +// +// Created by Rongjie Yi on 23-10-24. +// + +#include "MatmulElastic.hpp" +#include + + +ErrorCode mat_mul_elastic_fp32(Tensor *src0, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, bool transpose0, bool transpose1, int thread_count) { + const int M = transpose0 ? src0->dimension() : src0->sequence(); + const int K = transpose0 ? src0->sequence() : src0->dimension(); + const int N = transpose1 ? src1->sequence() : src1->dimension(); + Tensor *src0_cal = src0; + Tensor *src1_cal = src1; + const int64_t blck_0 = 16; + for (int b = 0; b < src0->batch(); b++) { + for (int h = 0; h < src0->head(); h++) { + const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : b; + const int h_1 = (src1->batch() == 1 && src1->head() == 1) ? 
0 : h; + for (int m = 0; m < M; m++) { + int use_N = N; + const int num_blocks = use_N / blck_0; + const int remainder = use_N % blck_0; +#pragma omp parallel for num_threads(thread_count) + for (int block = 0; block < num_blocks + 1; block++) { + for (int n = block * blck_0; n < (block + 1) * blck_0 & n < num_blocks * blck_0 + remainder; n++) { + int s_1, d_1; + int s_0, d_0; + if (!transpose0 && transpose1) { + s_1 = n; d_1 = 0; s_0 = m; d_0 = 0; + } else if (!transpose0 && !transpose1) { + s_1 = 0; d_1 = n; s_0 = m; d_0 = 0; + } else { + s_1 = 0; d_1 = n; s_0 = 0; d_0 = m; + } + if(dst->dtypeAt(b,h,m,n) == MLLM_TYPE_F32) { + vec_dot_fp32(K, dst->ptrAt(b, h, m, n), + src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, s_1, d_1), + src0_cal->hostPtr() + src0_cal->offset(b, h, s_0, d_0)); + if (support_bias) { + *dst->ptrAt(b, h, m, n) += bias->dataAt(0, 0, 0, n); + } + }else if (dst->dtypeAt(b,h,m,n) == MLLM_TYPE_F16) { + float tmp = 0; + vec_dot_fp32(K, &tmp, + src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, s_1, d_1), + src0_cal->hostPtr() + src0_cal->offset(b, h, s_0, d_0)); + if (support_bias) { + *dst->ptrAt(b, h, m, n) = MLLM_FP32_TO_FP16(tmp + bias->dataAt(0, 0, 0, n)); + } else { + *dst->ptrAt(b, h, m, n) = MLLM_FP32_TO_FP16(tmp); + } + }else{std::cout<<"Not support type [Matmul]"<dtype() == MLLM_TYPE_F16); + assert(src0_->dtype() == MLLM_TYPE_F32); + Tensor src0_qf16(src0_->shape()); + src0_qf16.setBackend(src0_->backend()); + src0_qf16.setDtype(MLLM_TYPE_F16); + src0_qf16.alloc(); + for (int b = 0; b < src0_->batch(); b++) { + for (int h = 0; h < src0_->head(); h++) { +#pragma omp parallel for num_threads(thread_count) + for (int s = 0; s < src0_->sequence(); s++) { + mllm_fp32_to_fp16_row(src0_->hostPtr() + src0_->offset(b, h, s, 0), + src0_qf16.hostPtr() + src0_qf16.offset(b, h, s, 0), + src0_->dimension()); + } + } + } + auto *src0 = &src0_qf16; + // for(int b=0; bdimension(); b++) { + // std::cout<ptrAt(0, 0, 0, b))<<" "; + // } + // std::cout<dimension(); b++) { + // std::cout<ptrAt(0, 0, 0, b))<<" "; + // } + // std::cout<dimension() : src0->sequence(); + const int K = transpose0 ? src0->sequence() : src0->dimension(); + const int N = transpose1 ? src1->sequence() : src1->dimension(); + Tensor *src0_cal = src0; + Tensor *src1_cal = src1; + const int64_t blck_0 = 16; + for (int b = 0; b < src0->batch(); b++) { + for (int h = 0; h < src0->head(); h++) { + const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : b; + const int h_1 = (src1->batch() == 1 && src1->head() == 1) ? 
0 : h; + for (int m = 0; m < M; m++) { + int use_N = N; + const int num_blocks = use_N / blck_0; + const int remainder = use_N % blck_0; +#pragma omp parallel for num_threads(thread_count) + for (int block = 0; block < num_blocks + 1; block++) { + for (int n = block * blck_0; n < (block + 1) * blck_0 & n < num_blocks * blck_0 + remainder; n++) { + int s_1, d_1; + int s_0, d_0; + if (!transpose0 && transpose1) { + s_1 = n; d_1 = 0; s_0 = m; d_0 = 0; + } else if (!transpose0 && !transpose1) { + s_1 = 0; d_1 = n; s_0 = m; d_0 = 0; + } else { + s_1 = 0; d_1 = n; s_0 = 0; d_0 = m; + } + vec_dot_fp16(K, dst->ptrAt(b, h, m, n), + src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, s_1, d_1), + src0_cal->hostPtr() + src0_cal->offset(b, h, s_0, d_0)); + if (support_bias) { + *dst->ptrAt(b, h, m, n) += bias->dataAt(0, 0, 0, n); + } + } + } + } + } + } + return MLLM_NO_ERROR; +} + +ErrorCode mat_mul_elastic_fp32_q4_0(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, int thread_count) { + assert(src1->dtype() == MLLM_TYPE_Q4_0); + assert(src0_->dtype() == MLLM_TYPE_F32); + Tensor src0_q8(src0_->shape()); + src0_q8.setBackend(src0_->backend()); + src0_q8.setDtype(MLLM_TYPE_Q8_0); + src0_q8.alloc(); + if (src0_->dimension() % QK8_0 == 0) { + for (int b = 0; b < src0_->batch(); b++) { + for (int h = 0; h < src0_->head(); h++) { +#pragma omp parallel for num_threads(thread_count) + for (int s = 0; s < src0_->sequence(); s++) { + quantize_row_q8_0(src0_->hostPtr() + src0_->offset(b, h, s, 0), + src0_q8.hostPtr() + src0_q8.offset(b, h, s, 0) / QK8_0, + src0_->dimension()); + } + } + } + } else { + std::cout << "[ERROR]: " << src0_->dimension() << "%" << QK8_0 << "!=0" << std::endl; + assert(src0_->dimension() % QK8_0 == 0); + } + auto *src0 = &src0_q8; + assert(src0->dtype() == MLLM_TYPE_Q8_0); + int M = src0->sequence(); + int K = src0->dimension(); + int N = src1->sequence(); + Tensor *src0_cal = src0; + Tensor *src1_cal = src1; + const int64_t blck_0 = 16; + for (int b = 0; b < src0->batch(); b++) { + for (int h = 0; h < src0->head(); h++) { + const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : b; + const int h_1 = (src1->batch() == 1 && src1->head() == 1) ? 
0 : h; + for (int m = 0; m < M; m++) { + int use_N = N; + const int num_blocks = use_N / blck_0; + const int remainder = use_N % blck_0; +#pragma omp parallel for num_threads(thread_count) + for (int block = 0; block < num_blocks + 1; block++) { + for (int n = block * blck_0; n < (block + 1) * blck_0 & n < num_blocks * blck_0 + remainder; n++) { + vec_dot_q4_0_q8_0(K, dst->ptrAt(b, h, m, n), + src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, n, 0) / QK4_0, + src0_cal->hostPtr() + src0_cal->offset(b, h, m, 0) / QK8_0); + if (support_bias) { + *dst->ptrAt(b, h, m, n) += bias->dataAt(0, 0, 0, n); + } + } + } + } + } + } + return MLLM_NO_ERROR; +} + +ErrorCode mat_mul_elastic_fp32_q4_K(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, int thread_count) { + assert(src1->dtype() == MLLM_TYPE_Q4_K); + assert(src0_->dtype() == MLLM_TYPE_F32); + Tensor src0_q8(src0_->shape()); + src0_q8.setBackend(src0_->backend()); + src0_q8.setDtype(MLLM_TYPE_Q8_K); + src0_q8.alloc(); + if (src0_->dimension() % QK_K == 0) { + for (int b = 0; b < src0_->batch(); b++) { + for (int h = 0; h < src0_->head(); h++) { +#pragma omp parallel for num_threads(thread_count) + for (int s = 0; s < src0_->sequence(); s++) { + quantize_row_q8_K(src0_->hostPtr() + src0_->offset(b, h, s, 0), + src0_q8.hostPtr() + src0_q8.offset(b, h, s, 0) / QK_K, + src0_->dimension()); + } + } + } + } else { + std::cout << "[ERROR]: " << src0_->dimension() << "%" << QK_K << "!=0" << std::endl; + assert(src0_->dimension() % QK_K == 0); + } + auto *src0 = &src0_q8; + assert(src0->dtype() == MLLM_TYPE_Q8_K); + int M = src0->sequence(); + int K = src0->dimension(); + int N = src1->sequence(); + Tensor *src0_cal = src0; + Tensor *src1_cal = src1; + const int64_t blck_0 = 16; + + for (int b = 0; b < src0->batch(); b++) { + for (int h = 0; h < src0->head(); h++) { + const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : b; + const int h_1 = (src1->batch() == 1 && src1->head() == 1) ? 
0 : h; + for (int m = 0; m < M; m++) { + int use_N = N; + const int num_blocks = use_N / blck_0; + const int remainder = use_N % blck_0; +#pragma omp parallel for num_threads(thread_count) + for (int block = 0; block < num_blocks + 1; block++) { + for (int n = block * blck_0; n < (block + 1) * blck_0 & n < num_blocks * blck_0 + remainder; n++) { + if(dst->dtypeAt(b,h,m,n) == MLLM_TYPE_F32) { + vec_dot_q4_K_q8_K(K, dst->ptrAt(b, h, m, n), + src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, n, 0) / QK_K, + src0_cal->hostPtr() + src0_cal->offset(b, h, m, 0) / QK_K); + if (support_bias) { + *dst->ptrAt(b, h, m, n) += bias->dataAt(0, 0, 0, n); + } + } else if (dst->dtypeAt(b,h,m,n) == MLLM_TYPE_F16) { + float tmp = 0; + vec_dot_q4_K_q8_K(K, &tmp, + src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, n, 0) / QK_K, + src0_cal->hostPtr() + src0_cal->offset(b, h, m, 0) / QK_K); + if (support_bias) { + *dst->ptrAt(b, h, m, n) = MLLM_FP32_TO_FP16(tmp + bias->dataAt(0, 0, 0, n)); + } else { + *dst->ptrAt(b, h, m, n) = MLLM_FP32_TO_FP16(tmp); + } + }else{std::cout<<"Not support type [Matmul]"<dtype() == MLLM_TYPE_Q6_K); + assert(src0_->dtype() == MLLM_TYPE_F32); + Tensor src0_q8(src0_->shape()); + src0_q8.setBackend(src0_->backend()); + src0_q8.setDtype(MLLM_TYPE_Q8_K); + src0_q8.alloc(); + if (src0_->dimension() % QK_K == 0) { + for (int b = 0; b < src0_->batch(); b++) { + for (int h = 0; h < src0_->head(); h++) { +#pragma omp parallel for num_threads(thread_count) + for (int s = 0; s < src0_->sequence(); s++) { + quantize_row_q8_K(src0_->hostPtr() + src0_->offset(b, h, s, 0), + src0_q8.hostPtr() + src0_q8.offset(b, h, s, 0) / QK_K, + src0_->dimension()); + } + } + } + } else { + std::cout << "[ERROR]: " << src0_->dimension() << "%" << QK_K << "!=0" << std::endl; + assert(src0_->dimension() % QK_K == 0); + } + auto *src0 = &src0_q8; + assert(src0->dtype() == MLLM_TYPE_Q8_K); + int M = src0->sequence(); + int K = src0->dimension(); + int N = src1->sequence(); + Tensor *src0_cal = src0; + Tensor *src1_cal = src1; + const int64_t blck_0 = 16; + for (int b = 0; b < src0->batch(); b++) { + for (int h = 0; h < src0->head(); h++) { + const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : b; + const int h_1 = (src1->batch() == 1 && src1->head() == 1) ? 
0 : h; + for (int m = 0; m < M; m++) { + int use_N = N; + const int num_blocks = use_N / blck_0; + const int remainder = use_N % blck_0; +#pragma omp parallel for num_threads(thread_count) + for (int block = 0; block < num_blocks + 1; block++) { + for (int n = block * blck_0; n < (block + 1) * blck_0 & n < num_blocks * blck_0 + remainder; n++) { + if (dst->dtypeAt(n, h, m, n) == MLLM_TYPE_F32) { + vec_dot_q6_K_q8_K(K, dst->ptrAt(b, h, m, n), + src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, n, 0) / QK_K, + src0_cal->hostPtr() + src0_cal->offset(b, h, m, 0) / QK_K); + if (support_bias) { + *dst->ptrAt(b, h, m, n) += bias->dataAt(0, 0, 0, n); + } + } else if (dst->dtypeAt(n, h, m, n) == MLLM_TYPE_F16) { + float tmp = 0; + vec_dot_q6_K_q8_K(K, &tmp, + src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, n, 0) / QK_K, + src0_cal->hostPtr() + src0_cal->offset(b, h, m, 0) / QK_K); + + if (support_bias) { + *dst->ptrAt(b, h, m, n) = MLLM_FP32_TO_FP16(tmp + bias->dataAt(0, 0, 0, n)); + } else { + *dst->ptrAt(b, h, m, n) = MLLM_FP32_TO_FP16(tmp); + } + } else { + std::cout << "Not support tupe [Matmul]" << std::endl; + } + } + } + } + } + } + return MLLM_NO_ERROR; +} + diff --git a/src/backends/cpu/compute/MatmulElastic.hpp b/src/backends/cpu/compute/MatmulElastic.hpp new file mode 100644 index 00000000..828fe16e --- /dev/null +++ b/src/backends/cpu/compute/MatmulElastic.hpp @@ -0,0 +1,18 @@ +// +// Created by Rongjie Yi on 23-10-24. +// + +#ifndef MLLM_MATMUL_HPP +#define MLLM_MATMUL_HPP + + +#include "VecDot.hpp" +using namespace mllm; + +ErrorCode mat_mul_elastic_fp32(Tensor *src0, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, bool transpose0 = false, bool transpose1 = false, int thread_count=4); +ErrorCode mat_mul_elastic_fp32_fp16(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, bool transpose0 = false, bool transpose1 = false, int thread_count=4); +ErrorCode mat_mul_elastic_fp32_q4_0(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int thread_count=4); +ErrorCode mat_mul_elastic_fp32_q4_K(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int thread_count=4); +ErrorCode mat_mul_elastic_fp32_q6_K(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int thread_count=4); + +#endif // MLLM_MATMUL_HPP diff --git a/src/backends/cpu/compute/VecDot.cpp b/src/backends/cpu/compute/VecDot.cpp index d4210d68..b5270f7b 100644 --- a/src/backends/cpu/compute/VecDot.cpp +++ b/src/backends/cpu/compute/VecDot.cpp @@ -97,6 +97,7 @@ void vec_dot_fp32(const int n, float *__restrict s, const float *__restrict vx, vec_dot_fp32_arm(n, s, vx, vy); #endif } +/* void vec_dot_fp32(const float * __restrict src0, const float * __restrict src1, Tensor *dst, bool support_bias, Tensor *bias, int hid_len, int batch, int head, int src0_inf, int sec1_outf) { float value = 0; #ifdef __AVX2__ @@ -113,7 +114,7 @@ void vec_dot_fp32(const float * __restrict src0, const float * __restrict src1, } dst->setDataAt({batch, head, src0_inf, sec1_outf}, value); } - +*/ void vec_dot_fp16(const int n, float * __restrict s, const mllm_fp16_t * __restrict vx, const mllm_fp16_t * __restrict vy) { float sumf = 0.0; @@ -1344,3 +1345,71 @@ void vec_dot_q6_K_q8_K(const void * __restrict src0, const void * __restrict src dst->setDataAt({batch, head, src0_inf, sec1_outf}, value); } + + +void vec_dot_q8_0_q8_0(int n, float * __restrict s, const void * __restrict vx, const void * __restrict vy) { 
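+    // Dot product of two rows stored in q8_0 format: each block holds QK8_0
+    // int8 values plus an fp16 scale d, and the result accumulates
+    // (sum_j x.qs[j] * y.qs[j]) * d_x * d_y over all blocks.
+    // n must be a multiple of QK8_0; the NEON path below additionally
+    // assumes an even number of blocks (see the assert on nb).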
+ const int qk = QK8_0; + const int nb = n / qk; // number of blocks + + assert(n % qk == 0); + + const auto * __restrict x = static_cast(vx); + const auto * __restrict y = static_cast(vy); + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q8_0 * x0 = &x[i + 0]; + const block_q8_0 * x1 = &x[i + 1]; + const block_q8_0 * y0 = &y[i + 0]; + const block_q8_0 * y1 = &y[i + 1]; + + const int8x16_t x0_0 = vld1q_s8(x0->qs); + const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); + const int8x16_t x1_0 = vld1q_s8(x1->qs); + const int8x16_t x1_1 = vld1q_s8(x1->qs + 16); + + // load y + const int8x16_t y0_0 = vld1q_s8(y0->qs); + const int8x16_t y0_1 = vld1q_s8(y0->qs + 16); + const int8x16_t y1_0 = vld1q_s8(y1->qs); + const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + mllm_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), + mllm_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + mllm_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), + mllm_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__AVX2__) || defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + // Compute combined scale for the block + const __m256 d = _mm256_set1_ps(MLLM_FP16_TO_FP32(x[i].d) * MLLM_FP16_TO_FP32(y[i].d)); + __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs); + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + // Multiply q with scale and accumulate +#if defined(__AVX2__) + acc = _mm256_fmadd_ps( d, q, acc ); +#else + acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc ); +#endif + } + + *s = hsum_float_8(acc); +#endif +} diff --git a/src/backends/cpu/compute/VecDot.hpp b/src/backends/cpu/compute/VecDot.hpp index a8f7706c..1dad4968 100644 --- a/src/backends/cpu/compute/VecDot.hpp +++ b/src/backends/cpu/compute/VecDot.hpp @@ -335,11 +335,28 @@ static inline float hsum_float_8(const __m256 x) { } \ res = vaddvq_f32(x[0]); \ } + +#if !defined(__ARM_FEATURE_DOTPROD) + +inline static int32x4_t mllm_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) { + const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b)); + const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b)); + + return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))); +} + +#else + +#define mllm_vdotq_s32(a, b, c) vdotq_s32(a, b, c) + +#endif // !defined(__ARM_FEATURE_DOTPROD) + + #endif using namespace mllm; -void vec_dot_fp32(const float * __restrict src0, const float * __restrict src1, Tensor *dst, bool support_bias, Tensor *bias, int hid_len, int batch, int head, int src0_inf, int sec1_outf); +// void vec_dot_fp32(const float * __restrict src0, const float * __restrict src1, Tensor *dst, bool support_bias, Tensor *bias, int hid_len, int batch, int head, int src0_inf, int sec1_outf); void vec_dot_q4_0_q8_0(const void * __restrict src0, const void * __restrict src1, Tensor *dst, bool support_bias, Tensor *bias, int hid_len, int batch, int head, int src0_inf, int sec1_outf); void vec_dot_q4_K_q8_K(const void * __restrict src0, const void * __restrict src1, Tensor *dst, bool support_bias, Tensor *bias, int 
hid_len, int batch, int head, int src0_inf, int sec1_outf); void vec_dot_q6_K_q8_K(const void * __restrict src0, const void * __restrict src1, Tensor *dst, bool support_bias, Tensor *bias, int hid_len, int batch, int head, int src0_inf, int sec1_outf); @@ -350,5 +367,6 @@ void vec_dot_q6_K_q8_K(const int n, float * __restrict s, const void * __restric void vec_dot_q4_0_q8_0(const int n, float * __restrict s, const void * __restrict vx, const void * __restrict vy); void vec_dot_fp32(const int n, float * __restrict s, const float * __restrict vx, const float * __restrict vy); void vec_dot_fp16(const int n, float * __restrict s, const mllm_fp16_t * __restrict vx, const mllm_fp16_t * __restrict vy); +void vec_dot_q8_0_q8_0(int n, float * __restrict s, const void * __restrict vx, const void * __restrict vy); #endif // MLLM_VECDOT_HPP diff --git a/src/backends/cpu/type/type.cpp b/src/backends/cpu/compute/VecDotType.cpp similarity index 64% rename from src/backends/cpu/type/type.cpp rename to src/backends/cpu/compute/VecDotType.cpp index 77e30691..843c1a0f 100644 --- a/src/backends/cpu/type/type.cpp +++ b/src/backends/cpu/compute/VecDotType.cpp @@ -27,185 +27,13 @@ #include #include -#include "type.hpp" +#include "VecDotType.hpp" #include "Types.hpp" #include "quantize/Quantize.hpp" #include "quantize/QuantizeQ6.hpp" #include "compute/VecDot.hpp" -#ifdef __AVX2__ -static void vec_dot_fp32_avx2(const int n, float *__restrict s, const float *__restrict x, const float *__restrict y) { - float sumf = 0.0F; - const int np = (n & ~(MLLM_F32_STEP - 1)); - - MLLM_F32_VEC sum[MLLM_F32_ARR] = {MLLM_F32_VEC_ZERO}; - - MLLM_F32_VEC ax[MLLM_F32_ARR]; - MLLM_F32_VEC ay[MLLM_F32_ARR]; - - for (int i = 0; i < np; i += MLLM_F32_STEP) { - for (int j = 0; j < MLLM_F32_ARR; j++) { - ax[j] = MLLM_F32_VEC_LOAD(x + i + j * MLLM_F32_EPR); - ay[j] = MLLM_F32_VEC_LOAD(y + i + j * MLLM_F32_EPR); - - sum[j] = MLLM_F32_VEC_FMA(sum[j], ax[j], ay[j]); - } - } - - // reduce sum0..sum3 to sum0 - MLLM_F32_VEC_REDUCE(sumf, sum); - - // leftovers - for (int i = np; i < n; ++i) { - sumf += x[i] * y[i]; - } - - *s = sumf; -} -#endif - -#ifdef __ARM_NEON -static void vec_dot_fp32_arm(const int n, float *__restrict s, const float *__restrict x, const float *__restrict y) { - float sumf = 0.0F; - const int np = (n & ~(16 - 1)); - - F32_VEC sum[4] = {vdupq_n_f32(0.0F)}; - - F32_VEC ax[F32_ARR]; - F32_VEC ay[F32_ARR]; - - for (int i = 0; i < np; i += F32_STEP) { - for (int j = 0; j < F32_ARR; j++) { - ax[j] = vld1q_f32(x + i + j * F32_REG); - ay[j] = vld1q_f32(y + i + j * F32_REG); - sum[j] = vfmaq_f32(sum[j], ax[j], ay[j]); - // sum[j] = vmlaq_lane_f32(sum[j], ax[j], ay[0], - } - - } - - // reduce sum0..sum3 to sum0 - F32_VEC_REDUCE(sumf, sum); - - // leftovers - for (int i = np; i < n; ++i) { - sumf += x[i] * y[i]; - } - - *s = sumf; -} -#endif - -static void vec_dot_fp32_local(const int n, float *__restrict s, const float *__restrict vx, const float *__restrict vy) { -#ifdef __AVX2__ - vec_dot_fp32_avx2(n, s, vx, vy); -#elif defined(__ARM_NEON) - vec_dot_fp32_arm(n, s, vx, vy); -#endif -} -/* -void vec_dot_q8_0_q8_0(int n, float * __restrict s, const void * __restrict vx, const void * __restrict vy) { - const int qk = QK8_0; - const int nb = n / qk; // number of blocks - - assert(n % qk == 0); - - const auto * __restrict x = static_cast(vx); - const auto * __restrict y = static_cast(vy); -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - assert(nb % 2 == 0); // TODO: handle 
odd nb - - for (int i = 0; i < nb; i += 2) { - const block_q8_0 * x0 = &x[i + 0]; - const block_q8_0 * x1 = &x[i + 1]; - const block_q8_0 * y0 = &y[i + 0]; - const block_q8_0 * y1 = &y[i + 1]; - - const int8x16_t x0_0 = vld1q_s8(x0->qs); - const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); - const int8x16_t x1_0 = vld1q_s8(x1->qs); - const int8x16_t x1_1 = vld1q_s8(x1->qs + 16); - - // load y - const int8x16_t y0_0 = vld1q_s8(y0->qs); - const int8x16_t y0_1 = vld1q_s8(y0->qs + 16); - const int8x16_t y1_0 = vld1q_s8(y1->qs); - const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), - ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), - ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } - - *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); -#elif defined(__AVX2__) || defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (int i = 0; i < nb; ++i) { - // Compute combined scale for the block - const __m256 d = _mm256_set1_ps(MLLM_FP16_TO_FP32(x[i].d) * MLLM_FP16_TO_FP32(y[i].d)); - __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs); - __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_i8_pairs_float(bx, by); - - // Multiply q with scale and accumulate -#if defined(__AVX2__) - acc = _mm256_fmadd_ps( d, q, acc ); -#else - acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc ); -#endif - } - - *s = hsum_float_8(acc); -#elif defined(__riscv_v_intrinsic) - float sumf = 0.0; - size_t vl = __riscv_vsetvl_e8m1(qk); - - for (int i = 0; i < nb; i++) { - // load elements - vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl); - vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl); - - vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl); - - vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl); - vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl); - - int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); - - sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); - } - - *s = sumf; -#else - // scalar - float sumf = 0.0; - - for (int i = 0; i < nb; i++) { - int sumi = 0; - - for (int j = 0; j < qk; j++) { - sumi += x[i].qs[j]*y[i].qs[j]; - } - - sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); - } - - *s = sumf; -#endif -} -*/ void fp32_add_row_to(int n, const float * MLLM_RESTRICT src, float * MLLM_RESTRICT dst, float alpha){ int i = 0; #ifdef __AVX2__ @@ -396,7 +224,7 @@ type_traits_t type_traits[] = { .blck_size = 1, .to_float = nullptr, .from_float = nullptr, - .vec_dot = (mllm_vec_dot_func)vec_dot_fp32_local, + .vec_dot = (mllm_vec_dot_func)vec_dot_fp32, .vec_dot_type = MLLM_TYPE_F32, .add_row_to = (mllm_vec_add_row_func)fp32_add_row_to, }, @@ -425,15 +253,15 @@ type_traits_t type_traits[] = { {}, {}, {}, - // /*[MLLM_TYPE_Q8_0] = */{ - // .size = sizeof(block_q8_0), - // .blck_size = QK8_0, - // .to_float = (mllm_to_float_func) dequantize_row_q8_0, - // .from_float = (mllm_from_float_func) quantize_row_q8_0, - // .vec_dot = (mllm_vec_dot_func) vec_dot_q8_0_q8_0, - // .vec_dot_type = MLLM_TYPE_Q8_0, - // .add_row_to = (mllm_vec_add_row_func)q8_0_add_row_to, - // }, + /*[MLLM_TYPE_Q8_0] = */{ + .size = sizeof(block_q8_0), + .blck_size = QK8_0, + .to_float = (mllm_to_float_func) dequantize_row_q8_0, + 
.from_float = (mllm_from_float_func) quantize_row_q8_0, + .vec_dot = (mllm_vec_dot_func) vec_dot_q8_0_q8_0, + .vec_dot_type = MLLM_TYPE_Q8_0, + .add_row_to = (mllm_vec_add_row_func)q8_0_add_row_to, + }, /*[MLLM_TYPE_Q8_1] = */{}, {}, {}, diff --git a/src/backends/cpu/type/type.hpp b/src/backends/cpu/compute/VecDotType.hpp similarity index 100% rename from src/backends/cpu/type/type.hpp rename to src/backends/cpu/compute/VecDotType.hpp From 689d30e0f2a5104f64b2980fd5e63c407f6cc17f Mon Sep 17 00:00:00 2001 From: yirongjie Date: Tue, 16 Jul 2024 17:15:14 +0800 Subject: [PATCH 2/7] fix: Typo --- src/backends/cpu/compute/VecDot.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backends/cpu/compute/VecDot.cpp b/src/backends/cpu/compute/VecDot.cpp index b5270f7b..2fcca52f 100644 --- a/src/backends/cpu/compute/VecDot.cpp +++ b/src/backends/cpu/compute/VecDot.cpp @@ -1381,11 +1381,11 @@ void vec_dot_q8_0_q8_0(int n, float * __restrict s, const void * __restrict vx, sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( mllm_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), - mllm_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + mllm_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), MLLM_FP16_TO_FP32(x0->d)*MLLM_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( mllm_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), - mllm_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + mllm_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), MLLM_FP16_TO_FP32(x1->d)*MLLM_FP16_TO_FP32(y1->d)); } *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); From a3eb979f7deb604f8ada8880d3aa5a32c54d9023 Mon Sep 17 00:00:00 2001 From: yirongjie Date: Wed, 17 Jul 2024 02:25:59 +0000 Subject: [PATCH 3/7] feat: add demo eladtic_llama --- CMakeLists.txt | 14 +++ examples/demo_elastic_llama.cpp | 58 ++++++++++ include/OpDefined.hpp | 4 + src/Layer.hpp | 77 ++++++++++++++ src/backends/cpu/CPUBackend.cpp | 2 + src/backends/cpu/CPUElasticLinear.cpp | 111 ++++++++++++++++++++ src/backends/cpu/CPUElasticLinear.hpp | 47 +++++++++ src/backends/cpu/compute/MatmulElastic.cpp | 58 +++++----- src/backends/cpu/compute/MatmulElastic.hpp | 10 +- src/models/llama/modeling_elastic_llama.hpp | 94 +++++++++++++++++ 10 files changed, 443 insertions(+), 32 deletions(-) create mode 100644 examples/demo_elastic_llama.cpp create mode 100644 src/backends/cpu/CPUElasticLinear.cpp create mode 100644 src/backends/cpu/CPUElasticLinear.hpp create mode 100644 src/models/llama/modeling_elastic_llama.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 22d2b686..739bf75d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -424,6 +424,20 @@ else () target_link_libraries(demo_sparse_llama MLLM_CPU) endif () +add_executable(demo_elastic_llama ${PROJECT_SOURCE_DIR}/examples/demo_elastic_llama.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC} # ${DIR_SRC_QUANT} + src/tokenizers/Tokenizer.cpp + src/tokenizers/Tokenizer.hpp + src/tokenizers/BPE/Bpe.cpp + src/tokenizers/BPE/Bpe.hpp +) +# target_compile_definitions(demo_elastic_llama PRIVATE MLLM_QKK_64) +if (ARM AND NOT APK) + target_compile_options(demo_elastic_llama PRIVATE -fopenmp) + target_link_libraries(demo_elastic_llama PUBLIC MLLM_CPU -fopenmp -static-openmp) +else () + target_link_libraries(demo_elastic_llama MLLM_CPU) +endif () + add_executable(demo_llava ${PROJECT_SOURCE_DIR}/examples/demo_llava.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC} src/tokenizers/Tokenizer.cpp 
src/tokenizers/BPE/Bpe.cpp diff --git a/examples/demo_elastic_llama.cpp b/examples/demo_elastic_llama.cpp new file mode 100644 index 00000000..94e90d07 --- /dev/null +++ b/examples/demo_elastic_llama.cpp @@ -0,0 +1,58 @@ +// +// Created by Rongjie Yi on 2024/1/26 0026. +// + +#include +#include "cmdline.h" +#include "models/llama/modeling_elastic_llama.hpp" +#include "models/llama/tokenization_llama.hpp" +#include "processor/PostProcess.hpp" + + +using namespace mllm; + +int main(int argc, char **argv) { + cmdline::parser cmdParser; + cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama_vocab.mllm"); + cmdParser.add("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_k.mllm"); + cmdParser.add("limits", 'l', "max KV cache size", false, 400); + cmdParser.add("thread", 't', "num of threads", false, 4); + cmdParser.parse_check(argc, argv); + + string vocab_path = cmdParser.get("vocab"); + string model_path = cmdParser.get("model"); + int tokens_limit = cmdParser.get("limits"); + CPUBackend::cpu_threads = cmdParser.get("thread"); + + auto tokenizer = LLaMATokenizer(vocab_path); + + LLaMAConfig config(tokens_limit, "7B", LLAMAROPE); + auto model = ElasticLLaMAModel(config); + model.load(model_path); + + vector in_strs = { + " Hello, who are you?", + " What can you do?", + "Please introduce Beijing University of Posts and Telecommunications."}; + + for (int i = 0; i < in_strs.size(); ++i) { + auto in_str = in_strs[i]; + auto input_tensor = tokenizer.tokenize(in_str, i); + std::cout << "[Q] " << in_str << std::endl; + std::cout << "[A] " << std::flush; + for (int step = 0; step < 1; step++) { + auto result = model({input_tensor}); + auto outputs = tokenizer.detokenize(result[0]); + auto out_string = outputs.first; + auto out_token = outputs.second; + if (out_token == 2) { + break; + } + std::cout << out_string << std::flush; + chatPostProcessing(out_token, input_tensor, {}); + } + printf("\n"); + } + + return 0; +} \ No newline at end of file diff --git a/include/OpDefined.hpp b/include/OpDefined.hpp index 0e36cf2c..a1ebdb4c 100644 --- a/include/OpDefined.hpp +++ b/include/OpDefined.hpp @@ -48,6 +48,7 @@ enum OpType { PREDICTOR, SPARSELINEAR, SPARSEIDLINEAR, + ELASTICLINEAR, OP_NUM }; @@ -89,6 +90,9 @@ static const vector OpNames = { "Range", "Where", "Replace", + "SparseLinear", + "SparseIdLinear", + "ElasticLinear", "OP_NUM"}; enum TensorFuncType { diff --git a/src/Layer.hpp b/src/Layer.hpp index 959f6b8e..956f74e5 100644 --- a/src/Layer.hpp +++ b/src/Layer.hpp @@ -5,6 +5,8 @@ #ifndef OPERATION_H #define OPERATION_H +#include +#include #include #include "Tensor.hpp" @@ -49,6 +51,13 @@ class Layer { return _3I1O_OP(input0, input1, input2); } + Tensor &operator()(Tensor &input0, int activate_input_dim, int activate_output_dim) { + auto activate_input_dim_tensor = Tensor(1, 1, 1, 1, backend_, true); + activate_input_dim_tensor.setDataAt(0,0,0,0,(float)activate_input_dim); + auto activate_output_dim_tensor = Tensor(1, 1, 1, 1, backend_, true); + activate_output_dim_tensor.setDataAt(0,0,0,0,(float)activate_output_dim); + return _3I1O_only1map_OP(input0, activate_input_dim_tensor, activate_output_dim_tensor); + } private: std::string name_num_to_X(const std::string &input_string) { @@ -344,6 +353,62 @@ class Layer { return Tensor::gph_[next_name]; } } + Tensor &_3I1O_only1map_OP(Tensor &input0, Tensor &input1, Tensor &input2) { + Module::runlistIdx = saved_list_idx; + if (INIT_OP()) { + return input0; + } else { + string layer_next_name = 
"out-" + op_->name(); + if (Tensor::gph_.find(input0.name()) != Tensor::gph_.end()) { + Tensor::gph_[input0.name()].status() = input0.status(); + } + switch (input0.status()) { + case TENSOR_STATIC_INIT: { + if (Tensor::gph_.find(input0.name()) == Tensor::gph_.end() || input0.count() != Tensor::gph_[input0.name()].count()) { + Tensor::gph_[input0.name()] = input0; + Tensor::gph_[input0.name()].setName(input0.name()); + } + if (layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) { + layername_2_tensorname[layer_next_name] = name_num_to_X(layer_next_name); + } + auto next_name = layername_2_tensorname[layer_next_name]; + if (Tensor::gph_.find(next_name) == Tensor::gph_.end()) { + Tensor::gph_[next_name] = Tensor(backend_); + Tensor::gph_[next_name].setName(next_name); + } + vector> shared_inputs{ + std::shared_ptr(&Tensor::gph_[input0.name()], [](Tensor *) {}), + std::shared_ptr(&input1, [](Tensor *) {}), + std::shared_ptr(&input2, [](Tensor *) {})}; + vector> shared_outputs{std::shared_ptr(&Tensor::gph_[next_name], [](Tensor *) {})}; + op_->reshape(shared_inputs, shared_outputs); + op_->setUp(shared_inputs, shared_outputs); + assert(Tensor::gph_[next_name].hostPtr() != nullptr); + break; + } + case TENSOR_STATIC_READY: { + auto next_name = layername_2_tensorname[layer_next_name]; + vector> shared_inputs{ + std::shared_ptr(&Tensor::gph_[input0.name()], [](Tensor *) {}), + std::shared_ptr(&input1, [](Tensor *) {}), + std::shared_ptr(&input2, [](Tensor *) {})}; + vector> shared_outputs{std::shared_ptr(&Tensor::gph_[next_name], [](Tensor *) {})}; + op_->execute(shared_inputs, shared_outputs); + assert(Tensor::gph_[next_name].hostPtr() != nullptr); + break; + } + default: { + break; + } + } + auto next_name = layername_2_tensorname[layer_next_name]; + Tensor::gph_[next_name].status() = Tensor::gph_[input0.name()].status(); + if(saveNDataFlag){ + Tensor::gph_[next_name].saveNData(layer_next_name); + } + return Tensor::gph_[next_name]; + } + } Tensor &_0I1O_OP() { Module::runlistIdx = saved_list_idx; if (INIT_OP()) { @@ -525,6 +590,18 @@ class Predictor final : public Layer { // no need to defined a new operator() function, just use the default one }; +class ElasticLinear final : public Layer { +public: + explicit ElasticLinear(int in_features, int out_features, bool bias, std::string name) { + param_["in_features"] = in_features; + param_["out_features"] = out_features; + param_["bias"] = (float)bias; + init(std::move(name), OpType::ELASTICLINEAR); + } + // Use: Tensor &operator()(Tensor &input0, int activate_input_dim, int activate_output_dim) { +}; + + class SiLU final : public Layer { public: SiLU() = default; diff --git a/src/backends/cpu/CPUBackend.cpp b/src/backends/cpu/CPUBackend.cpp index 6f6f8488..fdad5ad6 100644 --- a/src/backends/cpu/CPUBackend.cpp +++ b/src/backends/cpu/CPUBackend.cpp @@ -40,6 +40,7 @@ #include "CPUPredictor.hpp" #include "CPUSparseIdLinear.hpp" #include "CPUSparseLinear.hpp" +#include "CPUElasticLinear.hpp" #include "CPUTensorFunction.hpp" namespace mllm { @@ -99,6 +100,7 @@ void CPUBackend::registerOps() { addCreator(PREDICTOR, (CPUBackend::Creator *)(new CPUPredictorCreator())); addCreator(SPARSELINEAR, (CPUBackend::Creator *)(new CPUSparseLinearCreator())); addCreator(SPARSEIDLINEAR, (CPUBackend::Creator *)(new CPUSparseIdLinearCreator())); + addCreator(ELASTICLINEAR, (CPUBackend::Creator *)(new CPUElasticLinearCreator())); } TensorFunction *CPUBackend::funcCreate(const TensorFuncType type) { diff --git 
a/src/backends/cpu/CPUElasticLinear.cpp b/src/backends/cpu/CPUElasticLinear.cpp new file mode 100644 index 00000000..85a8c603 --- /dev/null +++ b/src/backends/cpu/CPUElasticLinear.cpp @@ -0,0 +1,111 @@ + +#include "CPUElasticLinear.hpp" +#include "compute/MatmulElastic.hpp" + +namespace mllm { + +CPUElasticLinear::CPUElasticLinear(Backend *bn, string opName, int in_features, int out_features, bool bias, int threadCount) : thread_count(threadCount), + Op(bn, opName) { + in_features_ = in_features; + out_features_ = out_features; + support_bias_ = bias; + thread_count = threadCount; + weight_.setBackend(bn); + bias_.setBackend(bn); +} + +ErrorCode CPUElasticLinear::reshape(vector> inputs, vector> outputs) { + //std::cout << name() << " CPUElasticLinear reshape" << std::endl; + assert(inputs.size() == 3); + assert(outputs.size() == 1); + if(inputs[0]->count() == 0) { + outputs[0]->reshape(0,0,0,0); + return Op::reshape(inputs, outputs); + } + // N | C | H | W + // ----------------------------------------------- + // 1 |out_channel | in_channel | 1 + // |out_features| in_features | + // ----------------------------------------------- + // batch |in_channel | seq_len | 1 + // |in_features | inputs[0]->sequence() | + // ----------------------------------------------- + // batch |out_channel | seq_len | 1 + // |out_features| inputs[0]->sequence() | + assert(inputs[0]->head() == 1); + assert(in_features_ == inputs[0]->dimension()); + outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence(), out_features_); + //outputs[0]->setDtype(activationDtype()); + return Op::reshape(inputs, outputs); +} + +ErrorCode CPUElasticLinear::load(AbstructLoader &loader) { + //std::cout << name() << " CPUElasticLinear load" << std::endl; + weight_.setName(name() + ".weight"); + weight_.reshape(1, 1, out_features_, in_features_); + if (loader.getDataType(weight_.name()) != MLLM_TYPE_COUNT) { + weight_.setDtype(loader.getDataType(weight_.name())); + weight_.alloc(); + loader.load(&weight_); + } else { + weight_.setDtype(MLLM_TYPE_F32); + weight_.alloc(); + } + if (support_bias_) { + bias_.setName(name() + ".bias"); + bias_.reshape(1, 1, 1, out_features_); + if (loader.getDataType(bias_.name()) != MLLM_TYPE_COUNT) { + bias_.setDtype(loader.getDataType(bias_.name())); + bias_.alloc(); + loader.load(&bias_); + } else { + bias_.setDtype(MLLM_TYPE_F32); + bias_.alloc(); + } + } + return Op::load(loader); +} + +ErrorCode CPUElasticLinear::execute(vector> inputs, vector> outputs) { + int activate_input_dim = (int)inputs[1]->dataAt(0,0,0,0); + int activate_output_dim = (int)inputs[2]->dataAt(0,0,0,0); + +// auto start = mllm::mllm_time_us(); + if(inputs[0]->count() == 0) { + return Op::execute(inputs, outputs); + } + // std::cout << name() << " CPUElasticLinear()" << std::endl; + switch (weight_.dtype()) { + case MLLM_TYPE_F32: { + mat_mul_elastic_fp32(inputs[0].get(), &weight_, outputs[0].get(), support_bias_, &bias_, activate_input_dim,activate_output_dim, false, true, thread_count); + break; + } + case MLLM_TYPE_F16: break; + case MLLM_TYPE_Q4_0: { + mat_mul_elastic_fp32_q4_0(inputs[0].get(), &weight_, outputs[0].get(), support_bias_, &bias_, activate_input_dim,activate_output_dim, thread_count); + break; + } + case MLLM_TYPE_Q4_K: { + mat_mul_elastic_fp32_q4_K(inputs[0].get(), &weight_, outputs[0].get(), support_bias_, &bias_, activate_input_dim,activate_output_dim, thread_count); + break; + } + case MLLM_TYPE_Q6_K: { + mat_mul_elastic_fp32_q6_K(inputs[0].get(), &weight_, outputs[0].get(), 
support_bias_, &bias_, activate_input_dim,activate_output_dim, thread_count); + break; + } + default: + break; + } +// auto end = mllm::mllm_time_us(); +// printf("exec time: %ld us\n", end - start); + return Op::execute(inputs, outputs); +} +ErrorCode CPUElasticLinear::free(vector> inputs, vector> outputs) { + weight_.free(); + if (support_bias_) { + bias_.free(); + } + return Op::free(inputs, outputs); +} + +} // namespace mllm diff --git a/src/backends/cpu/CPUElasticLinear.hpp b/src/backends/cpu/CPUElasticLinear.hpp new file mode 100644 index 00000000..926fa64a --- /dev/null +++ b/src/backends/cpu/CPUElasticLinear.hpp @@ -0,0 +1,47 @@ +#ifndef MLLM_CPUELASTICLINEAR_H +#define MLLM_CPUELASTICLINEAR_H + +#include "Op.hpp" +#include "CPUBackend.hpp" + +namespace mllm { + +class Tensor; +class CPUElasticLinear final : public Op { +public: + CPUElasticLinear(Backend *bn, string opName, int in_features, int out_features, bool bias, int threadCount); + virtual ~CPUElasticLinear() = default; + virtual ErrorCode reshape(vector> inputs, vector> outputs) override; + virtual ErrorCode load(AbstructLoader &loader) override; + virtual ErrorCode execute(vector> inputs, vector> outputs) override; + virtual ErrorCode free(vector> inputs, vector> outputs) override; + + Tensor &weight() { + return weight_; + } + Tensor &bias() { + return bias_; + } + +private: + int in_features_; + int out_features_; + bool support_bias_; + int thread_count = 4; + Tensor weight_; + Tensor bias_; +}; + +class CPUElasticLinearCreator : public CPUBackend::Creator { +public: + virtual Op *create(OpParam op_param, Backend *bn, string name, int threadCount) const { + int in_features = op_param["in_features"]; + int out_features = op_param["out_features"]; + int bias = op_param["bias"]; + return new CPUElasticLinear(bn, name, in_features, out_features, (bool)bias, threadCount); + } +}; + +} // namespace mllm + +#endif // MLLM_CPUELASTICLINEAR_H \ No newline at end of file diff --git a/src/backends/cpu/compute/MatmulElastic.cpp b/src/backends/cpu/compute/MatmulElastic.cpp index 70049cb4..884cd5a6 100644 --- a/src/backends/cpu/compute/MatmulElastic.cpp +++ b/src/backends/cpu/compute/MatmulElastic.cpp @@ -6,10 +6,14 @@ #include -ErrorCode mat_mul_elastic_fp32(Tensor *src0, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, bool transpose0, bool transpose1, int thread_count) { +ErrorCode mat_mul_elastic_fp32(Tensor *src0, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, + int activate_input_dim, int activate_output_dim, + bool transpose0, bool transpose1, int thread_count) { const int M = transpose0 ? src0->dimension() : src0->sequence(); const int K = transpose0 ? src0->sequence() : src0->dimension(); const int N = transpose1 ? src1->sequence() : src1->dimension(); + int use_N = (activate_output_dim == -1) ? N : activate_output_dim; + int use_K = (activate_input_dim == -1) ? K : activate_input_dim; Tensor *src0_cal = src0; Tensor *src1_cal = src1; const int64_t blck_0 = 16; @@ -18,7 +22,6 @@ ErrorCode mat_mul_elastic_fp32(Tensor *src0, Tensor *src1, Tensor *dst, bool sup const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : b; const int h_1 = (src1->batch() == 1 && src1->head() == 1) ? 
0 : h; for (int m = 0; m < M; m++) { - int use_N = N; const int num_blocks = use_N / blck_0; const int remainder = use_N % blck_0; #pragma omp parallel for num_threads(thread_count) @@ -34,7 +37,7 @@ ErrorCode mat_mul_elastic_fp32(Tensor *src0, Tensor *src1, Tensor *dst, bool sup s_1 = 0; d_1 = n; s_0 = 0; d_0 = m; } if(dst->dtypeAt(b,h,m,n) == MLLM_TYPE_F32) { - vec_dot_fp32(K, dst->ptrAt(b, h, m, n), + vec_dot_fp32(use_K, dst->ptrAt(b, h, m, n), src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, s_1, d_1), src0_cal->hostPtr() + src0_cal->offset(b, h, s_0, d_0)); if (support_bias) { @@ -42,7 +45,7 @@ ErrorCode mat_mul_elastic_fp32(Tensor *src0, Tensor *src1, Tensor *dst, bool sup } }else if (dst->dtypeAt(b,h,m,n) == MLLM_TYPE_F16) { float tmp = 0; - vec_dot_fp32(K, &tmp, + vec_dot_fp32(use_K, &tmp, src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, s_1, d_1), src0_cal->hostPtr() + src0_cal->offset(b, h, s_0, d_0)); if (support_bias) { @@ -59,7 +62,9 @@ ErrorCode mat_mul_elastic_fp32(Tensor *src0, Tensor *src1, Tensor *dst, bool sup return MLLM_NO_ERROR; } -ErrorCode mat_mul_elastic_fp32_fp16(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, bool transpose0, bool transpose1, int thread_count) { +ErrorCode mat_mul_elastic_fp32_fp16(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, + int activate_input_dim, int activate_output_dim, + bool transpose0, bool transpose1, int thread_count) { assert(src1->dtype() == MLLM_TYPE_F16); assert(src0_->dtype() == MLLM_TYPE_F32); Tensor src0_qf16(src0_->shape()); @@ -77,17 +82,11 @@ ErrorCode mat_mul_elastic_fp32_fp16(Tensor *src0_, Tensor *src1, Tensor *dst, bo } } auto *src0 = &src0_qf16; - // for(int b=0; bdimension(); b++) { - // std::cout<ptrAt(0, 0, 0, b))<<" "; - // } - // std::cout<dimension(); b++) { - // std::cout<ptrAt(0, 0, 0, b))<<" "; - // } - // std::cout<dimension() : src0->sequence(); const int K = transpose0 ? src0->sequence() : src0->dimension(); const int N = transpose1 ? src1->sequence() : src1->dimension(); + int use_N = (activate_output_dim == -1) ? N : activate_output_dim; + int use_K = (activate_input_dim == -1) ? K : activate_input_dim; Tensor *src0_cal = src0; Tensor *src1_cal = src1; const int64_t blck_0 = 16; @@ -96,7 +95,6 @@ ErrorCode mat_mul_elastic_fp32_fp16(Tensor *src0_, Tensor *src1, Tensor *dst, bo const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : b; const int h_1 = (src1->batch() == 1 && src1->head() == 1) ? 
0 : h; for (int m = 0; m < M; m++) { - int use_N = N; const int num_blocks = use_N / blck_0; const int remainder = use_N % blck_0; #pragma omp parallel for num_threads(thread_count) @@ -111,7 +109,7 @@ ErrorCode mat_mul_elastic_fp32_fp16(Tensor *src0_, Tensor *src1, Tensor *dst, bo } else { s_1 = 0; d_1 = n; s_0 = 0; d_0 = m; } - vec_dot_fp16(K, dst->ptrAt(b, h, m, n), + vec_dot_fp16(use_K, dst->ptrAt(b, h, m, n), src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, s_1, d_1), src0_cal->hostPtr() + src0_cal->offset(b, h, s_0, d_0)); if (support_bias) { @@ -125,7 +123,8 @@ ErrorCode mat_mul_elastic_fp32_fp16(Tensor *src0_, Tensor *src1, Tensor *dst, bo return MLLM_NO_ERROR; } -ErrorCode mat_mul_elastic_fp32_q4_0(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, int thread_count) { +ErrorCode mat_mul_elastic_fp32_q4_0(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, + int activate_input_dim, int activate_output_dim, int thread_count) { assert(src1->dtype() == MLLM_TYPE_Q4_0); assert(src0_->dtype() == MLLM_TYPE_F32); Tensor src0_q8(src0_->shape()); @@ -152,6 +151,8 @@ ErrorCode mat_mul_elastic_fp32_q4_0(Tensor *src0_, Tensor *src1, Tensor *dst, bo int M = src0->sequence(); int K = src0->dimension(); int N = src1->sequence(); + int use_N = (activate_output_dim == -1) ? N : activate_output_dim; + int use_K = (activate_input_dim == -1) ? K : activate_input_dim; Tensor *src0_cal = src0; Tensor *src1_cal = src1; const int64_t blck_0 = 16; @@ -160,13 +161,12 @@ ErrorCode mat_mul_elastic_fp32_q4_0(Tensor *src0_, Tensor *src1, Tensor *dst, bo const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : b; const int h_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : h; for (int m = 0; m < M; m++) { - int use_N = N; const int num_blocks = use_N / blck_0; const int remainder = use_N % blck_0; #pragma omp parallel for num_threads(thread_count) for (int block = 0; block < num_blocks + 1; block++) { for (int n = block * blck_0; n < (block + 1) * blck_0 & n < num_blocks * blck_0 + remainder; n++) { - vec_dot_q4_0_q8_0(K, dst->ptrAt(b, h, m, n), + vec_dot_q4_0_q8_0(use_K, dst->ptrAt(b, h, m, n), src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, n, 0) / QK4_0, src0_cal->hostPtr() + src0_cal->offset(b, h, m, 0) / QK8_0); if (support_bias) { @@ -180,7 +180,8 @@ ErrorCode mat_mul_elastic_fp32_q4_0(Tensor *src0_, Tensor *src1, Tensor *dst, bo return MLLM_NO_ERROR; } -ErrorCode mat_mul_elastic_fp32_q4_K(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, int thread_count) { +ErrorCode mat_mul_elastic_fp32_q4_K(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, + int activate_input_dim, int activate_output_dim, int thread_count) { assert(src1->dtype() == MLLM_TYPE_Q4_K); assert(src0_->dtype() == MLLM_TYPE_F32); Tensor src0_q8(src0_->shape()); @@ -206,7 +207,9 @@ ErrorCode mat_mul_elastic_fp32_q4_K(Tensor *src0_, Tensor *src1, Tensor *dst, bo assert(src0->dtype() == MLLM_TYPE_Q8_K); int M = src0->sequence(); int K = src0->dimension(); - int N = src1->sequence(); + int N = src1->sequence(); + int use_N = (activate_output_dim == -1) ? N : activate_output_dim; + int use_K = (activate_input_dim == -1) ? K : activate_input_dim; Tensor *src0_cal = src0; Tensor *src1_cal = src1; const int64_t blck_0 = 16; @@ -216,14 +219,13 @@ ErrorCode mat_mul_elastic_fp32_q4_K(Tensor *src0_, Tensor *src1, Tensor *dst, bo const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 
0 : b; const int h_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : h; for (int m = 0; m < M; m++) { - int use_N = N; const int num_blocks = use_N / blck_0; const int remainder = use_N % blck_0; #pragma omp parallel for num_threads(thread_count) for (int block = 0; block < num_blocks + 1; block++) { for (int n = block * blck_0; n < (block + 1) * blck_0 & n < num_blocks * blck_0 + remainder; n++) { if(dst->dtypeAt(b,h,m,n) == MLLM_TYPE_F32) { - vec_dot_q4_K_q8_K(K, dst->ptrAt(b, h, m, n), + vec_dot_q4_K_q8_K(use_K, dst->ptrAt(b, h, m, n), src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, n, 0) / QK_K, src0_cal->hostPtr() + src0_cal->offset(b, h, m, 0) / QK_K); if (support_bias) { @@ -231,7 +233,7 @@ ErrorCode mat_mul_elastic_fp32_q4_K(Tensor *src0_, Tensor *src1, Tensor *dst, bo } } else if (dst->dtypeAt(b,h,m,n) == MLLM_TYPE_F16) { float tmp = 0; - vec_dot_q4_K_q8_K(K, &tmp, + vec_dot_q4_K_q8_K(use_K, &tmp, src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, n, 0) / QK_K, src0_cal->hostPtr() + src0_cal->offset(b, h, m, 0) / QK_K); if (support_bias) { @@ -248,7 +250,8 @@ ErrorCode mat_mul_elastic_fp32_q4_K(Tensor *src0_, Tensor *src1, Tensor *dst, bo return MLLM_NO_ERROR; } -ErrorCode mat_mul_elastic_fp32_q6_K(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, int thread_count) { +ErrorCode mat_mul_elastic_fp32_q6_K(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, + int activate_input_dim, int activate_output_dim, int thread_count) { assert(src1->dtype() == MLLM_TYPE_Q6_K); assert(src0_->dtype() == MLLM_TYPE_F32); Tensor src0_q8(src0_->shape()); @@ -275,6 +278,8 @@ ErrorCode mat_mul_elastic_fp32_q6_K(Tensor *src0_, Tensor *src1, Tensor *dst, bo int M = src0->sequence(); int K = src0->dimension(); int N = src1->sequence(); + int use_N = (activate_output_dim == -1) ? N : activate_output_dim; + int use_K = (activate_input_dim == -1) ? K : activate_input_dim; Tensor *src0_cal = src0; Tensor *src1_cal = src1; const int64_t blck_0 = 16; @@ -283,14 +288,13 @@ ErrorCode mat_mul_elastic_fp32_q6_K(Tensor *src0_, Tensor *src1, Tensor *dst, bo const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : b; const int h_1 = (src1->batch() == 1 && src1->head() == 1) ? 
0 : h; for (int m = 0; m < M; m++) { - int use_N = N; const int num_blocks = use_N / blck_0; const int remainder = use_N % blck_0; #pragma omp parallel for num_threads(thread_count) for (int block = 0; block < num_blocks + 1; block++) { for (int n = block * blck_0; n < (block + 1) * blck_0 & n < num_blocks * blck_0 + remainder; n++) { if (dst->dtypeAt(n, h, m, n) == MLLM_TYPE_F32) { - vec_dot_q6_K_q8_K(K, dst->ptrAt(b, h, m, n), + vec_dot_q6_K_q8_K(use_K, dst->ptrAt(b, h, m, n), src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, n, 0) / QK_K, src0_cal->hostPtr() + src0_cal->offset(b, h, m, 0) / QK_K); if (support_bias) { @@ -298,7 +302,7 @@ ErrorCode mat_mul_elastic_fp32_q6_K(Tensor *src0_, Tensor *src1, Tensor *dst, bo } } else if (dst->dtypeAt(n, h, m, n) == MLLM_TYPE_F16) { float tmp = 0; - vec_dot_q6_K_q8_K(K, &tmp, + vec_dot_q6_K_q8_K(use_K, &tmp, src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, n, 0) / QK_K, src0_cal->hostPtr() + src0_cal->offset(b, h, m, 0) / QK_K); diff --git a/src/backends/cpu/compute/MatmulElastic.hpp b/src/backends/cpu/compute/MatmulElastic.hpp index 828fe16e..cd4bef9d 100644 --- a/src/backends/cpu/compute/MatmulElastic.hpp +++ b/src/backends/cpu/compute/MatmulElastic.hpp @@ -9,10 +9,10 @@ #include "VecDot.hpp" using namespace mllm; -ErrorCode mat_mul_elastic_fp32(Tensor *src0, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, bool transpose0 = false, bool transpose1 = false, int thread_count=4); -ErrorCode mat_mul_elastic_fp32_fp16(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, bool transpose0 = false, bool transpose1 = false, int thread_count=4); -ErrorCode mat_mul_elastic_fp32_q4_0(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int thread_count=4); -ErrorCode mat_mul_elastic_fp32_q4_K(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int thread_count=4); -ErrorCode mat_mul_elastic_fp32_q6_K(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int thread_count=4); +ErrorCode mat_mul_elastic_fp32(Tensor *src0, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int activate_input_dim=-1, int activate_output_dim=-1, bool transpose0 = false, bool transpose1 = false, int thread_count=4); +ErrorCode mat_mul_elastic_fp32_fp16(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int activate_input_dim=-1, int activate_output_dim=-1,bool transpose0 = false, bool transpose1 = false, int thread_count=4); +ErrorCode mat_mul_elastic_fp32_q4_0(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int activate_input_dim=-1, int activate_output_dim=-1,int thread_count=4); +ErrorCode mat_mul_elastic_fp32_q4_K(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int activate_input_dim=-1, int activate_output_dim=-1,int thread_count=4); +ErrorCode mat_mul_elastic_fp32_q6_K(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int activate_input_dim=-1, int activate_output_dim=-1,int thread_count=4); #endif // MLLM_MATMUL_HPP diff --git a/src/models/llama/modeling_elastic_llama.hpp b/src/models/llama/modeling_elastic_llama.hpp new file mode 100644 index 00000000..a3114ba2 --- /dev/null +++ b/src/models/llama/modeling_elastic_llama.hpp @@ -0,0 +1,94 @@ +// +// Created by Rongjie Yi on 2024/2/4 0004. 
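+//
+// Elastic LLaMA: LLaMA blocks whose feed-forward projections are
+// ElasticLinear layers, so the activated width can be chosen per call
+// (-1 keeps the full dimension).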
+// + +#ifndef MODELING_LLAMA_HPP +#define MODELING_LLAMA_HPP + +#include "Layer.hpp" +#include "Module.hpp" +#include "configuration_llama.hpp" +#include "models/transformer/modeling_transformer.hpp" + +using namespace mllm; + +class ElasticLLaMAMLP final : public Module { + Layer gate_proj; + Layer silu; + Layer up_proj; + Layer down_proj; + +public: + ElasticLLaMAMLP() = default; + ElasticLLaMAMLP(int hidden_dim, int ffn_hidden, const LLaMANameConfig &names, const string &base_name) { + gate_proj = ElasticLinear(hidden_dim, ffn_hidden, false, base_name + names._gate_proj_name); + silu = SiLU(base_name + "act"); + up_proj = ElasticLinear(hidden_dim, ffn_hidden, false, base_name + names._up_proj_name); + down_proj = ElasticLinear(ffn_hidden, hidden_dim, false, base_name + names._down_proj_name); + } + vector Forward(vector inputs, vector args) override { + auto x = gate_proj(inputs[0], 256, -1); + x = silu(x); + auto y = up_proj(inputs[0], 256, -1); + x = x * y; + x = down_proj(x, 256, -1); + return {x}; + } +}; + +class ElasticLLaMABlock final : public Module { + MultiHeadAttention attention; + ElasticLLaMAMLP mlp; + Layer norm1; + Layer norm2; + +public: + ElasticLLaMABlock() = default; + ElasticLLaMABlock(int hidden_dim, int head_size, int ffn_hidden, RoPEType RoPE_type, int cache_limit, const LLaMANameConfig &names, const string &base_name) { + attention = MultiHeadAttention(hidden_dim, head_size, head_size, hidden_dim / head_size, SPLIT_NONE, false, false, + RoPE_type, cache_limit, true, false, names, base_name + names._attn_base_name); + mlp = ElasticLLaMAMLP(hidden_dim, ffn_hidden, names, base_name + names._ffn_base_name); + norm1 = RMSNorm(hidden_dim, 1e-6, base_name + names._attn_norm_name); + norm2 = RMSNorm(hidden_dim, 1e-6, base_name + names._ffn_norm_name); + } + vector Forward(vector inputs, vector args) override { + auto x = norm1(inputs[0]); + x = attention({x, x, x})[0]; + auto tmp = x + inputs[0]; + x = norm2(tmp); + x = mlp({x})[0]; + x = x + tmp; + return {x}; + } +}; + +class ElasticLLaMAModel final : public Module { + Layer embedding; + vector blocks; + Layer norm; + Layer lm_head; + +public: + explicit ElasticLLaMAModel(const LLaMAConfig &config) : + ElasticLLaMAModel(config.vocab_size, config.hidden_dim, config.head_size, config.ffn_hidden, config.block_num, config.RoPE_type, config.cache_limit, + config.names_config, config.names_config.blk_name) { + } + ElasticLLaMAModel(int vocab_size, int hidden_dim, int head_size, int ffn_hidden, int block_num, RoPEType RoPE_type, int cache_limit, + const LLaMANameConfig &names, const string &base_name) { + embedding = Embedding(vocab_size, hidden_dim, names.token_embd_name); + blocks = List(block_num, hidden_dim, head_size, ffn_hidden, RoPE_type, cache_limit, names, base_name); + norm = RMSNorm(hidden_dim, 1e-6, names.post_norm_name); + lm_head = Linear(hidden_dim, vocab_size, false, names.lm_head_name); + } + vector Forward(vector inputs, vector args) override { + auto x = embedding(inputs[0]); + for (auto &block : blocks) { + x = block({x})[0]; + } + x = norm(x); + x = lm_head(x); + return {x}; + } +}; + +#endif // MODELING_LLAMA_HPP \ No newline at end of file From f5acae49f3e8aba586344a7ed3d31a9f4c309fad Mon Sep 17 00:00:00 2001 From: Rongjie Yi <41737961+yirongjie@users.noreply.github.com> Date: Wed, 17 Jul 2024 21:46:10 +0800 Subject: [PATCH 4/7] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 48a29e43..a781cb3f 100644 --- a/README.md +++ 
From f5acae49f3e8aba586344a7ed3d31a9f4c309fad Mon Sep 17 00:00:00 2001
From: Rongjie Yi <41737961+yirongjie@users.noreply.github.com>
Date: Wed, 17 Jul 2024 21:46:10 +0800
Subject: [PATCH 4/7] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 48a29e43..a781cb3f 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ Wait.. why on-device multimodal LLM? - It's a key building block for [intelligen
 
 ## Recent update
 - [🔥🔥Comming soon] Supporting Qualcomm NPU: [>1000 tokens/second prefilling!](https://arxiv.org/pdf/2407.05858v1)
-V1- [2024 July 17] Support new model: StableLM V2 1.6B https://github.com/UbiquitousLearning/mllm/pull/94
+- [2024 July 17] Support new model: StableLM V2 1.6B https://github.com/UbiquitousLearning/mllm/pull/94
 - [2024 July 2] Support new model: Yi V1.5 6B https://github.com/UbiquitousLearning/mllm/pull/88
 - [2024 May 29] Support new model: Mistral V0.2 7B https://github.com/UbiquitousLearning/mllm/pull/83
 - [2024 May 4] Support new model: QWen V1.5 0.5B https://github.com/UbiquitousLearning/mllm/pull/79

From 89118f3e0fd6c8f14dd935875503859bd93cfbff Mon Sep 17 00:00:00 2001
From: yirongjie
Date: Thu, 18 Jul 2024 02:19:05 +0000
Subject: [PATCH 5/7] fix: demo elastic llama with attn elastic

---
 examples/demo_elastic_llama.cpp             |  8 +-
 src/backends/cpu/CPUElasticLinear.cpp       | 19 ++--
 src/models/llama/modeling_elastic_llama.hpp | 98 +++++++++++++++++++--
 3 files changed, 101 insertions(+), 24 deletions(-)

diff --git a/examples/demo_elastic_llama.cpp b/examples/demo_elastic_llama.cpp
index 94e90d07..d4301d0b 100644
--- a/examples/demo_elastic_llama.cpp
+++ b/examples/demo_elastic_llama.cpp
@@ -14,7 +14,7 @@ using namespace mllm;
 int main(int argc, char **argv) {
     cmdline::parser cmdParser;
     cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama_vocab.mllm");
-    cmdParser.add("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_k.mllm");
+    cmdParser.add("model", 'm', "specify mllm model path", false, "../models/elasticllama-2-7b-chat-q4_k.mllm");
     cmdParser.add("limits", 'l', "max KV cache size", false, 400);
     cmdParser.add("thread", 't', "num of threads", false, 4);
     cmdParser.parse_check(argc, argv);
@@ -40,8 +40,10 @@ int main(int argc, char **argv) {
         auto input_tensor = tokenizer.tokenize(in_str, i);
         std::cout << "[Q] " << in_str << std::endl;
         std::cout << "[A] " << std::flush;
-        for (int step = 0; step < 1; step++) {
-            auto result = model({input_tensor});
+        for (int step = 0; step < 100; step++) {
+            // vector activate_dims = {32*256,256}; // 32*256 is attn_head*attn_hidden_dim(e.g. llama:32*128); 256 is ffn_hidden_dim(e.g. llama:11008)
+            vector activate_dims = {-1,-1};
+            auto result = model({input_tensor}, activate_dims);
             auto outputs = tokenizer.detokenize(result[0]);
             auto out_string = outputs.first;
             auto out_token = outputs.second;
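The demo now passes activate_dims as an extra argument to model(...); inside the module tree it travels as a std::any and is recovered with any_cast. A minimal standalone round-trip sketch follows (illustrative function names, not the mllm API):

#include <any>
#include <iostream>
#include <vector>

void forward_block(const std::vector<std::any> &args) {
    // Recover the per-call request packed by the caller.
    auto dims = std::any_cast<std::vector<int>>(args[0]);
    int attn_dim = dims[0]; // -1 => use the full attention width
    int ffn_dim  = dims[1]; // -1 => use the full FFN width
    std::cout << "attn=" << attn_dim << " ffn=" << ffn_dim << "\n";
}

int main() {
    std::vector<int> activate_dims = {-1, -1};   // full-width model
    forward_block({activate_dims});
    activate_dims = {32 * 8, 256};               // shrunk attention + FFN (example values)
    forward_block({activate_dims});
    return 0;
}

If the packed type and the cast type disagree, any_cast throws std::bad_any_cast; the loader changes in patch 7 below appear to lean on exactly that behaviour to probe which argument shape a model expects.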
diff --git a/src/backends/cpu/CPUElasticLinear.cpp b/src/backends/cpu/CPUElasticLinear.cpp
index 85a8c603..cfd877ba 100644
--- a/src/backends/cpu/CPUElasticLinear.cpp
+++ b/src/backends/cpu/CPUElasticLinear.cpp
@@ -18,24 +18,17 @@ ErrorCode CPUElasticLinear::reshape(vector> inputs, vector> outputs) {
+    int activate_input_dim = (int)inputs[1]->dataAt(0,0,0,0);
+    int activate_output_dim = (int)inputs[2]->dataAt(0,0,0,0);
     if(inputs[0]->count() == 0) {
         outputs[0]->reshape(0,0,0,0);
         return Op::reshape(inputs, outputs);
     }
-    // N     |    C       |   H                   |  W
-    // -----------------------------------------------
-    // 1     |out_channel | in_channel            |  1
-    //       |out_features| in_features           |
-    // -----------------------------------------------
-    // batch |in_channel  | seq_len               |  1
-    //       |in_features | inputs[0]->sequence() |
-    // -----------------------------------------------
-    // batch |out_channel | seq_len               |  1
-    //       |out_features| inputs[0]->sequence() |
+    int in_dimension = (activate_input_dim == -1) ? in_features_ : activate_input_dim;
+    int out_dimension = (activate_output_dim == -1) ? out_features_ : activate_output_dim;
     assert(inputs[0]->head() == 1);
-    assert(in_features_ == inputs[0]->dimension());
-    outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence(), out_features_);
-    //outputs[0]->setDtype(activationDtype());
+    assert(in_dimension == inputs[0]->dimension());
+    outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence(), out_dimension);
     return Op::reshape(inputs, outputs);
 }
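The reshape above reads the requested widths from two extra scalar inputs and resolves the -1 sentinel against the layer's full in/out features. A standalone sketch of that rule (illustrative types, not mllm code):

#include <cassert>

struct Shape { int batch, head, sequence, dimension; };

// Mirrors the elastic Linear reshape: keep batch/head/sequence, swap the last
// dimension for the active output width, and require the input to already be
// fed at the active input width.
Shape elastic_linear_out_shape(const Shape &in, int in_features, int out_features,
                               int activate_input_dim, int activate_output_dim) {
    const int in_dim  = (activate_input_dim  == -1) ? in_features  : activate_input_dim;
    const int out_dim = (activate_output_dim == -1) ? out_features : activate_output_dim;
    assert(in.head == 1);
    assert(in.dimension == in_dim);
    return {in.batch, in.head, in.sequence, out_dim};
}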
diff --git a/src/models/llama/modeling_elastic_llama.hpp b/src/models/llama/modeling_elastic_llama.hpp
index a3114ba2..3f4c9dd4 100644
--- a/src/models/llama/modeling_elastic_llama.hpp
+++ b/src/models/llama/modeling_elastic_llama.hpp
@@ -9,9 +9,84 @@
 #include "Module.hpp"
 #include "configuration_llama.hpp"
 #include "models/transformer/modeling_transformer.hpp"
+#include
 
 using namespace mllm;
 
+class ElasticMultiHeadAttention final : public Module {
+    Layer q_proj;
+    Layer k_proj;
+    Layer v_proj;
+    Layer q_rope;
+    Layer k_rope;
+    Layer k_cache;
+    Layer v_cache;
+    Layer mask;
+    Layer softmax;
+    Layer o_proj;
+    int head_size_{};
+    int kv_head_size_{};
+    int attn_hidden_dim_{};
+
+public:
+    ElasticMultiHeadAttention() = default;
+    ElasticMultiHeadAttention(int hidden_dim, int head_size, int kv_head_size, int attn_hidden_dim,
+                              RoPEType RoPE_type, int cache_limit, bool do_mask, bool bias,
+                              const TransformerNameConfig &names, const string &base_name) {
+        attn_hidden_dim_ = attn_hidden_dim;
+        head_size_ = head_size;
+        kv_head_size_ = kv_head_size;
+        q_proj = ElasticLinear(hidden_dim, head_size * attn_hidden_dim, bias, base_name + names._q_proj_name);
+        k_proj = ElasticLinear(hidden_dim, kv_head_size * attn_hidden_dim, bias, base_name + names._k_proj_name);
+        v_proj = ElasticLinear(hidden_dim, kv_head_size * attn_hidden_dim, bias, base_name + names._v_proj_name);
+
+        if (RoPE_type > 0) {
+            q_rope = RoPE(RoPE_type, base_name + "q_rope");
+            k_rope = RoPE(RoPE_type, base_name + "k_rope");
+        }
+        if (cache_limit > 0) {
+            k_cache = KVCache(head_size/kv_head_size, cache_limit, base_name + "k_cache");
+            v_cache = KVCache(head_size/kv_head_size, cache_limit, base_name + "v_cache");
+        }
+        if (do_mask) {
+            mask = Causalmask(base_name + "mask");
+        }
+        softmax = Softmax(DIMENSION, base_name + "softmax");
+        o_proj = ElasticLinear(head_size * attn_hidden_dim, hidden_dim, bias, base_name + names._o_proj_name);
+    }
+    vector Forward(vector inputs, vector args) override {
+        vector activate_dims = std::any_cast>(args[0]);
+        int activate_dim = activate_dims[0];
+        int activate_hidden_dim = (activate_dim == -1) ? attn_hidden_dim_ : (activate_dim / head_size_);
+        Tensor q, k, v;
+        q = q_proj(inputs[0], -1, activate_dim);
+        k = k_proj(inputs[1], -1, activate_dim);
+        v = v_proj(inputs[2], -1, activate_dim);
+        q = q.view(-1, head_size_, -1, activate_hidden_dim);
+        k = k.view(-1, kv_head_size_, -1, activate_hidden_dim);
+        v = v.view(-1, kv_head_size_, -1, activate_hidden_dim);
+        if (q_rope.ready() && k_rope.ready()) {
+            q = q_rope(q);
+            k = k_rope(k);
+        }
+        if (k_cache.ready() && v_cache.ready()) {
+            k = k_cache(k);
+            v = v_cache(v);
+        }
+        k = k.transpose(SEQUENCE, DIMENSION);
+        auto qk = Tensor::mm(q, k);
+        qk = qk / std::sqrt(activate_hidden_dim); //attn_hidden_dim_
+        if (mask.ready()) {
+            qk = mask(qk);
+        }
+        qk = softmax(qk);
+        auto o = Tensor::mm(qk, v);
+        o = o.view(-1, 1, -1, activate_hidden_dim * head_size_);
+        o = o_proj(o, activate_dim, -1);
+        return {o};
+    }
+};
+
 class ElasticLLaMAMLP final : public Module {
     Layer gate_proj;
     Layer silu;
@@ -27,17 +102,19 @@ class ElasticLLaMAMLP final : public Module {
         down_proj = ElasticLinear(ffn_hidden, hidden_dim, false, base_name + names._down_proj_name);
     }
     vector Forward(vector inputs, vector args) override {
-        auto x = gate_proj(inputs[0], 256, -1);
+        vector activate_dims = std::any_cast>(args[0]);
+        int activate_dim = activate_dims[0];
+        auto x = gate_proj(inputs[0], -1, activate_dim);
         x = silu(x);
-        auto y = up_proj(inputs[0], 256, -1);
+        auto y = up_proj(inputs[0], -1, activate_dim);
         x = x * y;
-        x = down_proj(x, 256, -1);
+        x = down_proj(x, activate_dim, -1);
         return {x};
     }
 };
 
 class ElasticLLaMABlock final : public Module {
-    MultiHeadAttention attention;
+    ElasticMultiHeadAttention attention;
     ElasticLLaMAMLP mlp;
     Layer norm1;
     Layer norm2;
@@ -45,18 +122,21 @@ class ElasticLLaMABlock final : public Module {
 public:
     ElasticLLaMABlock() = default;
     ElasticLLaMABlock(int hidden_dim, int head_size, int ffn_hidden, RoPEType RoPE_type, int cache_limit, const LLaMANameConfig &names, const string &base_name) {
-        attention = MultiHeadAttention(hidden_dim, head_size, head_size, hidden_dim / head_size, SPLIT_NONE, false, false,
+        attention = ElasticMultiHeadAttention(hidden_dim, head_size, head_size, hidden_dim / head_size,
                                        RoPE_type, cache_limit, true, false, names, base_name + names._attn_base_name);
         mlp = ElasticLLaMAMLP(hidden_dim, ffn_hidden, names, base_name + names._ffn_base_name);
         norm1 = RMSNorm(hidden_dim, 1e-6, base_name + names._attn_norm_name);
         norm2 = RMSNorm(hidden_dim, 1e-6, base_name + names._ffn_norm_name);
     }
     vector Forward(vector inputs, vector args) override {
+        vector activate_dims = std::any_cast>(args[0]);
+        vector dim_attns = {activate_dims[0]};
+        vector dim_mlps = {activate_dims[1]};
         auto x = norm1(inputs[0]);
-        x = attention({x, x, x})[0];
+        x = attention({x, x, x}, dim_attns)[0];
         auto tmp = x + inputs[0];
         x = norm2(tmp);
-        x = mlp({x})[0];
+        x = mlp({x}, dim_mlps)[0];
         x = x + tmp;
         return {x};
     }
@@ -81,9 +161,11 @@ class ElasticLLaMAModel final : public Module {
         lm_head = Linear(hidden_dim, vocab_size, false, names.lm_head_name);
     }
     vector Forward(vector inputs, vector args) override {
+        vector activate_dims = std::any_cast>(args[0]);
+
         auto x = embedding(inputs[0]);
         for (auto &block : blocks) {
-            x = block({x})[0];
+            x = block({x}, activate_dims)[0];
         }
         x = norm(x);
         x = lm_head(x);
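ElasticMultiHeadAttention interprets its activate_dim as head_size * active_head_dim and rescales the softmax logits by the active head dim rather than the full one. A standalone sketch of that bookkeeping (illustrative, not mllm code):

#include <cassert>
#include <cmath>

struct AttnDims { int head_dim; float scale; };

// activate_dim is head_size * active_head_dim, or -1 for the full width.
AttnDims elastic_attn_dims(int activate_dim, int head_size, int full_head_dim) {
    assert(activate_dim == -1 || activate_dim % head_size == 0);
    const int head_dim = (activate_dim == -1) ? full_head_dim : activate_dim / head_size;
    return {head_dim, 1.0f / std::sqrt(static_cast<float>(head_dim))};
}

Scaling by the active width keeps the logit variance roughly constant as the attention projections are shrunk, which is what the qk / std::sqrt(activate_hidden_dim) line in the diff does.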
From 51855901515d4f4b8822f76c5ee59b86800d1567 Mon Sep 17 00:00:00 2001
From: yirongjie
Date: Thu, 18 Jul 2024 02:20:59 +0000
Subject: [PATCH 6/7] fix

---
 src/models/llama/modeling_elastic_llama.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/models/llama/modeling_elastic_llama.hpp b/src/models/llama/modeling_elastic_llama.hpp
index 3f4c9dd4..b340c5dc 100644
--- a/src/models/llama/modeling_elastic_llama.hpp
+++ b/src/models/llama/modeling_elastic_llama.hpp
@@ -8,7 +8,6 @@
 #include "Layer.hpp"
 #include "Module.hpp"
 #include "configuration_llama.hpp"
-#include "models/transformer/modeling_transformer.hpp"
 #include
 
 using namespace mllm;

From 32f72589e14d0745ee7ea28a365c0c6b8ea5c143 Mon Sep 17 00:00:00 2001
From: yirongjie
Date: Thu, 18 Jul 2024 11:00:48 +0000
Subject: [PATCH 7/7] fix: load

---
 examples/demo_elastic_llama.cpp             | 40 +++++++++++++++++++--
 src/Module.hpp                              | 25 +++++++++++--
 src/ParamLoader.cpp                         |  6 +++-
 src/models/llama/modeling_elastic_llama.hpp | 18 ++++++++--
 4 files changed, 79 insertions(+), 10 deletions(-)

diff --git a/examples/demo_elastic_llama.cpp b/examples/demo_elastic_llama.cpp
index d4301d0b..585df659 100644
--- a/examples/demo_elastic_llama.cpp
+++ b/examples/demo_elastic_llama.cpp
@@ -14,7 +14,7 @@ using namespace mllm;
 int main(int argc, char **argv) {
     cmdline::parser cmdParser;
     cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama_vocab.mllm");
-    cmdParser.add("model", 'm', "specify mllm model path", false, "../models/elasticllama-2-7b-chat-q4_k.mllm");
+    cmdParser.add("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_k.mllm");
     cmdParser.add("limits", 'l', "max KV cache size", false, 400);
     cmdParser.add("thread", 't', "num of threads", false, 4);
     cmdParser.parse_check(argc, argv);
@@ -41,8 +41,42 @@ int main(int argc, char **argv) {
         std::cout << "[Q] " << in_str << std::endl;
         std::cout << "[A] " << std::flush;
         for (int step = 0; step < 100; step++) {
-            // vector activate_dims = {32*256,256}; // 32*256 is attn_head*attn_hidden_dim(e.g. llama:32*128); 256 is ffn_hidden_dim(e.g. llama:11008)
-            vector activate_dims = {-1,-1};
+            // vector> activate_dims = {{32*8,256}};
+            // 32*8 is attn_head*attn_hidden_dim(e.g. llama:32*128); 256 is ffn_hidden_dim(e.g. llama:11008)
+            vector> activate_dims = {
+                {-1,-1}, //0
+                {-1,-1}, //1
+                {-1,-1}, //2
+                {-1,-1}, //3
+                {-1,-1}, //4
+                {-1,-1}, //5
+                {-1,-1}, //6
+                {-1,-1}, //7
+                {-1,-1}, //8
+                {-1,-1}, //9
+                {-1,-1}, //10
+                {-1,-1}, //11
+                {-1,-1}, //12
+                {-1,-1}, //13
+                {-1,-1}, //14
+                {-1,-1}, //15
+                {-1,-1}, //16
+                {-1,-1}, //17
+                {-1,-1}, //18
+                {-1,-1}, //19
+                {-1,-1}, //20
+                {-1,-1}, //21
+                {-1,-1}, //22
+                {-1,-1}, //23
+                {-1,-1}, //24
+                {-1,-1}, //25
+                {-1,-1}, //26
+                {-1,-1}, //27
+                {-1,-1}, //28
+                {-1,-1}, //29
+                {-1,-1}, //30
+                {-1,-1}  //31
+            };
             auto result = model({input_tensor}, activate_dims);
             auto outputs = tokenizer.detokenize(result[0]);
             auto out_string = outputs.first;
             auto out_token = outputs.second;
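The 32-entry table above simply requests full width ({-1,-1}) for every LLaMA-7B block; it can also be built programmatically rather than written out. A small usage sketch (illustrative values, not part of the patch):

#include <vector>

// One {attn_dim, ffn_dim} pair per transformer block; -1 means full width.
std::vector<std::vector<int>> full_width_dims(int num_blocks) {
    return std::vector<std::vector<int>>(num_blocks, {-1, -1});
}

// e.g. shrink only the last 8 of 32 blocks (example widths):
// auto dims = full_width_dims(32);
// for (int i = 24; i < 32; ++i) dims[i] = {32 * 64, 5504};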
diff --git a/src/Module.hpp b/src/Module.hpp
index 12c57d59..c0470e01 100644
--- a/src/Module.hpp
+++ b/src/Module.hpp
@@ -12,6 +12,7 @@
 #include "backends/cpu/CPUBackend.hpp"
 #include
+#include
 #include
 #include
 #include
@@ -70,9 +71,27 @@ class Module {
             Tensor::gph_[std::to_string(i)] = Tensor(Module::backends[MLLM_CPU]);
             tmps.push_back(Tensor::gph_[std::to_string(i)]);
         }
-        vector tmpt = {0, 0};
-        uint64_t time_start = mllm_time_us();
-        operator()(tmps, tmpt);
+        vector alternate_args={
+            {},
+            vector{0, 0},
+            std::vector>(32, std::vector(2))
+        };
+        uint64_t time_start = 0;
+        for (auto args : alternate_args) {
+            time_start = mllm_time_us();
+            try {
+                operator()(tmps, args);
+                break;
+            } catch (const std::exception& e) {
+                if("bad any_cast" != e.what()) {
+                    std::cerr << e.what() << std::endl;
+                    exit(0);
+                }
+            } catch (...) {
+                std::cerr << "load error" << std::endl;
+                exit(0);
+            }
+        }
         uint64_t time_end = mllm_time_us();
         load_time_ = (time_end - time_start) / 1000.0F;//ms
         Module::doLoad = false;
diff --git a/src/ParamLoader.cpp b/src/ParamLoader.cpp
index f9029d8b..1dd51256 100644
--- a/src/ParamLoader.cpp
+++ b/src/ParamLoader.cpp
@@ -2,6 +2,7 @@
 #include "Types.hpp"
 #include
 #include
+#include
 #include
 #include
 #include
@@ -123,7 +124,10 @@ std::tuple ParamLoader::load(string name) {
 }
 DataType ParamLoader::getDataType(string name) {
     if (data_type_.count(name) != 1) {
-        if (this->fp_ != nullptr) {
+        if(this->path_ != "" && this->fp_ == nullptr){
+            std::cerr<path_<<" not found"<
+            exit(0);
+        }else if (this->fp_ != nullptr && this->path_ != "") {
             std::cerr<
diff --git a/src/models/llama/modeling_elastic_llama.hpp b/src/models/llama/modeling_elastic_llama.hpp
--- a/src/models/llama/modeling_elastic_llama.hpp
+++ b/src/models/llama/modeling_elastic_llama.hpp
 #include
 #include
 using namespace mllm;
@@ -146,6 +147,7 @@ class ElasticLLaMAModel final : public Module {
     vector blocks;
     Layer norm;
     Layer lm_head;
+    int num_layer_size;
 
 public:
     explicit ElasticLLaMAModel(const LLaMAConfig &config) :
@@ -158,13 +160,23 @@ class ElasticLLaMAModel final : public Module {
         blocks = List(block_num, hidden_dim, head_size, ffn_hidden, RoPE_type, cache_limit, names, base_name);
         norm = RMSNorm(hidden_dim, 1e-6, names.post_norm_name);
         lm_head = Linear(hidden_dim, vocab_size, false, names.lm_head_name);
+        num_layer_size = block_num;
     }
     vector Forward(vector inputs, vector args) override {
-        vector activate_dims = std::any_cast>(args[0]);
+        // vector activate_dims = std::any_cast>(args[0]);
+        // assert(activate_dims.size() == 2*num_layer_size);
+        // auto x = embedding(inputs[0]);
+        // for (int id = 0; id activate_dims_ = {activate_dims[id], activate_dims[id+1]};
+        //     x = blocks[id]({x}, activate_dims_)[0];
+        // }
+
+        vector> activate_dims = std::any_cast>>(args[0]);
+        assert(activate_dims.size() == num_layer_size);
         auto x = embedding(inputs[0]);
-        for (auto &block : blocks) {
-            x = block({x}, activate_dims)[0];
+        for (int id = 0; id