diff --git a/examples/demo_elastic_llama.cpp b/examples/demo_elastic_llama.cpp
index 585df659..ba69095a 100644
--- a/examples/demo_elastic_llama.cpp
+++ b/examples/demo_elastic_llama.cpp
@@ -14,7 +14,7 @@ using namespace mllm;
 int main(int argc, char **argv) {
     cmdline::parser cmdParser;
     cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama_vocab.mllm");
-    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_k.mllm");
+    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_0_4_4.mllm");
     cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
     cmdParser.add<int>("thread", 't', "num of threads", false, 4);
     cmdParser.parse_check(argc, argv);
@@ -44,7 +44,8 @@

     // vector<vector<int>> activate_dims = {{32*8,256}}; // 32*8 is attn_head*attn_hidden_dim(e.g. llama:32*128); 256 is ffn_hidden_dim(e.g. llama:11008)
     vector<vector<int>> activate_dims = {
-        {-1,-1}, //0
+        // {(int)(32*128*0.5),(int)(11008*0.5)}, //0
+        {-1,-1}, //0
         {-1,-1}, //1
         {-1,-1}, //2
         {-1,-1}, //3
diff --git a/examples/demo_llama.cpp b/examples/demo_llama.cpp
index b68603c1..4709cfbf 100644
--- a/examples/demo_llama.cpp
+++ b/examples/demo_llama.cpp
@@ -13,7 +13,7 @@ using namespace mllm;
 int main(int argc, char **argv) {
     cmdline::parser cmdParser;
     cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama_vocab.mllm");
-    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_k.mllm");
+    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_0_4_4.mllm");
     cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
     cmdParser.add<int>("thread", 't', "num of threads", false, 4);
     cmdParser.parse_check(argc, argv);
@@ -32,7 +32,8 @@ int main(int argc, char **argv) {
     vector<string> in_strs = {
         " Hello, who are you?",
         " What can you do?",
-        "Please introduce Beijing University of Posts and Telecommunications."};
+        "Please introduce Beijing University of Posts and Telecommunications."
+    };

     for (int i = 0; i < in_strs.size(); ++i) {
         auto in_str = in_strs[i];
diff --git a/include/Types.hpp b/include/Types.hpp
index eb0500ed..42c5fd0c 100644
--- a/include/Types.hpp
+++ b/include/Types.hpp
@@ -19,7 +19,11 @@
 using std::map;
 typedef map<std::string, float> OpParam;

-inline bool saveNDataFlag = false;
+// #define DEBUGSAVETENSOR
+// #define DEBUGOPTIME
+
+
+#define LLAMAFILE_SGEMM

 typedef enum {
     MLLM_CPU,
@@ -151,7 +155,6 @@ enum RoPEType {
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
 */
-#define LLAMAFILE_SGEMM

 #if defined(__ARM_NEON) && !defined(_MSC_VER)
 typedef __fp16 mllm_fp16_t;
diff --git a/src/Layer.hpp b/src/Layer.hpp
index 98986a72..eadf52ae 100644
--- a/src/Layer.hpp
+++ b/src/Layer.hpp
@@ -193,6 +193,9 @@ class Layer {
         }
         string layer_next_name = "out-" + op_->name();
         auto next_name = layername_2_tensorname[layer_next_name];
+#ifdef DEBUGOPTIME
+        auto start_t = mllm_time_us();
+#endif
         switch (Tensor::tensor_status) {
         case TENSOR_STATIC_INIT: {
             op_->reshape({Tensor::graphs[input.name()]}, {Tensor::graphs[next_name]});
@@ -207,9 +210,13 @@
             break;
         }
         }
-        if(saveNDataFlag){
-            Tensor::graphs[next_name]->saveNData(layer_next_name);
-        }
+#ifdef DEBUGOPTIME
+        auto end_t = mllm_time_us();
+        std::cout << op_->name() << " | " << (end_t - start_t) / 1000.0F << "ms" << std::endl;
+#endif
+#ifdef DEBUGSAVETENSOR
+        Tensor::graphs[next_name]->saveNData(layer_next_name);
+#endif
         return *Tensor::graphs[next_name];
     }
     Tensor &_2I1O_OP(Tensor &input0, Tensor &input1) {
@@ -239,6 +246,9 @@
         }
         string layer_next_name = "out-" + op_->name();
         auto next_name = layername_2_tensorname[layer_next_name];
+#ifdef DEBUGOPTIME
+        auto start_t = mllm_time_us();
+#endif
         switch (Tensor::tensor_status) {
         case TENSOR_STATIC_INIT: {
             op_->reshape({Tensor::graphs[input0.name()], Tensor::graphs[input1.name()]}, {Tensor::graphs[next_name]});
@@ -253,9 +263,13 @@
             break;
         }
         }
-        if(saveNDataFlag){
-            Tensor::graphs[next_name]->saveNData(layer_next_name);
-        }
+#ifdef DEBUGOPTIME
+        auto end_t = mllm_time_us();
+        std::cout << op_->name() << " | " << (end_t - start_t) / 1000.0F << "ms" << std::endl;
+#endif
+#ifdef DEBUGSAVETENSOR
+        Tensor::graphs[next_name]->saveNData(layer_next_name);
+#endif
         return *Tensor::graphs[next_name];
     }
     Tensor &_3I1O_OP(Tensor &input0, Tensor &input1, Tensor &input2) {
@@ -289,6 +303,9 @@
         }
         string layer_next_name = "out-" + op_->name();
         auto next_name = layername_2_tensorname[layer_next_name];
+#ifdef DEBUGOPTIME
+        auto start_t = mllm_time_us();
+#endif
         switch (Tensor::tensor_status) {
         case TENSOR_STATIC_INIT: {
             op_->reshape({Tensor::graphs[input0.name()], Tensor::graphs[input1.name()], Tensor::graphs[input2.name()]},
@@ -305,10 +322,14 @@
         default: {
             break;
         }
-        }
-        if(saveNDataFlag){
-            Tensor::graphs[next_name]->saveNData(layer_next_name);
         }
+#ifdef DEBUGOPTIME
+        auto end_t = mllm_time_us();
+        std::cout << op_->name() << " | " << (end_t - start_t) / 1000.0F << "ms" << std::endl;
+#endif
+#ifdef DEBUGSAVETENSOR
+        Tensor::graphs[next_name]->saveNData(layer_next_name);
+#endif
         return *Tensor::graphs[next_name];
     }
     Tensor &_3I1OO1_OP(Tensor &input0, Tensor &input1, Tensor &input2) {
@@ -334,6 +355,9 @@
         }
         string layer_next_name = "out-" + op_->name();
         auto next_name = layername_2_tensorname[layer_next_name];
+#ifdef DEBUGOPTIME
+        auto start_t = mllm_time_us();
+#endif
         switch (Tensor::tensor_status) {
         case TENSOR_STATIC_INIT: {
             op_->reshape({Tensor::graphs[input0.name()],
@@ -356,10 +380,14 @@
         default: {
             break;
         }
-        }
-        if(saveNDataFlag){
-            Tensor::graphs[next_name]->saveNData(layer_next_name);
         }
+#ifdef DEBUGOPTIME
+        auto end_t = mllm_time_us();
+        std::cout << op_->name() << " | " << (end_t - start_t) / 1000.0F << "ms" << std::endl;
+#endif
+#ifdef DEBUGSAVETENSOR
+        Tensor::graphs[next_name]->saveNData(layer_next_name);
+ #endif
         return *Tensor::graphs[next_name];
     }
     Tensor &_0I1O_OP() {
@@ -381,6 +409,9 @@
         }
         string layer_next_name = "param-" + op_->name();
         auto next_name = layername_2_tensorname[layer_next_name];
+#ifdef DEBUGOPTIME
+        auto start_t = mllm_time_us();
+#endif
         switch (Tensor::tensor_status) {
         case TENSOR_STATIC_INIT: {
             op_->reshape({}, {Tensor::graphs[next_name]});
@@ -395,9 +426,13 @@
             break;
         }
         }
-        if(saveNDataFlag){
-            Tensor::graphs[next_name]->saveNData(layer_next_name);
-        }
+#ifdef DEBUGOPTIME
+        auto end_t = mllm_time_us();
+        std::cout << op_->name() << " | " << (end_t - start_t) / 1000.0F << "ms" << std::endl;
+#endif
+#ifdef DEBUGSAVETENSOR
+        Tensor::graphs[next_name]->saveNData(layer_next_name);
+#endif
         return *Tensor::graphs[next_name];
     }
     vector<Tensor> _1INO_OP(Tensor &input, int N) {
@@ -442,6 +477,9 @@
             next_names.push_back(next_name);
             shared_outputs.push_back(Tensor::graphs[next_name]);
         }
+#ifdef DEBUGOPTIME
+        auto start_t = mllm_time_us();
+#endif
         switch (Tensor::tensor_status) {
         case TENSOR_STATIC_INIT: {
             op_->reshape({ Tensor::graphs[input.name()]}, shared_outputs);
@@ -456,12 +494,16 @@
             break;
         }
         }
+#ifdef DEBUGOPTIME
+        auto end_t = mllm_time_us();
+        std::cout << op_->name() << " | " << (end_t - start_t) / 1000.0F << "ms" << std::endl;
+#endif
         vector<Tensor> output_result = {};
         for (const auto &layer_next_name : layer_next_names) {
             auto next_name = layername_2_tensorname[layer_next_name];
-            if(saveNDataFlag){
-                Tensor::graphs[next_name]->saveNData(layer_next_name);
-            }
+#ifdef DEBUGSAVETENSOR
+            Tensor::graphs[next_name]->saveNData(layer_next_name);
+#endif
             output_result.push_back(*Tensor::graphs[next_name]);
         }
         return output_result;
diff --git a/src/Module.hpp b/src/Module.hpp
index 4ad6ff6b..42bf6ff8 100644
--- a/src/Module.hpp
+++ b/src/Module.hpp
@@ -250,6 +250,12 @@ class Module {
             // std::cout<
diff --git a/src/Tensor.cpp b/src/Tensor.cpp
--- a/src/Tensor.cpp
+++ b/src/Tensor.cpp
 #include "OpDefined.hpp"
+#include "Timing.hpp"
 #include "Types.hpp"
 #include "backends/cpu/CPUTensorFunction.hpp"
@@ -92,6 +93,9 @@ Tensor& Tensor::getFunc(const std::string& suffix, const TensorFuncType type, ve
     for (auto &other_tensor : other_tensors) {
         tensorPtrs.push_back(other_tensor);
     }
+#ifdef DEBUGOPTIME
+    auto start_t = mllm_time_us();
+#endif
     switch (Tensor::tensor_status) {
     case TENSOR_STATIC_INIT: {
         func->setup({Tensor::graphs[next_name].get()}, tensorPtrs, float_args);
@@ -99,14 +103,18 @@
     }
     case TENSOR_STATIC_READY: {
         func->execute({Tensor::graphs[next_name].get()},tensorPtrs, float_args);
-        if(saveNDataFlag){
-            Tensor::graphs[next_name]->saveData();
-        }
         break;
     }
     default: {
     }
     }
+#ifdef DEBUGOPTIME
+    auto end_t = mllm_time_us();
+    std::cout << next_name << " | " << (end_t - start_t) / 1000.0F << "ms" << std::endl;
+#endif
+#ifdef DEBUGSAVETENSOR
+    Tensor::graphs[next_name]->saveNData();
+#endif
     return *Tensor::graphs[next_name];
 }

@@ -238,6 +246,9 @@ Tensor& Tensor::getStaticFunc(const std::string& suffix, const TensorFuncType ty
     if (Module::doLoad) {
         return *Tensor::graphs[next_name];
     }
+#ifdef DEBUGOPTIME
+    auto start_t = mllm_time_us();
+#endif
     switch (Tensor::tensor_status) {
     case TENSOR_STATIC_INIT: {
         func->setup({Tensor::graphs[next_name].get()}, other_tensors, float_args);
@@ -245,14 +256,18 @@ Tensor& Tensor::getStaticFunc(const std::string& suffix, const TensorFuncType ty
     }
     case TENSOR_STATIC_READY: {
         func->execute({Tensor::graphs[next_name].get()}, other_tensors, float_args);
-        if(saveNDataFlag){
-            Tensor::graphs[next_name]->saveData();
-        }
         break;
     }
     default: {
     }
     }
+#ifdef DEBUGOPTIME
+    auto end_t = mllm_time_us();
+    std::cout << next_name << " | " << (end_t - start_t) / 1000.0F << "ms" << std::endl;
+#endif
+#ifdef DEBUGSAVETENSOR
+    Tensor::graphs[next_name]->saveNData();
+#endif
     return *Tensor::graphs[next_name];
 }

@@ -299,6 +314,9 @@ std::vector<Tensor> Tensor::getStaticFuncOupts(vector<std::string> out_names, co
     for (auto out_name: out_names) {
         outPtrs.push_back(Tensor::graphs[out_name].get());
     }
+#ifdef DEBUGOPTIME
+    auto start_t = mllm_time_us();
+#endif
     switch (Tensor::tensor_status) {
     case TENSOR_STATIC_INIT: {
         func->setup(outPtrs, input_tensors, float_args);
@@ -306,16 +324,20 @@ std::vector<Tensor> Tensor::getStaticFuncOupts(vector<std::string> out_names, co
     }
     case TENSOR_STATIC_READY: {
         func->execute(outPtrs, input_tensors, float_args);
-        if(saveNDataFlag){
-            for (auto out_name: out_names) {
-                Tensor::graphs[out_name]->saveData();
-            }
-        }
         break;
     }
     default: {
     }
     }
+#ifdef DEBUGOPTIME
+    auto end_t = mllm_time_us();
+    std::cout << out_names[0] << " | " << (end_t - start_t) / 1000.0F << "ms" << std::endl;
+#endif
+#ifdef DEBUGSAVETENSOR
+    for (auto out_name: out_names) {
+        Tensor::graphs[out_name]->saveNData();
+    }
+#endif
     std::vector<Tensor> results;
     for (auto out_name: out_names) {
         results.push_back(*Tensor::graphs[out_name]);
diff --git a/src/backends/cpu/CPUEmbedding.cpp b/src/backends/cpu/CPUEmbedding.cpp
index 05b74a26..12dca917 100644
--- a/src/backends/cpu/CPUEmbedding.cpp
+++ b/src/backends/cpu/CPUEmbedding.cpp
@@ -115,6 +115,7 @@ ErrorCode CPUEmbedding::execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
 dtype());
@@ -708,23 +712,85 @@ ErrorCode mat_mul_elastic(Tensor *src0, Tensor *src1, Tensor *dst, bool support_
         to->setBackend(src0->backend());
         to->setDtype(vec_dot_type);
         to->alloc();
-        void *row_src = src0->rawHostPtr();
-        void *row_dst = to->rawHostPtr();
-        auto row_size_src = row_size(src0_dtype, src0->dimension());
-        auto row_size_dst = row_size(vec_dot_type, to->dimension());
-        auto n_row = src0->batch() * src0->head() * src0->sequence();
-        auto n_ele = src0->dimension();
-#pragma omp parallel for num_threads(thread_count)
-        for(int i = 0;i < n_row;i++){ // copy row by row
-            auto row1 = (char *)row_src + i * row_size_src;
-            auto row2 = (char *)row_dst + i * row_size_dst;
-            x_to_vec_dot_type(reinterpret_cast<const float *>(row1), row2, n_ele);
+        int64_t i_processed = 0;
+        if (from_float_to_mat && gemv && dst->masterTensor()==nullptr){
+            for (int b = 0; b < src0->batch(); b++) {
+                for (int h = 0; h < src0->head(); h++) {
+#pragma omp parallel for collapse(1) num_threads(thread_count)
+                    for (int64_t s = 0; s < src0->sequence() - src0->sequence() % 4; s += 4) {
+                        from_float_to_mat(src0->hostPtr<float>() + src0->offset(b, h, s, 0),
+                                          (char *)to->rawHostPtr() + to->offset(b, h, s, 0) * type_size(to->dtype()) / blck_size(to->dtype()),
+                                          4, src0->dimension(), blck_size_interleave);
+                    }
+                    i_processed = src0->sequence() - src0->sequence() % 4;
+                }
+            }
+        }
+#pragma omp parallel for collapse(3) num_threads(thread_count)
+        for (int b = 0; b < src0->batch(); b++) {
+            for (int h = 0; h < src0->head(); h++) {
+                for (int s = i_processed; s < src0->sequence(); s++) {
+                    x_to_vec_dot_type(src0->hostPtr<float>() + src0->offset(b, h, s, 0),
+                                      (char *)to->rawHostPtr() + to->offset(b, h, s, 0) * type_size(to->dtype()) / blck_size(to->dtype()),
+                                      src0->dimension());
+                }
+            }
         }
         src0 = to.get();
         src0_dtype = src0->dtype();
         src0_type_size = type_size(src0->dtype());
         src0_blck_size = blck_size(src0->dtype());
     }
+
+#ifdef LLAMAFILE_SGEMM
+    if (check_llamafile_sgemm(N, M, use_K/blck_size(src1->dtype()),src1->dtype(),src0->dtype(),dst->dtype())&&!support_bias){
+        const int ld_src1 = src1->sequence_skip_dim();
+        const int ld_src0 = src0->sequence_skip_dim();
+        const int ld_dst = dst->sequence_skip_dim();
+#pragma omp parallel for collapse(3) num_threads(thread_count)
+        for (int64_t b = 0; b < dst->batch(); b++){
+            for (int64_t h = 0; h < dst->head(); h++){
+                for (int id = 0; id < thread_count; id++){
+                    llamafile_sgemm(N, M, use_K/blck_size(src1->dtype()),
+                                    (char *)src1->rawHostPtr() + src1->offset(b, h, 0, 0) * src1_type_size / src1_blck_size,
+                                    ld_src1 / src1_blck_size,
+                                    (char *)src0->rawHostPtr() + src0->offset(b, h, 0, 0) * src0_type_size / src0_blck_size,
+                                    ld_src0/ src0_blck_size,
+                                    (char *)dst->rawHostPtr() + dst->offset(b, h, 0, 0) * type_size(dst->dtype()) / blck_size(dst->dtype()),
+                                    ld_dst/blck_size(dst->dtype()),
+                                    id, thread_count,
+                                    src1->dtype(),
+                                    src0->dtype(),
+                                    dst->dtype());
+                }
+            }
+        }
+        return MLLM_NO_ERROR;
+    }
+#endif
+
+    if(gemv&&!support_bias){
+        int nth=thread_count;
+#pragma omp parallel for collapse(1) num_threads(thread_count)
+        for (int ith = 0; ith < nth; ith++){
+            int64_t i_processed = 0;
+            int64_t seq_start = (ith * N) / nth;
+            int64_t seq_end = ((ith + 1) * N) / nth;
+            if (gemm && (M > 3) && dst->masterTensor()==nullptr) {
+                gemm(use_K, dst->hostPtr<float>() + dst->offset(0, 0, 0, seq_start),
+                     N, (char *)src1->rawHostPtr()+ src1->offset(0, 0, seq_start, 0) * src1_type_size / src1_blck_size,
+                     (char *)src0->rawHostPtr(), M - M % 4, N/nth);
+                i_processed = M - M % 4;
+            }
+            for (int iter = i_processed; iter < M; iter++) { //M-M%4
+                gemv(use_K, dst->hostPtr<float>() + dst->offset(0, 0, iter, seq_start),
+                     N, (char *)src1->rawHostPtr()+ src1->offset(0, 0, seq_start, 0) * src1_type_size / src1_blck_size,
+                     (char *)src0->rawHostPtr() + src0->offset(0, 0, iter, 0) * src0_type_size / src0_blck_size,
+                     1, N/nth);
+            }
+        }
+        return MLLM_NO_ERROR;
+    }
     Tensor *src0_cal = src0;