diff --git a/examples/demo_tinyllama.cpp b/examples/demo_tinyllama.cpp
index d844c078..627074e4 100644
--- a/examples/demo_tinyllama.cpp
+++ b/examples/demo_tinyllama.cpp
@@ -55,6 +55,7 @@ int main(int argc, char **argv) {
             chatPostProcessing(out_token, input_tensor, {});
         }
         printf("\n");
+        model.profiling();
     }
 
     return 0;
diff --git a/src/Module.hpp b/src/Module.hpp
index 8dd9e3a1..802147ad 100644
--- a/src/Module.hpp
+++ b/src/Module.hpp
@@ -8,15 +8,23 @@
 #include "Op.hpp"
 #include "ParamLoader.hpp"
 #include "Backend.hpp"
+#include "Timing.hpp"
 #include "backends/cpu/CPUBackend.hpp"
 #include <any>
 #include <memory>
+#include <numeric>
 #include <utility>
+#include <iostream>
 
 namespace mllm {
 
 class Module {
+private:
+    double load_time_;
+    int prefilling_token_size_ = 0;
+    vector<double> inference_times_;
+
 public:
     static map<BackendType, Backend *> backends;
     static AbstructLoader *loader;
 
@@ -48,6 +56,7 @@ class Module {
     }
 
     void load(string path) {
+        mllm_time_init();
         initLoader(path);
         Module::doLoad = true;
         vector<Tensor> tmps;
@@ -57,7 +66,10 @@ class Module {
             tmps.push_back(Tensor::gph_[std::to_string(i)]);
         }
         vector<int> tmpt = {0, 0};
+        uint64_t time_start = mllm_time_us();
         operator()(tmps, tmpt);
+        uint64_t time_end = mllm_time_us();
+        load_time_ = (time_end - time_start) / 1000.0F; // ms
         Module::doLoad = false;
         Tensor::gph_.clear();
     }
@@ -99,13 +111,21 @@ class Module {
             }
             tensor_status = TENSOR_STATIC_INIT;
 
+            uint64_t time_start = mllm_time_us();
             Forward(inputs, anyArgs);
             for (auto &input : inputs) {
                 input.status() = TENSOR_STATIC_READY;
             }
             tensor_status = TENSOR_STATIC_READY;
+            auto output = Forward(inputs, anyArgs);
+            uint64_t time_end = mllm_time_us();
+            if (prefilling_token_size_ == 0) {
+                prefilling_token_size_ = inputs[0].sequence();
+            }
+            double inference_time = (time_end - time_start) / 1000.0F; // ms
+            inference_times_.push_back(inference_time);
 
-            return Forward(inputs, anyArgs);
+            return output;
         } else {
             return Forward(inputs, anyArgs);
         }
@@ -151,6 +171,21 @@ class Module {
         listIdx = 0;
         return modules;
     }
+
+    void profiling() {
+        printf("\n");
+        std::cout << "===========================================" << std::endl;
+        std::cout << "  Load time: " << load_time_ / 1000.0F << " s" << std::endl;
+        if (prefilling_token_size_) {
+            std::cout << "  Prefilling speed: " << 1000 * prefilling_token_size_ / inference_times_[0] << " tokens/s" << std::endl;
+        }
+        if (inference_times_.size() > 1) {
+            double sum_decoding_time = std::accumulate(std::begin(inference_times_) + 1, std::end(inference_times_), 0.0);
+            double mean_decoding_time = sum_decoding_time / (inference_times_.size() - 1);
+            std::cout << "  Decoding speed: " << 1000 / mean_decoding_time << " tokens/s" << std::endl;
+        }
+        std::cout << "===========================================" << std::endl;
+    }
 };
 
 } // namespace mllm
diff --git a/src/ParamLoader.hpp b/src/ParamLoader.hpp
index 90017291..6265ff67 100644
--- a/src/ParamLoader.hpp
+++ b/src/ParamLoader.hpp
@@ -52,7 +52,7 @@ class AbstructLoader {
 public:
     virtual bool load(mllm::Tensor *tensor) = 0;
     virtual bool load(std::shared_ptr<mllm::Tensor> tensor) = 0;
-    virtual size_t getTensorSize(string name) { fprintf(stderr, "loader not support getTensorSize"); assert(false); }
+    virtual size_t getTensorSize(string name) { fprintf(stderr, "loader not support getTensorSize"); return NOT_SUPPORT; }
     virtual DataType getDataType(string name) { return MLLM_TYPE_COUNT; }
 };
 
diff --git a/src/backends/cpu/type/type.cpp b/src/backends/cpu/type/type.cpp
index ed230372..77e30691 100644
--- a/src/backends/cpu/type/type.cpp
+++ b/src/backends/cpu/type/type.cpp
@@ -103,7 +103,7 @@ static void vec_dot_fp32_local(const int n, float *__restrict s, const float *__
     vec_dot_fp32_arm(n, s, vx, vy);
 #endif
 }
-
+/*
 void vec_dot_q8_0_q8_0(int n, float * __restrict s, const void * __restrict vx, const void * __restrict vy) {
     const int qk = QK8_0;
     const int nb = n / qk; // number of blocks
@@ -120,10 +120,10 @@ void vec_dot_q8_0_q8_0(int n, float * __restrict s, const void * __restrict vx,
     assert(nb % 2 == 0); // TODO: handle odd nb
 
     for (int i = 0; i < nb; i += 2) {
-        const block_q8_0 * restrict x0 = &x[i + 0];
-        const block_q8_0 * restrict x1 = &x[i + 1];
-        const block_q8_0 * restrict y0 = &y[i + 0];
-        const block_q8_0 * restrict y1 = &y[i + 1];
+        const block_q8_0 * x0 = &x[i + 0];
+        const block_q8_0 * x1 = &x[i + 1];
+        const block_q8_0 * y0 = &y[i + 0];
+        const block_q8_0 * y1 = &y[i + 1];
 
         const int8x16_t x0_0 = vld1q_s8(x0->qs);
         const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
@@ -205,7 +205,7 @@ void vec_dot_q8_0_q8_0(int n, float * __restrict s, const void * __restrict vx,
     *s = sumf;
 #endif
 }
-
+*/
 void fp32_add_row_to(int n, const float * MLLM_RESTRICT src, float * MLLM_RESTRICT dst, float alpha){
     int i = 0;
 #ifdef __AVX2__
@@ -251,7 +251,7 @@ void fp_16_add_row_to(int n, const mllm_fp16_t * MLLM_RESTRICT src, float * MLLM
         _mm256_storeu_ps(dst + i, res_vec); // store back to dst
     }
 #elif defined(__ARM_NEON)
-    ASSERT(false); // not support now
+    std::cout << "not support now" << std::endl;
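Note: the speed figures printed by the new Module::profiling() reduce to two formulas: prefill throughput is the prompt length divided by the latency of the first forward pass, and decode throughput is the reciprocal of the mean per-token latency over the remaining passes. Below is a minimal standalone sketch of that arithmetic for reference; report_speeds and the sample timings are illustrative only, not part of this patch.

    #include <iostream>
    #include <numeric>
    #include <vector>

    // Mirrors the arithmetic in Module::profiling(): `times_ms` holds one
    // entry per forward pass, the first being the prefill over the prompt.
    static void report_speeds(int prefill_tokens, const std::vector<double> &times_ms) {
        if (!times_ms.empty() && prefill_tokens > 0) {
            // prefill_tokens tokens were processed in times_ms[0] milliseconds
            std::cout << "Prefilling speed: " << 1000.0 * prefill_tokens / times_ms[0] << " tokens/s\n";
        }
        if (times_ms.size() > 1) {
            // each decode step emits one token, so speed = 1 / mean latency
            double sum = std::accumulate(times_ms.begin() + 1, times_ms.end(), 0.0);
            double mean = sum / (times_ms.size() - 1);
            std::cout << "Decoding speed: " << 1000.0 / mean << " tokens/s\n";
        }
    }

    int main() {
        // hypothetical run: 64-token prompt prefilled in 800 ms,
        // then four decode steps of ~50 ms each
        report_speeds(64, {800.0, 52.0, 49.0, 50.0, 51.0});
        return 0; // prints ~80 tokens/s prefill, ~19.8 tokens/s decode
    }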