Skip to content

Commit

Permalink
Merge pull request #91 from yirongjie/main
Browse files Browse the repository at this point in the history
feat: Inference speed (tokens/s) profiling
  • Loading branch information
yirongjie authored Jul 15, 2024
2 parents ca6a898 + be37bdc commit ec3360d
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 18 deletions.
1 change: 1 addition & 0 deletions examples/demo_tinyllama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ int main(int argc, char **argv) {
chatPostProcessing(out_token, input_tensor, {});
}
printf("\n");
model.profiling();
}

return 0;
Expand Down
37 changes: 36 additions & 1 deletion src/Module.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,23 @@
#include "Op.hpp"
#include "ParamLoader.hpp"
#include "Backend.hpp"
#include "Timing.hpp"
#include "backends/cpu/CPUBackend.hpp"

#include <any>
#include <memory/SystemMemoryManager.hpp>
#include <numeric>
#include <utility>
#include <vector>

namespace mllm {

class Module {
private:
double load_time_;
int prefilling_token_size_=0;
vector<double> inference_times_;

public:
static map<BackendType, Backend *> backends;
static AbstructLoader *loader;
Expand Down Expand Up @@ -48,6 +56,7 @@ class Module {
}

void load(string path) {
mllm_time_init();
initLoader(path);
Module::doLoad = true;
vector<Tensor> tmps;
Expand All @@ -57,7 +66,10 @@ class Module {
tmps.push_back(Tensor::gph_[std::to_string(i)]);
}
vector<int> tmpt = {0, 0};
uint64_t time_start = mllm_time_us();
operator()(tmps, tmpt);
uint64_t time_end = mllm_time_us();
load_time_ = (time_end - time_start) / 1000.0F;//ms
Module::doLoad = false;
Tensor::gph_.clear();
}
Expand Down Expand Up @@ -99,13 +111,21 @@ class Module {
}
tensor_status = TENSOR_STATIC_INIT;

uint64_t time_start = mllm_time_us();
Forward(inputs, anyArgs);
for (auto &input : inputs) {
input.status() = TENSOR_STATIC_READY;
}
tensor_status = TENSOR_STATIC_READY;
auto output = Forward(inputs, anyArgs);
uint64_t time_end = mllm_time_us();
if(prefilling_token_size_==0){
prefilling_token_size_ = inputs[0].sequence();
}
double inference_time_ = (time_end - time_start) / 1000.0F;//ms
inference_times_.push_back(inference_time_);

return Forward(inputs, anyArgs);
return output;
} else {
return Forward(inputs, anyArgs);
}
Expand Down Expand Up @@ -151,6 +171,21 @@ class Module {
listIdx = 0;
return modules;
}

// Print a summary of runtime performance collected by load() and operator():
//  - model load time (load_time_ is in ms; printed in seconds),
//  - prefilling speed: tokens of the first forward pass / its latency,
//  - decoding speed: 1 token per step averaged over all passes after the first.
// Assumes inference_times_ entries are milliseconds per forward pass.
void profiling() {
    printf("\n");
    std::cout << "===========================================" << std::endl;
    std::cout << " Load time: " << load_time_/1000.0F << " s" << std::endl;
    // Guard on the vector as well: prefilling_token_size_ alone does not
    // prove a timing sample was recorded, and [0] on an empty vector is UB.
    if (prefilling_token_size_ && !inference_times_.empty()) {
        std::cout << " Prefilling speed: " << 1000 * prefilling_token_size_ / inference_times_[0] << " tokens/s" << std::endl;
    }
    if (inference_times_.size() > 1) {
        // Skip the first entry (prefill) when averaging decode latency.
        double sum_decoding_time = std::accumulate(std::begin(inference_times_) + 1, std::end(inference_times_), 0.0);
        double mean_decoding_time = sum_decoding_time / (inference_times_.size() - 1);
        // One token is emitted per decode step, so tokens/s = 1000 ms / mean ms.
        std::cout << " Decoding speed: " << 1000 / mean_decoding_time << " tokens/s" << std::endl;
    }
    std::cout << "===========================================" << std::endl;
}
};

} // namespace mllm
Expand Down
2 changes: 1 addition & 1 deletion src/ParamLoader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class AbstructLoader {
public:
virtual bool load(mllm::Tensor *tensor) = 0;
virtual bool load(std::shared_ptr<mllm::Tensor> tensor) = 0;
virtual size_t getTensorSize(string name){fprintf(stderr,"loader not support getTensorSize");assert(false);}
virtual size_t getTensorSize(string name){fprintf(stderr,"loader not support getTensorSize");return NOT_SUPPORT;}
virtual DataType getDataType(string name) {return MLLM_TYPE_COUNT;}
};

Expand Down
32 changes: 16 additions & 16 deletions src/backends/cpu/type/type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ static void vec_dot_fp32_local(const int n, float *__restrict s, const float *__
vec_dot_fp32_arm(n, s, vx, vy);
#endif
}

/*
void vec_dot_q8_0_q8_0(int n, float * __restrict s, const void * __restrict vx, const void * __restrict vy) {
const int qk = QK8_0;
const int nb = n / qk; // number of blocks
Expand All @@ -120,10 +120,10 @@ void vec_dot_q8_0_q8_0(int n, float * __restrict s, const void * __restrict vx,
assert(nb % 2 == 0); // TODO: handle odd nb
for (int i = 0; i < nb; i += 2) {
const block_q8_0 * restrict x0 = &x[i + 0];
const block_q8_0 * restrict x1 = &x[i + 1];
const block_q8_0 * restrict y0 = &y[i + 0];
const block_q8_0 * restrict y1 = &y[i + 1];
const block_q8_0 * x0 = &x[i + 0];
const block_q8_0 * x1 = &x[i + 1];
const block_q8_0 * y0 = &y[i + 0];
const block_q8_0 * y1 = &y[i + 1];
const int8x16_t x0_0 = vld1q_s8(x0->qs);
const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
Expand Down Expand Up @@ -205,7 +205,7 @@ void vec_dot_q8_0_q8_0(int n, float * __restrict s, const void * __restrict vx,
*s = sumf;
#endif
}

*/
void fp32_add_row_to(int n, const float * MLLM_RESTRICT src, float * MLLM_RESTRICT dst, float alpha){
int i = 0;
#ifdef __AVX2__
Expand Down Expand Up @@ -251,7 +251,7 @@ void fp_16_add_row_to(int n, const mllm_fp16_t * MLLM_RESTRICT src, float * MLLM
_mm256_storeu_ps(dst + i, res_vec); // store back to dst
}
#elif defined(__ARM_NEON)
ASSERT(false); // not support now
std::cout<<"not support now"<<std::endl;
#endif

// 处理剩余的元素
Expand Down Expand Up @@ -425,15 +425,15 @@ type_traits_t type_traits[] = {
{},
{},
{},
/*[MLLM_TYPE_Q8_0] = */{
.size = sizeof(block_q8_0),
.blck_size = QK8_0,
.to_float = (mllm_to_float_func) dequantize_row_q8_0,
.from_float = (mllm_from_float_func) quantize_row_q8_0,
.vec_dot = (mllm_vec_dot_func) vec_dot_q8_0_q8_0,
.vec_dot_type = MLLM_TYPE_Q8_0,
.add_row_to = (mllm_vec_add_row_func)q8_0_add_row_to,
},
// /*[MLLM_TYPE_Q8_0] = */{
// .size = sizeof(block_q8_0),
// .blck_size = QK8_0,
// .to_float = (mllm_to_float_func) dequantize_row_q8_0,
// .from_float = (mllm_from_float_func) quantize_row_q8_0,
// .vec_dot = (mllm_vec_dot_func) vec_dot_q8_0_q8_0,
// .vec_dot_type = MLLM_TYPE_Q8_0,
// .add_row_to = (mllm_vec_add_row_func)q8_0_add_row_to,
// },
/*[MLLM_TYPE_Q8_1] = */{},
{},
{},
Expand Down

0 comments on commit ec3360d

Please sign in to comment.