Skip to content

Commit

Permalink
Merge pull request #91 from yirongjie/main
Browse files Browse the repository at this point in the history
feat: Inference speed (tokens/s) profiling
  • Loading branch information
yirongjie authored Jul 15, 2024
2 parents ca6a898 + be37bdc commit ec3360d
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 18 deletions.
1 change: 1 addition & 0 deletions examples/demo_tinyllama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ int main(int argc, char **argv) {
chatPostProcessing(out_token, input_tensor, {});
}
printf("\n");
model.profiling();
}

return 0;
Expand Down
37 changes: 36 additions & 1 deletion src/Module.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,23 @@
#include "Op.hpp"
#include "ParamLoader.hpp"
#include "Backend.hpp"
#include "Timing.hpp"
#include "backends/cpu/CPUBackend.hpp"

#include <any>
#include <memory/SystemMemoryManager.hpp>
#include <numeric>
#include <utility>
#include <vector>

namespace mllm {

class Module {
private:
double load_time_;
int prefilling_token_size_=0;
vector<double> inference_times_;

public:
static map<BackendType, Backend *> backends;
static AbstructLoader *loader;
Expand Down Expand Up @@ -48,6 +56,7 @@ class Module {
}

void load(string path) {
mllm_time_init();
initLoader(path);
Module::doLoad = true;
vector<Tensor> tmps;
Expand All @@ -57,7 +66,10 @@ class Module {
tmps.push_back(Tensor::gph_[std::to_string(i)]);
}
vector<int> tmpt = {0, 0};
uint64_t time_start = mllm_time_us();
operator()(tmps, tmpt);
uint64_t time_end = mllm_time_us();
load_time_ = (time_end - time_start) / 1000.0F;//ms
Module::doLoad = false;
Tensor::gph_.clear();
}
Expand Down Expand Up @@ -99,13 +111,21 @@ class Module {
}
tensor_status = TENSOR_STATIC_INIT;

uint64_t time_start = mllm_time_us();
Forward(inputs, anyArgs);
for (auto &input : inputs) {
input.status() = TENSOR_STATIC_READY;
}
tensor_status = TENSOR_STATIC_READY;
auto output = Forward(inputs, anyArgs);
uint64_t time_end = mllm_time_us();
if(prefilling_token_size_==0){
prefilling_token_size_ = inputs[0].sequence();
}
double inference_time_ = (time_end - time_start) / 1000.0F;//ms
inference_times_.push_back(inference_time_);

return Forward(inputs, anyArgs);
return output;
} else {
return Forward(inputs, anyArgs);
}
Expand Down Expand Up @@ -151,6 +171,21 @@ class Module {
listIdx = 0;
return modules;
}

// Print a summary of runtime performance collected by load() and operator():
//  - model load time (load_time_ is in ms; printed in seconds),
//  - prefilling speed: tokens of the first forward pass / its latency,
//  - decoding speed: 1 token per step averaged over all passes after the first.
// Assumes inference_times_ entries are milliseconds per forward pass.
void profiling() {
    printf("\n");
    std::cout << "===========================================" << std::endl;
    std::cout << " Load time: " << load_time_/1000.0F << " s" << std::endl;
    // Guard on the vector as well: prefilling_token_size_ alone does not
    // prove a timing sample was recorded, and [0] on an empty vector is UB.
    if (prefilling_token_size_ && !inference_times_.empty()) {
        std::cout << " Prefilling speed: " << 1000 * prefilling_token_size_ / inference_times_[0] << " tokens/s" << std::endl;
    }
    if (inference_times_.size() > 1) {
        // Skip the first entry (prefill) when averaging decode latency.
        double sum_decoding_time = std::accumulate(std::begin(inference_times_) + 1, std::end(inference_times_), 0.0);
        double mean_decoding_time = sum_decoding_time / (inference_times_.size() - 1);
        // One token is emitted per decode step, so tokens/s = 1000 ms / mean ms.
        std::cout << " Decoding speed: " << 1000 / mean_decoding_time << " tokens/s" << std::endl;
    }
    std::cout << "===========================================" << std::endl;
}
};

} // namespace mllm
Expand Down
2 changes: 1 addition & 1 deletion src/ParamLoader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class AbstructLoader {
public:
virtual bool load(mllm::Tensor *tensor) = 0;
virtual bool load(std::shared_ptr<mllm::Tensor> tensor) = 0;
virtual size_t getTensorSize(string name){fprintf(stderr,"loader not support getTensorSize");assert(false);}
virtual size_t getTensorSize(string name){fprintf(stderr,"loader not support getTensorSize");return NOT_SUPPORT;}
virtual DataType getDataType(string name) {return MLLM_TYPE_COUNT;}
};

Expand Down
32 changes: 16 additions & 16 deletions src/backends/cpu/type/type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ static void vec_dot_fp32_local(const int n, float *__restrict s, const float *__
vec_dot_fp32_arm(n, s, vx, vy);
#endif
}

/*
void vec_dot_q8_0_q8_0(int n, float * __restrict s, const void * __restrict vx, const void * __restrict vy) {
const int qk = QK8_0;
const int nb = n / qk; // number of blocks
Expand All @@ -120,10 +120,10 @@ void vec_dot_q8_0_q8_0(int n, float * __restrict s, const void * __restrict vx,
assert(nb % 2 == 0); // TODO: handle odd nb
for (int i = 0; i < nb; i += 2) {
const block_q8_0 * restrict x0 = &x[i + 0];
const block_q8_0 * restrict x1 = &x[i + 1];
const block_q8_0 * restrict y0 = &y[i + 0];
const block_q8_0 * restrict y1 = &y[i + 1];
const block_q8_0 * x0 = &x[i + 0];
const block_q8_0 * x1 = &x[i + 1];
const block_q8_0 * y0 = &y[i + 0];
const block_q8_0 * y1 = &y[i + 1];
const int8x16_t x0_0 = vld1q_s8(x0->qs);
const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
Expand Down Expand Up @@ -205,7 +205,7 @@ void vec_dot_q8_0_q8_0(int n, float * __restrict s, const void * __restrict vx,
*s = sumf;
#endif
}

*/
void fp32_add_row_to(int n, const float * MLLM_RESTRICT src, float * MLLM_RESTRICT dst, float alpha){
int i = 0;
#ifdef __AVX2__
Expand Down Expand Up @@ -251,7 +251,7 @@ void fp_16_add_row_to(int n, const mllm_fp16_t * MLLM_RESTRICT src, float * MLLM
_mm256_storeu_ps(dst + i, res_vec); // store back to dst
}
#elif defined(__ARM_NEON)
ASSERT(false); // not support now
std::cout<<"not support now"<<std::endl;
#endif

// 处理剩余的元素
Expand Down Expand Up @@ -425,15 +425,15 @@ type_traits_t type_traits[] = {
{},
{},
{},
/*[MLLM_TYPE_Q8_0] = */{
.size = sizeof(block_q8_0),
.blck_size = QK8_0,
.to_float = (mllm_to_float_func) dequantize_row_q8_0,
.from_float = (mllm_from_float_func) quantize_row_q8_0,
.vec_dot = (mllm_vec_dot_func) vec_dot_q8_0_q8_0,
.vec_dot_type = MLLM_TYPE_Q8_0,
.add_row_to = (mllm_vec_add_row_func)q8_0_add_row_to,
},
// /*[MLLM_TYPE_Q8_0] = */{
// .size = sizeof(block_q8_0),
// .blck_size = QK8_0,
// .to_float = (mllm_to_float_func) dequantize_row_q8_0,
// .from_float = (mllm_from_float_func) quantize_row_q8_0,
// .vec_dot = (mllm_vec_dot_func) vec_dot_q8_0_q8_0,
// .vec_dot_type = MLLM_TYPE_Q8_0,
// .add_row_to = (mllm_vec_add_row_func)q8_0_add_row_to,
// },
/*[MLLM_TYPE_Q8_1] = */{},
{},
{},
Expand Down

0 comments on commit ec3360d

Please sign in to comment.