Skip to content

Commit

Permalink
Merge pull request #106 from yirongjie/main
Browse files Browse the repository at this point in the history
feat: add DEBUGSAVETENSOR & DEBUGOPTIME
  • Loading branch information
UbiquitousLearning authored Jul 31, 2024
2 parents c9862f3 + af4d7e2 commit a5ffe1a
Show file tree
Hide file tree
Showing 8 changed files with 188 additions and 46 deletions.
5 changes: 3 additions & 2 deletions examples/demo_elastic_llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ using namespace mllm;
int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama_vocab.mllm");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_k.mllm");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_0_4_4.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);
Expand Down Expand Up @@ -44,7 +44,8 @@ int main(int argc, char **argv) {
// vector<vector<int>> activate_dims = {{32*8,256}};
// 32*8 is attn_head*attn_hidden_dim(e.g. llama:32*128); 256 is ffn_hidden_dim(e.g. llama:11008)
vector<vector<int>> activate_dims = {
{-1,-1}, //0
// {(int)(32*128*0.5),(int)(11008*0.5)}, //0
{-1,-1}, //0
{-1,-1}, //1
{-1,-1}, //2
{-1,-1}, //3
Expand Down
5 changes: 3 additions & 2 deletions examples/demo_llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ using namespace mllm;
int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama_vocab.mllm");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_k.mllm");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_0_4_4.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);
Expand All @@ -32,7 +32,8 @@ int main(int argc, char **argv) {
vector<string> in_strs = {
" Hello, who are you?",
" What can you do?",
"Please introduce Beijing University of Posts and Telecommunications."};
"Please introduce Beijing University of Posts and Telecommunications."
};

for (int i = 0; i < in_strs.size(); ++i) {
auto in_str = in_strs[i];
Expand Down
7 changes: 5 additions & 2 deletions include/Types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@ using std::map;
typedef map<std::string, float> OpParam;


inline bool saveNDataFlag = false;
// #define DEBUGSAVETENSOR
// #define DEBUGOPTIME


#define LLAMAFILE_SGEMM

typedef enum {
MLLM_CPU,
Expand Down Expand Up @@ -151,7 +155,6 @@ enum RoPEType {
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#define LLAMAFILE_SGEMM

#if defined(__ARM_NEON) && !defined(_MSC_VER)
typedef __fp16 mllm_fp16_t;
Expand Down
78 changes: 60 additions & 18 deletions src/Layer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,9 @@ class Layer {
}
string layer_next_name = "out-" + op_->name();
auto next_name = layername_2_tensorname[layer_next_name];
#ifdef DEBUGOPTIME
auto start_t = mllm_time_us();
#endif
switch (Tensor::tensor_status) {
case TENSOR_STATIC_INIT: {
op_->reshape({Tensor::graphs[input.name()]}, {Tensor::graphs[next_name]});
Expand All @@ -207,9 +210,13 @@ class Layer {
break;
}
}
if(saveNDataFlag){
Tensor::graphs[next_name]->saveNData<float>(layer_next_name);
}
#ifdef DEBUGOPTIME
auto end_t = mllm_time_us();
std::cout<<op_->name() << " | "<<Tensor::tensor_status<<" time: " << (end_t - start_t)/1000.0F <<"ms"<< std::endl;
#endif
#ifdef DEBUGSAVETENSOR
Tensor::graphs[next_name]->saveNData<float>(layer_next_name);
#endif
return *Tensor::graphs[next_name];
}
Tensor &_2I1O_OP(Tensor &input0, Tensor &input1) {
Expand Down Expand Up @@ -239,6 +246,9 @@ class Layer {
}
string layer_next_name = "out-" + op_->name();
auto next_name = layername_2_tensorname[layer_next_name];
#ifdef DEBUGOPTIME
auto start_t = mllm_time_us();
#endif
switch (Tensor::tensor_status) {
case TENSOR_STATIC_INIT: {
op_->reshape({Tensor::graphs[input0.name()], Tensor::graphs[input1.name()]}, {Tensor::graphs[next_name]});
Expand All @@ -253,9 +263,13 @@ class Layer {
break;
}
}
if(saveNDataFlag){
Tensor::graphs[next_name]->saveNData<float>(layer_next_name);
}
#ifdef DEBUGOPTIME
auto end_t = mllm_time_us();
std::cout<<op_->name() << " | "<<Tensor::tensor_status<<" time: " << (end_t - start_t)/1000.0F <<"ms"<< std::endl;
#endif
#ifdef DEBUGSAVETENSOR
Tensor::graphs[next_name]->saveNData<float>(layer_next_name);
#endif
return *Tensor::graphs[next_name];
}
Tensor &_3I1O_OP(Tensor &input0, Tensor &input1, Tensor &input2) {
Expand Down Expand Up @@ -289,6 +303,9 @@ class Layer {
}
string layer_next_name = "out-" + op_->name();
auto next_name = layername_2_tensorname[layer_next_name];
#ifdef DEBUGOPTIME
auto start_t = mllm_time_us();
#endif
switch (Tensor::tensor_status) {
case TENSOR_STATIC_INIT: {
op_->reshape({Tensor::graphs[input0.name()], Tensor::graphs[input1.name()], Tensor::graphs[input2.name()]},
Expand All @@ -305,10 +322,14 @@ class Layer {
default: {
break;
}
}
if(saveNDataFlag){
Tensor::graphs[next_name]->saveNData<float>(layer_next_name);
}
#ifdef DEBUGOPTIME
auto end_t = mllm_time_us();
std::cout<<op_->name() << " | "<<Tensor::tensor_status<<" time: " << (end_t - start_t)/1000.0F <<"ms"<< std::endl;
#endif
#ifdef DEBUGSAVETENSOR
Tensor::graphs[next_name]->saveNData<float>(layer_next_name);
#endif
return *Tensor::graphs[next_name];
}
Tensor &_3I1OO1_OP(Tensor &input0, Tensor &input1, Tensor &input2) {
Expand All @@ -334,6 +355,9 @@ class Layer {
}
string layer_next_name = "out-" + op_->name();
auto next_name = layername_2_tensorname[layer_next_name];
#ifdef DEBUGOPTIME
auto start_t = mllm_time_us();
#endif
switch (Tensor::tensor_status) {
case TENSOR_STATIC_INIT: {
op_->reshape({Tensor::graphs[input0.name()],
Expand All @@ -356,10 +380,14 @@ class Layer {
default: {
break;
}
}
if(saveNDataFlag){
Tensor::graphs[next_name]->saveNData<float>(layer_next_name);
}
#ifdef DEBUGOPTIME
auto end_t = mllm_time_us();
std::cout<<op_->name() << " | "<<Tensor::tensor_status<<" time: " << (end_t - start_t)/1000.0F <<"ms"<< std::endl;
#endif
#ifdef DEBUGSAVETENSOR
Tensor::graphs[next_name]->saveNData<float>(layer_next_name);
#endif
return *Tensor::graphs[next_name];
}
Tensor &_0I1O_OP() {
Expand All @@ -381,6 +409,9 @@ class Layer {
}
string layer_next_name = "param-" + op_->name();
auto next_name = layername_2_tensorname[layer_next_name];
#ifdef DEBUGOPTIME
auto start_t = mllm_time_us();
#endif
switch (Tensor::tensor_status) {
case TENSOR_STATIC_INIT: {
op_->reshape({}, {Tensor::graphs[next_name]});
Expand All @@ -395,9 +426,13 @@ class Layer {
break;
}
}
if(saveNDataFlag){
Tensor::graphs[next_name]->saveNData<float>(layer_next_name);
}
#ifdef DEBUGOPTIME
auto end_t = mllm_time_us();
std::cout<<op_->name() << " | "<<Tensor::tensor_status<<" time: " << (end_t - start_t)/1000.0F <<"ms"<< std::endl;
#endif
#ifdef DEBUGSAVETENSOR
Tensor::graphs[next_name]->saveNData<float>(layer_next_name);
#endif
return *Tensor::graphs[next_name];
}
vector<Tensor> _1INO_OP(Tensor &input, int N) {
Expand Down Expand Up @@ -442,6 +477,9 @@ class Layer {
next_names.push_back(next_name);
shared_outputs.push_back(Tensor::graphs[next_name]);
}
#ifdef DEBUGOPTIME
auto start_t = mllm_time_us();
#endif
switch (Tensor::tensor_status) {
case TENSOR_STATIC_INIT: {
op_->reshape({ Tensor::graphs[input.name()]}, shared_outputs);
Expand All @@ -456,12 +494,16 @@ class Layer {
break;
}
}
#ifdef DEBUGOPTIME
auto end_t = mllm_time_us();
std::cout<<op_->name() << " | "<<Tensor::tensor_status<<" time: " << (end_t - start_t)/1000.0F <<"ms"<< std::endl;
#endif
vector<Tensor> output_result = {};
for (const auto &layer_next_name : layer_next_names) {
auto next_name = layername_2_tensorname[layer_next_name];
if(saveNDataFlag){
Tensor::graphs[next_name]->saveNData<float>(layer_next_name);
}
#ifdef DEBUGSAVETENSOR
Tensor::graphs[next_name]->saveNData<float>(layer_next_name);
#endif
output_result.push_back(*Tensor::graphs[next_name]);
}
return output_result;
Expand Down
6 changes: 6 additions & 0 deletions src/Module.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,12 @@ class Module {
// std::cout<<Tensor::forward_times<< " - "<<Tensor::forward_times_2<<" = "<<Tensor::forward_times-Tensor::forward_times_2<<std::endl;

std::cout << "===========================================" << std::endl;


prefilling_token_size_=0;
decoding_token_size_=0;
inference_times_.clear();
last_shape_bshd_.clear();
}
};

Expand Down
44 changes: 33 additions & 11 deletions src/Tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include <express/ExpressBase.hpp>
#include "OpDefined.hpp"
#include "Timing.hpp"
#include "Types.hpp"
#include "backends/cpu/CPUTensorFunction.hpp"

Expand Down Expand Up @@ -92,21 +93,28 @@ Tensor& Tensor::getFunc(const std::string& suffix, const TensorFuncType type, ve
for (auto &other_tensor : other_tensors) {
tensorPtrs.push_back(other_tensor);
}
#ifdef DEBUGOPTIME
auto start_t = mllm_time_us();
#endif
switch (Tensor::tensor_status) {
case TENSOR_STATIC_INIT: {
func->setup({Tensor::graphs[next_name].get()}, tensorPtrs, float_args);
break;
}
case TENSOR_STATIC_READY: {
func->execute({Tensor::graphs[next_name].get()},tensorPtrs, float_args);
if(saveNDataFlag){
Tensor::graphs[next_name]->saveData<float>();
}
break;
}
default: {
}
}
#ifdef DEBUGOPTIME
auto end_t = mllm_time_us();
std::cout<<next_name << " | "<<Tensor::tensor_status<<" time: " << (end_t - start_t)/1000.0F <<"ms"<< std::endl;
#endif
#ifdef DEBUGSAVETENSOR
Tensor::graphs[next_name]->saveNData<float>();
#endif
return *Tensor::graphs[next_name];
}

Expand Down Expand Up @@ -238,21 +246,28 @@ Tensor& Tensor::getStaticFunc(const std::string& suffix, const TensorFuncType ty
if (Module::doLoad) {
return *Tensor::graphs[next_name];
}
#ifdef DEBUGOPTIME
auto start_t = mllm_time_us();
#endif
switch (Tensor::tensor_status) {
case TENSOR_STATIC_INIT: {
func->setup({Tensor::graphs[next_name].get()}, other_tensors, float_args);
break;
}
case TENSOR_STATIC_READY: {
func->execute({Tensor::graphs[next_name].get()}, other_tensors, float_args);
if(saveNDataFlag){
Tensor::graphs[next_name]->saveData<float>();
}
break;
}
default: {
}
}
#ifdef DEBUGOPTIME
auto end_t = mllm_time_us();
std::cout<<next_name << " | "<<Tensor::tensor_status<<" time: " << (end_t - start_t)/1000.0F <<"ms"<< std::endl;
#endif
#ifdef DEBUGSAVETENSOR
Tensor::graphs[next_name]->saveNData<float>();
#endif
return *Tensor::graphs[next_name];
}

Expand Down Expand Up @@ -299,23 +314,30 @@ std::vector<Tensor> Tensor::getStaticFuncOupts(vector<std::string> out_names, co
for (auto out_name: out_names) {
outPtrs.push_back(Tensor::graphs[out_name].get());
}
#ifdef DEBUGOPTIME
auto start_t = mllm_time_us();
#endif
switch (Tensor::tensor_status) {
case TENSOR_STATIC_INIT: {
func->setup(outPtrs, input_tensors, float_args);
break;
}
case TENSOR_STATIC_READY: {
func->execute(outPtrs, input_tensors, float_args);
if(saveNDataFlag){
for (auto out_name: out_names) {
Tensor::graphs[out_name]->saveData<float>();
}
}
break;
}
default: {
}
}
#ifdef DEBUGOPTIME
auto end_t = mllm_time_us();
std::cout<<out_names[0] << " | "<<Tensor::tensor_status<<" time: " << (end_t - start_t)/1000.0F <<"ms"<< std::endl;
#endif
#ifdef DEBUGSAVETENSOR
for (auto out_name: out_names) {
Tensor::graphs[out_name]->saveNData<float>();
}
#endif
std::vector<Tensor> results;
for (auto out_name: out_names) {
results.push_back(*Tensor::graphs[out_name]);
Expand Down
1 change: 1 addition & 0 deletions src/backends/cpu/CPUEmbedding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ ErrorCode CPUEmbedding::execute(vector<shared_ptr<Tensor>> inputs, vector<shared
case MLLM_TYPE_I16: break;
case MLLM_TYPE_I32: break;
case MLLM_TYPE_COUNT: break;
default: break;
}
return MLLM_NO_ERROR;
}
Expand Down
Loading

0 comments on commit a5ffe1a

Please sign in to comment.