Merge pull request #177 from chenghuaWang/main
feat: drop xnn wrapper and move xnnwrapper to new front-end
yirongjie authored Nov 5, 2024
2 parents 2fe197c + 7cbf442 commit 6ef3c5a
Showing 54 changed files with 1,150 additions and 347 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -137,7 +137,7 @@ else()
endif()
set(XNNPACK_BUILD_TESTS OFF)
set(XNNPACK_BUILD_BENCHMARKS OFF)
add_definitions(-DMLLM_BUILD_XNNPACK_BACKEND)
add_definitions(-DMLLM_BUILD_XNNPACK_BACKEND=1)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/xnnpack)
endif()

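Switching the definition from a bare flag to an explicit =1 pins the macro's value, so value-style preprocessor checks behave the same across toolchains instead of relying on the compiler's default of defining bare -D flags to 1. A minimal sketch of the two check styles this supports (the guard name is from this diff; the bodies are placeholders):

// Presence check: true whenever the macro is defined at all.
#if defined(MLLM_BUILD_XNNPACK_BACKEND)
// ... XNNPACK-only declarations ...
#endif

// Value check: true only when the macro expands to a non-zero value,
// which add_definitions(-DMLLM_BUILD_XNNPACK_BACKEND=1) now states explicitly.
#if MLLM_BUILD_XNNPACK_BACKEND
// ... XNNPACK-only code paths ...
#endif
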
9 changes: 6 additions & 3 deletions examples/demo_qwen_xp.cpp
@@ -13,7 +13,7 @@
#include "models/qwen/tokenization_qwen.hpp"
#include "models/qwen/modeling_qwen_xp_sdpa.hpp"
#include "backends/xnnpack/Utils/Logger.hpp"
#include "xnnpack/XnnpackBackend.hpp"
#include "backends/xnnpack/XnnpackBackend.hpp"

using namespace mllm;

@@ -36,10 +36,13 @@ int main(int argc, char **argv) {
int tokens_limit = cmdParser.get<int>("limits");
mllm::xnnpack::XnnpackBackend::xnn_threads = cmdParser.get<int>("thread");

Layer::use_layername_2_tensorname = false;
mllm::xnnpack::XnnpackBackend::enable_dynamic_shape = false;
mllm::xnnpack::XnnpackBackend::enable_legacy_wrapper = false;

auto tokenizer = QWenTokenizer(vocab_path, merge_path);
QWenConfig config(tokens_limit, model_billion, RoPEType::HFHUBROPE);
auto model = QWenForCausalLM(config);
model.to(BackendType::MLLM_XNNPACK);
model.load(model_path);

vector<string> in_strs = {
@@ -49,7 +52,7 @@
};
for (const auto &in_str : in_strs) {
auto input_str = tokenizer.apply_chat_template(in_str);
auto input_tensor = tokenizer.tokenize(input_str, "name", MLLM_XNNPACK);
auto input_tensor = tokenizer.tokenize(input_str, "name", MLLM_CPU);
std::cout << "[Q] " << in_str << std::endl;
std::cout << "[A] " << std::flush;

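Read together, the hunks above change the demo's setup in one consistent way: configure the XNNPACK backend statics first, build and load the model on the XNNPACK backend, and keep tokenization on the CPU backend. A condensed sketch of that order (the thread count value is illustrative, not taken from the diff):

// Backend configuration before model construction.
Layer::use_layername_2_tensorname = false;
mllm::xnnpack::XnnpackBackend::xnn_threads = 4;               // illustrative value
mllm::xnnpack::XnnpackBackend::enable_dynamic_shape = false;
mllm::xnnpack::XnnpackBackend::enable_legacy_wrapper = false;

// The model runs on XNNPACK; inputs are produced on CPU and handed over.
auto model = QWenForCausalLM(config);
model.to(BackendType::MLLM_XNNPACK);
model.load(model_path);
auto input_tensor = tokenizer.tokenize(input_str, "name", MLLM_CPU);
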
10 changes: 5 additions & 5 deletions src/Backend.hpp
@@ -58,10 +58,10 @@ class Backend {
virtual Op *opCreate(const OpParam &op_param, string name = "", int threadCount = 4) = 0;
virtual TensorFunction *funcCreate(TensorFuncType type) = 0;

virtual void onSetUpStart(vector<shared_ptr<Tensor>> &inputs, vector<shared_ptr<Tensor>> &outputs, string graphName = ""){};
virtual void onSetUpEnd(vector<shared_ptr<Tensor>> &inputs, vector<shared_ptr<Tensor>> &outputs, string graphName = ""){};
virtual void onExecuteStart(vector<shared_ptr<Tensor>> &inputs, vector<shared_ptr<Tensor>> &outputs, string graphName = ""){};
virtual void onExecuteEnd(){};
virtual void onSetUpStart(vector<shared_ptr<Tensor>> &inputs, vector<shared_ptr<Tensor>> &outputs, string graphName = "") {};
virtual void onSetUpEnd(vector<shared_ptr<Tensor>> &inputs, vector<shared_ptr<Tensor>> &outputs, string graphName = "") {};
virtual void onExecuteStart(vector<shared_ptr<Tensor>> &inputs, vector<shared_ptr<Tensor>> &outputs, string graphName = "") {};
virtual void onExecuteEnd(std::vector<std::shared_ptr<Tensor>> &outputs, const string &graph_name = "") {};

/**
* \brief Registers all the operations supported by the backend.
@@ -85,7 +85,7 @@
*/
class BackendCreator {
public:
virtual Backend* create(BackendConfig config) = 0;
virtual Backend *create(BackendConfig config) = 0;
};

/**
4 changes: 2 additions & 2 deletions src/Graph.cpp
@@ -18,7 +18,6 @@ std::string intToStringWithLeadingZero(int num) {

namespace mllm {


Graph::Graph(const NetParameter &param, Backend *bn,
unordered_map<string, shared_ptr<Tensor>> &external_tensors,
int threadCount) {
@@ -214,7 +213,8 @@ const vector<shared_ptr<Tensor>> &Graph::forward(bool autofree) {
}
}
// backend event hook
this->backend_->onExecuteEnd();
auto &_ = ops_output_tensors_[op_names_[op_names_.size() - 1]];
this->backend_->onExecuteEnd(_, "");
return ops_output_tensors_[op_names_[op_names_.size() - 1]];
}

9 changes: 9 additions & 0 deletions src/Layer.hpp
@@ -537,16 +537,25 @@ class KVCache final : public Layer {
explicit KVCache(int cache_max, std::string name) {
param_["n_rep"] = 1;
param_["cache_max"] = cache_max;
param_["for_xnn"] = false;
init(std::move(name), OpType::KVCACHE);
}
explicit KVCache(int n_rep, int cache_max, std::string name) {
param_["n_rep"] = n_rep;
param_["cache_max"] = cache_max;
param_["for_xnn"] = false;
init(std::move(name), OpType::KVCACHE);
}
explicit KVCache(int n_rep, int cache_max, bool for_xnn, std::string name) {
param_["n_rep"] = n_rep;
param_["cache_max"] = cache_max;
param_["for_xnn"] = for_xnn;
init(std::move(name), OpType::KVCACHE);
}
explicit KVCache(int n_rep, int cache_max, std::string name, bool npuEnbaled) {
param_["n_rep"] = n_rep;
param_["cache_max"] = cache_max;
param_["for_xnn"] = false;
if (npuEnbaled) {
init(std::move(name), OpType::KVCACHENPU);
} else {
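The new overload lets a model definition opt a cache into the XNNPACK path at construction time. A hypothetical call site (the layer name and sizes are made up for illustration):

// Passing for_xnn stores param_["for_xnn"] = true, which the CPUKVCacheCreator
// change further down reads back and forwards via CPUKVCache::setForXnn(),
// switching the cache tensor to fp32.
auto k_cache = KVCache(/*n_rep=*/1, /*cache_max=*/1024, /*for_xnn=*/true,
                       "model.layers.0.self_attn.k_cache");
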
27 changes: 26 additions & 1 deletion src/Module.hpp
@@ -226,11 +226,23 @@ class Module {
return oss.str();
};
Backend::global_backends[device_]->onSetUpStart(inputs_vec, outputs_vec, getUinqueName());

// for xnnpack currently
for (auto &i : inputs) {
i.uuid() = inputs[0].module()->activation_tensors[i.name()]->uuid();
}

auto outputs = Forward(inputs, anyArgs);
for (auto &output : outputs) {
outputs_vec.push_back(inputs[0].module()->activation_tensors[output.name()]);
}
Backend::global_backends[device_]->onSetUpEnd(inputs_vec, outputs_vec, getUinqueName());

// for xnnpack currently
for (auto &o : outputs) {
o.uuid() = outputs[0].module()->activation_tensors[o.name()]->uuid();
}

return outputs;
} else if (Tensor::tensor_status == TENSOR_STATIC_READY && device_ != MLLM_CPU) { // backend specific module execute
auto inputs_vec = vector<shared_ptr<Tensor>>();
@@ -244,8 +256,21 @@
return oss.str();
};
Backend::global_backends[device_]->onExecuteStart(inputs_vec, outputs_vec, getUinqueName());

auto outputs = Forward(inputs, anyArgs);
Backend::global_backends[device_]->onExecuteEnd();

for (auto &output : outputs) {
outputs_vec.push_back(inputs[0].module()->activation_tensors[output.name()]);
}

Backend::global_backends[device_]->onExecuteEnd(outputs_vec, getUinqueName());

// for xnnpack currently
for (auto &o : outputs) {
o.uuid() = outputs[0].module()->activation_tensors[o.name()]->uuid();
o.forceResetHostPointer(outputs[0].module()->activation_tensors[o.name()]->rawHostPtr());
}

return outputs;
}
return Forward(inputs, anyArgs);
14 changes: 12 additions & 2 deletions src/Tensor.cpp
@@ -82,9 +82,9 @@ void Tensor::alloc() {
// AVX 128 should be 16B
// AVX 256 should be 32B
#if defined(__ARM_NEON) && defined(__aarch64__)
backend_->alloc(&host_ptr_, cntSize(), 16);
backend_->alloc(&host_ptr_, cntSize() + 16, 128);
#else
backend_->alloc(&host_ptr_, cntSize(), 32);
backend_->alloc(&host_ptr_, cntSize() + 16, 128);
#endif
}
allocated_ = count_;
@@ -383,6 +383,16 @@ Tensor &Tensor::to(BackendType backend_type) {
if (backend_type == MLLM_QNN && device() == MLLM_CPU) {
this->free();
}
if (backend_type == MLLM_CPU && device() == MLLM_XNNPACK) {
module()->activation_tensors[name()]->setBackend(Backend::global_backends[backend_type]);
this->setBackend(Backend::global_backends[backend_type]);
return *this;
}
if (backend_type == MLLM_XNNPACK && device() == MLLM_CPU) {
module()->activation_tensors[name()]->setBackend(Backend::global_backends[backend_type]);
this->setBackend(Backend::global_backends[backend_type]);
return *this;
}
module()->activation_tensors[name()]->setBackend(Backend::global_backends[backend_type]);
this->alloc();
return *this;
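The two new branches make CPU<->XNNPACK moves a pure backend re-binding: they return before the free()/alloc() path, so the host buffer is left untouched. An illustrative call sequence (the tensor t is assumed to exist and to belong to a module):

// Same storage before and after; only the backend pointer (and the matching
// activation_tensors entry) changes.
Tensor &on_xnn = t.to(MLLM_XNNPACK);  // CPU -> XNNPACK, no reallocation
Tensor &on_cpu = on_xnn.to(MLLM_CPU); // XNNPACK -> CPU, still the same buffer
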
4 changes: 3 additions & 1 deletion src/backends/cpu/CPUBackend.cpp
@@ -55,13 +55,14 @@
#include "CPUMergeOutput.hpp"
#include "CPULinearINT8Shadow.hpp"
#include "CPUIRoPE.hpp"
#include "CPUKVCacheXp.hpp"

#include "CPUTensorFunction.hpp"
#include "CPUPosition.hpp"

namespace mllm {
class CPUBackendCreator : public BackendCreator {
Backend* create(BackendConfig config) {
Backend *create(BackendConfig config) {
shared_ptr<MemoryManager> mm = nullptr;
switch (config.memory) {
case BackendConfig::Memory_High:
@@ -146,6 +147,7 @@ void CPUBackend::registerOps() {
addCreator(SPLITINPUT, (CPUBackend::Creator *)(new CPUSplitInputCreator()));
addCreator(LINEARINT8SHADOW, (CPUBackend::Creator *)(new CPULinearINT8ShadowCreator()));
addCreator(IROPE, (CPUBackend::Creator *)(new CPUIRoPECreator()));
addCreator(XP_KVCACHE, (CPUBackend::Creator *)(new CPUKVCacheXpCreator()));
}
TensorFunction *CPUBackend::funcCreate(const TensorFuncType type) {
auto iter = map_function_.find(type);
20 changes: 14 additions & 6 deletions src/backends/cpu/CPUKVCache.cpp
@@ -29,20 +29,28 @@ ErrorCode CPUKVCache::reshape(vector<shared_ptr<Tensor>> inputs,
assert(inputs.size() == 1);
assert(outputs.size() == 1);
if (cache_seq_len_ < 0) {
if (for_xnn_) cache_.setDtype(MLLM_TYPE_F32);

cache_.reshape(inputs[0]->batch(), inputs[0]->head() * n_rep_, cache_limit_,
inputs[0]->dimension());
cache_.setName(name() + ".Cache");
cache_.alloc();
#ifdef KVCache_TYPE_16
memset(cache_.hostPtr<mllm_fp16_t>(), 0, cache_.count() * sizeof(mllm_fp16_t));
#else
memset(cache_.hostPtr<float>(), 0, cache_.count() * sizeof(float));
#endif

switch (cache_.dtype()) {
case MLLM_TYPE_F32:
memset(cache_.hostPtr<float>(), 0, cache_.count() * sizeof(float));
break;
case MLLM_TYPE_F16:
memset(cache_.hostPtr<mllm_fp16_t>(), 0, cache_.count() * sizeof(mllm_fp16_t));
break;
default:
break;
};
cache_seq_len_ = 0;
}
int sequence = inputs[0]->sequence() + cache_seq_len_;
#ifdef LLAMAFILE_SGEMM
if (sequence % n_pack != 0) sequence = ((sequence + (n_pack - 1)) / n_pack) * n_pack;
if (!for_xnn_ && sequence % n_pack != 0) sequence = ((sequence + (n_pack - 1)) / n_pack) * n_pack;
#endif
outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head() * n_rep_, sequence,
inputs[0]->dimension());
16 changes: 12 additions & 4 deletions src/backends/cpu/CPUKVCache.hpp
@@ -20,11 +20,15 @@ class CPUKVCache final : public Op {

Tensor cache_;

int getCacheSeqLen() override{
int getCacheSeqLen() override {
return cache_seq_len_;
}
void clearCache() override{
cache_seq_len_ = 0 ;
void clearCache() override {
cache_seq_len_ = 0;
}

void setForXnn(bool for_xnn) {
for_xnn_ = for_xnn;
}

private:
@@ -33,6 +37,7 @@ class CPUKVCache final : public Op {
int cache_seq_len_ = -999;
int n_rep_ = 1;

bool for_xnn_ = false;
int cache_limit_;
};

@@ -41,7 +46,10 @@ class CPUKVCacheCreator : public CPUBackend::Creator {
virtual Op *create(OpParam op_param, Backend *bn, string name, int threadCount) const {
int n_rep = (int)op_param["n_rep"];
int cache_max = (int)op_param["cache_max"];
return new CPUKVCache(bn, name, n_rep, cache_max, threadCount);
bool for_xnn = (bool)op_param["for_xnn"];
auto ret = new CPUKVCache(bn, name, n_rep, cache_max, threadCount);
ret->setForXnn(for_xnn);
return ret;
}
};

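For reference, this is roughly how the flag travels from the front-end layer to the CPU op. Everything below is illustrative: the backend pointer and layer name are made up, and OpParam is treated as the key/value map it is used as throughout this diff.

// Mirrors what Layer.hpp stores and what the creator above reads back.
OpParam op_param;
op_param["n_rep"] = 1;
op_param["cache_max"] = 1024;
op_param["for_xnn"] = true;  // read back as (bool)op_param["for_xnn"] in the creator

// cpu_backend: a CPUBackend* obtained elsewhere (assumed).
Op *kv = CPUKVCacheCreator().create(op_param, cpu_backend,
                                    "model.layers.0.self_attn.k_cache", /*threadCount=*/4);
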
73 changes: 73 additions & 0 deletions src/backends/cpu/CPUKVCacheXp.cpp
@@ -0,0 +1,73 @@
#include "backends/cpu/CPUKVCacheXp.hpp"
#include "Types.hpp"

namespace mllm {

CPUKVCacheXp::CPUKVCacheXp(Backend *bn, const string &op_name, int n_rep, int cache_max, int thread_count) :
Op(bn, op_name), n_rep_(n_rep), cache_limit_(cache_max), thread_count_(thread_count) {
cache_.setBackend(bn);
cache_.setDtype(MLLM_TYPE_F32);
}

ErrorCode CPUKVCacheXp::reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
assert(inputs.size() == 1);
assert(outputs.size() == 1);

if (cache_seq_len_ < 0) {
cache_.reshape(inputs[0]->batch(), inputs[0]->head() * n_rep_, cache_limit_, inputs[0]->dimension());
cache_.setName(name() + ".Cache");
cache_.alloc();
memset(cache_.hostPtr<float>(), 0, cache_.count() * sizeof(float));
cache_seq_len_ = 0;
}

int sequence = inputs[0]->sequence() + cache_seq_len_;
outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head() * n_rep_, sequence, inputs[0]->dimension());

if (sequence > cache_limit_) {
std::cerr << "\n[ERROR]: Current tokens exceed cache limit: " << sequence << ">"
<< cache_limit_ << ";";
std::cerr << "\n Please set args `--limits` >" << cache_limit_ << std::endl;
exit(-1);
}
return Op::reshape(inputs, outputs);
}

ErrorCode CPUKVCacheXp::load(AbstructLoader &loader) {
return Op::load(loader);
}

ErrorCode CPUKVCacheXp::execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
int cache_seq_len_old = cache_seq_len_;
cache_seq_len_ += inputs[0]->sequence();

// copy input to cache
for (int b = 0; b < cache_.batch(); ++b) {
for (int h = 0; h < cache_.head(); ++h) {
#pragma omp parallel for collapse(2) num_threads(thread_count_)
for (int seq = cache_seq_len_old; seq < cache_seq_len_; ++seq) {
for (int i_rep = 0; i_rep < n_rep_; ++i_rep) {
auto cache_head = h * n_rep_ + i_rep;
auto src_ptr = inputs[0]->ptrAt<float>(b, h, seq - cache_seq_len_old, 0);
auto dst_ptr = cache_.ptrAt<float>(b, cache_head, seq, 0);
int copy_size = cache_.dimension();
memcpy(dst_ptr, src_ptr, copy_size * sizeof(float));
}
}
}
}

// copy cache to output
memcpy(outputs[0]->rawHostPtr(), cache_.rawHostPtr(), outputs[0]->count() * sizeof(float));

return MLLM_NO_ERROR;
}

ErrorCode CPUKVCacheXp::free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
return Op::free(inputs, outputs);
}

ErrorCode CPUKVCacheXp::setUp(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
return Op::setUp(inputs, outputs);
}
} // namespace mllm
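The matching header and creator for this op are among the 54 changed files but are not shown in this excerpt. By analogy with CPUKVCacheCreator above and the XP_KVCACHE registration added in CPUBackend.cpp, the creator presumably looks something like the following sketch; treat every detail as an assumption rather than the actual code.

// Sketch only -- CPUKVCacheXp.hpp is not visible in this diff view.
class CPUKVCacheXpCreator : public CPUBackend::Creator {
public:
    virtual Op *create(OpParam op_param, Backend *bn, string name, int threadCount) const {
        int n_rep = (int)op_param["n_rep"];
        int cache_max = (int)op_param["cache_max"];
        // The Xp cache is always fp32, so no for_xnn flag is read here.
        return new CPUKVCacheXp(bn, name, n_rep, cache_max, threadCount);
    }
};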