UbiquitousLearning · yirongjie · Nov 5, 2024 · Nov 5, 2024
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -137,7 +137,7 @@ else()
 endif()
     set(XNNPACK_BUILD_TESTS OFF)
     set(XNNPACK_BUILD_BENCHMARKS OFF)
-    add_definitions(-DMLLM_BUILD_XNNPACK_BACKEND)
+    add_definitions(-DMLLM_BUILD_XNNPACK_BACKEND=1)
     add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/xnnpack)
 endif()
 

diff --git a/examples/demo_qwen_xp.cpp b/examples/demo_qwen_xp.cpp
@@ -13,7 +13,7 @@
 #include "models/qwen/tokenization_qwen.hpp"
 #include "models/qwen/modeling_qwen_xp_sdpa.hpp"
 #include "backends/xnnpack/Utils/Logger.hpp"
-#include "xnnpack/XnnpackBackend.hpp"
+#include "backends/xnnpack/XnnpackBackend.hpp"
 
 using namespace mllm;
 
@@ -36,10 +36,13 @@ int main(int argc, char **argv) {
     int tokens_limit = cmdParser.get<int>("limits");
     mllm::xnnpack::XnnpackBackend::xnn_threads = cmdParser.get<int>("thread");
 
+    Layer::use_layername_2_tensorname = false;
+    mllm::xnnpack::XnnpackBackend::enable_dynamic_shape = false;
+    mllm::xnnpack::XnnpackBackend::enable_legacy_wrapper = false;
+
     auto tokenizer = QWenTokenizer(vocab_path, merge_path);
     QWenConfig config(tokens_limit, model_billion, RoPEType::HFHUBROPE);
     auto model = QWenForCausalLM(config);
-    model.to(BackendType::MLLM_XNNPACK);
     model.load(model_path);
 
     vector<string> in_strs = {
@@ -49,7 +52,7 @@ int main(int argc, char **argv) {
     };
     for (const auto &in_str : in_strs) {
         auto input_str = tokenizer.apply_chat_template(in_str);
-        auto input_tensor = tokenizer.tokenize(input_str, "name", MLLM_XNNPACK);
+        auto input_tensor = tokenizer.tokenize(input_str, "name", MLLM_CPU);
         std::cout << "[Q] " << in_str << std::endl;
         std::cout << "[A] " << std::flush;
 

diff --git a/src/Backend.hpp b/src/Backend.hpp
@@ -58,10 +58,10 @@ class Backend {
     virtual Op *opCreate(const OpParam &op_param, string name = "", int threadCount = 4) = 0;
     virtual TensorFunction *funcCreate(TensorFuncType type) = 0;
 
-    virtual void onSetUpStart(vector<shared_ptr<Tensor>> &inputs, vector<shared_ptr<Tensor>> &outputs, string graphName = ""){};
-    virtual void onSetUpEnd(vector<shared_ptr<Tensor>> &inputs, vector<shared_ptr<Tensor>> &outputs, string graphName = ""){};
-    virtual void onExecuteStart(vector<shared_ptr<Tensor>> &inputs, vector<shared_ptr<Tensor>> &outputs, string graphName = ""){};
-    virtual void onExecuteEnd(){};
+    virtual void onSetUpStart(vector<shared_ptr<Tensor>> &inputs, vector<shared_ptr<Tensor>> &outputs, string graphName = "") {};
+    virtual void onSetUpEnd(vector<shared_ptr<Tensor>> &inputs, vector<shared_ptr<Tensor>> &outputs, string graphName = "") {};
+    virtual void onExecuteStart(vector<shared_ptr<Tensor>> &inputs, vector<shared_ptr<Tensor>> &outputs, string graphName = "") {};
+    virtual void onExecuteEnd(std::vector<std::shared_ptr<Tensor>> &outputs, const string &graph_name = "") {};
 
     /**
      * \brief Registers all the operations supported by the backend.
@@ -85,7 +85,7 @@ class Backend {
  */
 class BackendCreator {
 public:
-    virtual Backend* create(BackendConfig config) = 0;
+    virtual Backend *create(BackendConfig config) = 0;
 };
 
 /**

diff --git a/src/Graph.cpp b/src/Graph.cpp
@@ -18,7 +18,6 @@ std::string intToStringWithLeadingZero(int num) {
 
 namespace mllm {
 
-
 Graph::Graph(const NetParameter &param, Backend *bn,
              unordered_map<string, shared_ptr<Tensor>> &external_tensors,
              int threadCount) {
@@ -214,7 +213,8 @@ const vector<shared_ptr<Tensor>> &Graph::forward(bool autofree) {
         }
     }
     // backend event hook
-    this->backend_->onExecuteEnd();
+    auto &_ = ops_output_tensors_[op_names_[op_names_.size() - 1]];
+    this->backend_->onExecuteEnd(_, "");
     return ops_output_tensors_[op_names_[op_names_.size() - 1]];
 }
 

diff --git a/src/Layer.hpp b/src/Layer.hpp
@@ -537,16 +537,25 @@ class KVCache final : public Layer {
     explicit KVCache(int cache_max, std::string name) {
         param_["n_rep"] = 1;
         param_["cache_max"] = cache_max;
+        param_["for_xnn"] = false;
         init(std::move(name), OpType::KVCACHE);
     }
     explicit KVCache(int n_rep, int cache_max, std::string name) {
         param_["n_rep"] = n_rep;
         param_["cache_max"] = cache_max;
+        param_["for_xnn"] = false;
+        init(std::move(name), OpType::KVCACHE);
+    }
+    explicit KVCache(int n_rep, int cache_max, bool for_xnn, std::string name) {
+        param_["n_rep"] = n_rep;
+        param_["cache_max"] = cache_max;
+        param_["for_xnn"] = for_xnn;
         init(std::move(name), OpType::KVCACHE);
     }
     explicit KVCache(int n_rep, int cache_max, std::string name, bool npuEnbaled) {
         param_["n_rep"] = n_rep;
         param_["cache_max"] = cache_max;
+        param_["for_xnn"] = false;
         if (npuEnbaled) {
             init(std::move(name), OpType::KVCACHENPU);
         } else {

diff --git a/src/Module.hpp b/src/Module.hpp
@@ -226,11 +226,23 @@ class Module {
                     return oss.str();
                 };
                 Backend::global_backends[device_]->onSetUpStart(inputs_vec, outputs_vec, getUinqueName());
+
+                // for xnnpack currently
+                for (auto &i : inputs) {
+                    i.uuid() = inputs[0].module()->activation_tensors[i.name()]->uuid();
+                }
+
                 auto outputs = Forward(inputs, anyArgs);
                 for (auto &output : outputs) {
                     outputs_vec.push_back(inputs[0].module()->activation_tensors[output.name()]);
                 }
                 Backend::global_backends[device_]->onSetUpEnd(inputs_vec, outputs_vec, getUinqueName());
+
+                // for xnnpack currently
+                for (auto &o : outputs) {
+                    o.uuid() = outputs[0].module()->activation_tensors[o.name()]->uuid();
+                }
+
                 return outputs;
             } else if (Tensor::tensor_status == TENSOR_STATIC_READY && device_ != MLLM_CPU) { // backend specific module execute
                 auto inputs_vec = vector<shared_ptr<Tensor>>();
@@ -244,8 +256,21 @@ class Module {
                     return oss.str();
                 };
                 Backend::global_backends[device_]->onExecuteStart(inputs_vec, outputs_vec, getUinqueName());
+
                 auto outputs = Forward(inputs, anyArgs);
-                Backend::global_backends[device_]->onExecuteEnd();
+
+                for (auto &output : outputs) {
+                    outputs_vec.push_back(inputs[0].module()->activation_tensors[output.name()]);
+                }
+
+                Backend::global_backends[device_]->onExecuteEnd(outputs_vec, getUinqueName());
+
+                // for xnnpack currently
+                for (auto &o : outputs) {
+                    o.uuid() = outputs[0].module()->activation_tensors[o.name()]->uuid();
+                    o.forceResetHostPointer(outputs[0].module()->activation_tensors[o.name()]->rawHostPtr());
+                }
+
                 return outputs;
             }
             return Forward(inputs, anyArgs);

diff --git a/src/Tensor.cpp b/src/Tensor.cpp
@@ -82,9 +82,9 @@ void Tensor::alloc() {
             // AVX 128 should be 16B
             // AVX 256 should be 32B
 #if defined(__ARM_NEON) && defined(__aarch64__)
-            backend_->alloc(&host_ptr_, cntSize(), 16);
+            backend_->alloc(&host_ptr_, cntSize() + 16, 128);
 #else
-            backend_->alloc(&host_ptr_, cntSize(), 32);
+            backend_->alloc(&host_ptr_, cntSize() + 16, 128);
 #endif
         }
         allocated_ = count_;
@@ -383,6 +383,16 @@ Tensor &Tensor::to(BackendType backend_type) {
     if (backend_type == MLLM_QNN && device() == MLLM_CPU) {
         this->free();
     }
+    if (backend_type == MLLM_CPU && device() == MLLM_XNNPACK) {
+        module()->activation_tensors[name()]->setBackend(Backend::global_backends[backend_type]);
+        this->setBackend(Backend::global_backends[backend_type]);
+        return *this;
+    }
+    if (backend_type == MLLM_XNNPACK && device() == MLLM_CPU) {
+        module()->activation_tensors[name()]->setBackend(Backend::global_backends[backend_type]);
+        this->setBackend(Backend::global_backends[backend_type]);
+        return *this;
+    }
     module()->activation_tensors[name()]->setBackend(Backend::global_backends[backend_type]);
     this->alloc();
     return *this;

diff --git a/src/backends/cpu/CPUBackend.cpp b/src/backends/cpu/CPUBackend.cpp
@@ -55,13 +55,14 @@
 #include "CPUMergeOutput.hpp"
 #include "CPULinearINT8Shadow.hpp"
 #include "CPUIRoPE.hpp"
+#include "CPUKVCacheXp.hpp"
 
 #include "CPUTensorFunction.hpp"
 #include "CPUPosition.hpp"
 
 namespace mllm {
 class CPUBackendCreator : public BackendCreator {
-    Backend* create(BackendConfig config) {
+    Backend *create(BackendConfig config) {
         shared_ptr<MemoryManager> mm = nullptr;
         switch (config.memory) {
         case BackendConfig::Memory_High:
@@ -146,6 +147,7 @@ void CPUBackend::registerOps() {
     addCreator(SPLITINPUT, (CPUBackend::Creator *)(new CPUSplitInputCreator()));
     addCreator(LINEARINT8SHADOW, (CPUBackend::Creator *)(new CPULinearINT8ShadowCreator()));
     addCreator(IROPE, (CPUBackend::Creator *)(new CPUIRoPECreator()));
+    addCreator(XP_KVCACHE, (CPUBackend::Creator *)(new CPUKVCacheXpCreator()));
 }
 TensorFunction *CPUBackend::funcCreate(const TensorFuncType type) {
     auto iter = map_function_.find(type);

diff --git a/src/backends/cpu/CPUKVCache.cpp b/src/backends/cpu/CPUKVCache.cpp
@@ -29,20 +29,28 @@ ErrorCode CPUKVCache::reshape(vector<shared_ptr<Tensor>> inputs,
     assert(inputs.size() == 1);
     assert(outputs.size() == 1);
     if (cache_seq_len_ < 0) {
+        if (for_xnn_) cache_.setDtype(MLLM_TYPE_F32);
+
         cache_.reshape(inputs[0]->batch(), inputs[0]->head() * n_rep_, cache_limit_,
                        inputs[0]->dimension());
         cache_.setName(name() + ".Cache");
         cache_.alloc();
-#ifdef KVCache_TYPE_16
-        memset(cache_.hostPtr<mllm_fp16_t>(), 0, cache_.count() * sizeof(mllm_fp16_t));
-#else
-        memset(cache_.hostPtr<float>(), 0, cache_.count() * sizeof(float));
-#endif
+
+        switch (cache_.dtype()) {
+        case MLLM_TYPE_F32:
+            memset(cache_.hostPtr<float>(), 0, cache_.count() * sizeof(float));
+            break;
+        case MLLM_TYPE_F16:
+            memset(cache_.hostPtr<mllm_fp16_t>(), 0, cache_.count() * sizeof(mllm_fp16_t));
+            break;
+        default:
+            break;
+        };
         cache_seq_len_ = 0;
     }
     int sequence = inputs[0]->sequence() + cache_seq_len_;
 #ifdef LLAMAFILE_SGEMM
-    if (sequence % n_pack != 0) sequence = ((sequence + (n_pack - 1)) / n_pack) * n_pack;
+    if (!for_xnn_ && sequence % n_pack != 0) sequence = ((sequence + (n_pack - 1)) / n_pack) * n_pack;
 #endif
     outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head() * n_rep_, sequence,
                         inputs[0]->dimension());

diff --git a/src/backends/cpu/CPUKVCache.hpp b/src/backends/cpu/CPUKVCache.hpp
@@ -20,11 +20,15 @@ class CPUKVCache final : public Op {
 
     Tensor cache_;
 
-    int getCacheSeqLen() override{
+    int getCacheSeqLen() override {
         return cache_seq_len_;
     }
-    void clearCache() override{
-        cache_seq_len_ = 0 ;
+    void clearCache() override {
+        cache_seq_len_ = 0;
+    }
+
+    void setForXnn(bool for_xnn) {
+        for_xnn_ = for_xnn;
     }
 
 private:
@@ -33,6 +37,7 @@ class CPUKVCache final : public Op {
     int cache_seq_len_ = -999;
     int n_rep_ = 1;
 
+    bool for_xnn_ = false;
     int cache_limit_;
 };
 
@@ -41,7 +46,10 @@ class CPUKVCacheCreator : public CPUBackend::Creator {
     virtual Op *create(OpParam op_param, Backend *bn, string name, int threadCount) const {
         int n_rep = (int)op_param["n_rep"];
         int cache_max = (int)op_param["cache_max"];
-        return new CPUKVCache(bn, name, n_rep, cache_max, threadCount);
+        bool for_xnn = (bool)op_param["for_xnn"];
+        auto ret = new CPUKVCache(bn, name, n_rep, cache_max, threadCount);
+        ret->setForXnn(for_xnn);
+        return ret;
     }
 };
 

diff --git a/src/backends/cpu/CPUKVCacheXp.cpp b/src/backends/cpu/CPUKVCacheXp.cpp
@@ -0,0 +1,73 @@
+#include "backends/cpu/CPUKVCacheXp.hpp"
+#include "Types.hpp"
+
+namespace mllm {
+
+CPUKVCacheXp::CPUKVCacheXp(Backend *bn, const string &op_name, int n_rep, int cache_max, int thread_count) :
+    Op(bn, op_name), n_rep_(n_rep), cache_limit_(cache_max), thread_count_(thread_count) {
+    cache_.setBackend(bn);
+    cache_.setDtype(MLLM_TYPE_F32);
+}
+
+ErrorCode CPUKVCacheXp::reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
+    assert(inputs.size() == 1);
+    assert(outputs.size() == 1);
+
+    if (cache_seq_len_ < 0) {
+        cache_.reshape(inputs[0]->batch(), inputs[0]->head() * n_rep_, cache_limit_, inputs[0]->dimension());
+        cache_.setName(name() + ".Cache");
+        cache_.alloc();
+        memset(cache_.hostPtr<float>(), 0, cache_.count() * sizeof(float));
+        cache_seq_len_ = 0;
+    }
+
+    int sequence = inputs[0]->sequence() + cache_seq_len_;
+    outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head() * n_rep_, sequence, inputs[0]->dimension());
+
+    if (sequence > cache_limit_) {
+        std::cerr << "\n[ERROR]: Current tokens exceed cache limit: " << sequence << ">"
+                  << cache_limit_ << ";";
+        std::cerr << "\n         Please set args `--limits` >" << cache_limit_ << std::endl;
+        exit(-1);
+    }
+    return Op::reshape(inputs, outputs);
+}
+
+ErrorCode CPUKVCacheXp::load(AbstructLoader &loader) {
+    return Op::load(loader);
+}
+
+ErrorCode CPUKVCacheXp::execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
+    int cache_seq_len_old = cache_seq_len_;
+    cache_seq_len_ += inputs[0]->sequence();
+
+    // copy input to cache
+    for (int b = 0; b < cache_.batch(); ++b) {
+        for (int h = 0; h < cache_.head(); ++h) {
+#pragma omp parallel for collapse(2) num_threads(thread_count_)
+            for (int seq = cache_seq_len_old; seq < cache_seq_len_; ++seq) {
+                for (int i_rep = 0; i_rep < n_rep_; ++i_rep) {
+                    auto cache_head = h * n_rep_ + i_rep;
+                    auto src_ptr = inputs[0]->ptrAt<float>(b, h, seq - cache_seq_len_old, 0);
+                    auto dst_ptr = cache_.ptrAt<float>(b, cache_head, seq, 0);
+                    int copy_size = cache_.dimension();
+                    memcpy(dst_ptr, src_ptr, copy_size * sizeof(float));
+                }
+            }
+        }
+    }
+
+    // copy cache to output
+    memcpy(outputs[0]->rawHostPtr(), cache_.rawHostPtr(), outputs[0]->count() * sizeof(float));
+
+    return MLLM_NO_ERROR;
+}
+
+ErrorCode CPUKVCacheXp::free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
+    return Op::free(inputs, outputs);
+}
+
+ErrorCode CPUKVCacheXp::setUp(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
+    return Op::setUp(inputs, outputs);
+}
+} // namespace mllm