
Xnnpack backend support #159

Merged · 69 commits · Oct 31, 2024

Commits
59cdc15
feat: XpOps, XpDirect.
chenghuaWang Oct 9, 2024
8fbd7df
feat: Xnnpack Add Example
chenghuaWang Oct 9, 2024
7024eab
feat: mllm frontend -> xnn static graph
chenghuaWang Oct 10, 2024
a3f271c
fix: Add Example Done.
chenghuaWang Oct 10, 2024
9c59cd0
feat: xnnpack wrap
chenghuaWang Oct 10, 2024
c4bad7b
fix: include path, update xnnpack to latest
chenghuaWang Oct 10, 2024
0a6f706
feat: xnn backend element wise op function
chenghuaWang Oct 11, 2024
f156f35
feat: xnn weight register and linear op
chenghuaWang Oct 11, 2024
efb3fa4
fix: XpLinear error with NoLoadWeightsDtype
chenghuaWang Oct 11, 2024
5781312
feat: xnnpack matmul rope
chenghuaWang Oct 14, 2024
72dfb7a
feat: fix redefine tensor in xnnpack bug
chenghuaWang Oct 15, 2024
4ad8d31
feat: add relu and rope bug fix
chenghuaWang Oct 15, 2024
ad3b148
feat: xnnpack GELU, Softmax, SiLU impl
chenghuaWang Oct 16, 2024
886eba7
feat: rms norm, tranpose
chenghuaWang Oct 16, 2024
b37ad92
feat: kvcache, still buggy, rfc.
chenghuaWang Oct 17, 2024
90164da
feat: update 3rd party packages
chenghuaWang Oct 17, 2024
b0883bb
feat: xp kvcache fix.
chenghuaWang Oct 18, 2024
05f167b
fix: github action main.yml
chenghuaWang Oct 18, 2024
69e80f9
feat: !!!SDPA!!! (Support B, H, S, D) layout
chenghuaWang Oct 20, 2024
a7471be
feat: XpSDPA torch impl for check mllm's correctness
chenghuaWang Oct 20, 2024
4a95180
fix: XpRoPE, add view func
chenghuaWang Oct 20, 2024
a34240f
fix: rope test example
chenghuaWang Oct 22, 2024
5b4bd71
fix: transpose xnn example bugs find
chenghuaWang Oct 22, 2024
314ca42
fix: xnnpack uuid register bug
chenghuaWang Oct 22, 2024
bea70c1
fix: xnnpack uuid register bug
chenghuaWang Oct 22, 2024
063f37f
fix: rope xnnpack test error
chenghuaWang Oct 23, 2024
3b75a25
feat: matmul xnnpack, failed at stl malloc.
chenghuaWang Oct 23, 2024
2945423
fix: xnnpack illegal memory r/w by using valgrind.
chenghuaWang Oct 23, 2024
d778974
fix: xnnpack attention impl bug
chenghuaWang Oct 24, 2024
dbb642b
feat: XpEmbedding op
chenghuaWang Oct 25, 2024
75a2850
feat: xp llama
chenghuaWang Oct 25, 2024
895e84e
fix: use_layername_2_tensorname = false;
chenghuaWang Oct 26, 2024
d0e31e9
change: llama xp example -> qwen xp example
chenghuaWang Oct 26, 2024
19ba8aa
Merge remote-tracking branch 'upstream/main'
chenghuaWang Oct 26, 2024
a4a8c70
fix: megre confict bugs
chenghuaWang Oct 26, 2024
bf278ef
fix: llm_model_ptr null error
chenghuaWang Oct 26, 2024
1c043ee
Merge branch 'main' into main
UbiquitousLearning Oct 26, 2024
d5982d2
fix: xnn backend load bug fix
chenghuaWang Oct 28, 2024
c689339
Merge remote-tracking branch 'upstream/main'
chenghuaWang Oct 29, 2024
189fa72
fix: add OUTPUT_TENSOR to TensorType. Remove this type after xnnpack …
chenghuaWang Oct 29, 2024
ca7b7f5
update: third_party submodule
chenghuaWang Oct 29, 2024
f97ee69
fix: freeze xnnpack version
chenghuaWang Oct 29, 2024
6b185ec
feat: QWen version 1.5 0.5B and 1.8B xnnpack backend.
chenghuaWang Oct 29, 2024
1bc39c2
Merge branch 'main' into main
yirongjie Oct 29, 2024
5e9345e
fix: xnnpack backend rope
chenghuaWang Oct 29, 2024
cc27a58
feat: reduce memory load time in xnnpack
chenghuaWang Oct 29, 2024
11cfa6f
fix: xnnpack qwen example token backend
chenghuaWang Oct 29, 2024
8c0c300
Merge branch 'UbiquitousLearning:main' into main
chenghuaWang Oct 29, 2024
f77622f
fix: use_layername_2_tensorname
yirongjie Oct 29, 2024
d3d9211
fix: MLLM_BUILD_XNNPACK OFF error and redundant xnnpack exec targets
chenghuaWang Oct 29, 2024
2cd51cf
Merge branch 'main' of https://github.com/chenghuaWang/mllm
chenghuaWang Oct 29, 2024
172bbd9
Merge branch 'main' into main
yirongjie Oct 29, 2024
757905e
fix: remove memory test due to previous workflow.yaml remove it from …
chenghuaWang Oct 30, 2024
c5302e0
fix: mask init error in xnnpack
chenghuaWang Oct 30, 2024
7f38bdf
Merge branch 'UbiquitousLearning:main' into main
chenghuaWang Oct 30, 2024
20393cc
Merge branch 'main' into main
yirongjie Oct 30, 2024
b43180e
Merge branch 'main' into main
yirongjie Oct 30, 2024
da3c295
fix: merge tokenize error
yirongjie Oct 30, 2024
9eb944c
fix: tokenizer apply
yirongjie Oct 30, 2024
779207b
fix: change HardSwish to original Swish function
chenghuaWang Oct 31, 2024
458a1e3
fix: mask bug in xnnpack
chenghuaWang Oct 31, 2024
fd1a449
fix: tokenizer.tokenize
yirongjie Oct 31, 2024
1d62462
fix: remove unused
yirongjie Oct 31, 2024
211ad56
update: move Xp*Test to MLLM_TEST
chenghuaWang Oct 31, 2024
5ceadc3
Merge branch 'main' of https://github.com/chenghuaWang/mllm
chenghuaWang Oct 31, 2024
ddddab3
fix: test bug
chenghuaWang Oct 31, 2024
919520f
fix: xnnpack test setup
chenghuaWang Oct 31, 2024
99f2ebc
fix: XpTest Error
chenghuaWang Oct 31, 2024
b829c24
fix: set xnn default threads to 4
chenghuaWang Oct 31, 2024
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -88,4 +88,4 @@ jobs:





9 changes: 8 additions & 1 deletion CMakeLists.txt
@@ -127,9 +127,16 @@ if(QNN) # QNN lib
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/qnn)
endif()

-option(MLLM_BUILD_XNNPACK_BACKEND "Build mllm's XNNPACK backend" OFF)
+option(MLLM_BUILD_XNNPACK_BACKEND "Build mllm's XNNPACK backend" ON)
if(MLLM_BUILD_XNNPACK_BACKEND)
if(NOT WIN32)
add_compile_options(-fPIC)
else()
# -fPIC is not a windows flag
set(CMAKE_POSITION_INDEPENDENT_CODE FALSE)
endif()
set(XNNPACK_BUILD_TESTS OFF)
set(XNNPACK_BUILD_BENCHMARKS OFF)
add_definitions(-DMLLM_BUILD_XNNPACK_BACKEND)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/xnnpack)
endif()
13 changes: 10 additions & 3 deletions examples/CMakeLists.txt
@@ -12,6 +12,9 @@ macro(func_link_libaries target)
target_link_libraries(${target} PUBLIC MLLM_CPU MLLM_QNN ${CMAKE_DL_LIBS} -fopenmp -static-openmp)
endif ()
endif()
if (MLLM_BUILD_XNNPACK_BACKEND)
target_link_libraries(${target} PRIVATE MLLM_CPU MllmXnnpackBackend)
endif()
endmacro()


@@ -55,7 +58,9 @@ endmacro()


## new demos
-func_llm_add_executable(benchmark)
+
+# if(NOT MLLM_BUILD_XNNPACK_BACKEND)
+func_llm_add_executable(mllm_benchmark)
func_llm_add_executable(demo_llama)
func_llm_add_executable(demo_tinyllama)
func_llm_add_executable(demo_stablelm)
@@ -81,7 +86,7 @@ func_vlm_add_executable(demo_vit)
func_vlm_add_executable(demo_clip)
func_vlm_add_executable(demo_imagebind)
func_vlm_add_executable(demo_imagebind_1mod)
-# func_vlm_add_executable(demo)
+# endif()

# QNN demo
if(QNN)
@@ -90,7 +95,9 @@
endif()



if(MLLM_BUILD_XNNPACK_BACKEND)
func_llm_add_executable(demo_qwen_xp)
endif()


# old main
74 changes: 74 additions & 0 deletions examples/demo_qwen_xp.cpp
@@ -0,0 +1,74 @@
/**
* @file demo_qwen_xp.cpp
* @author your name ([email protected])
* @version 0.1
* @date 2024-10-20
*
* @copyright Copyright (c) 2024
*
*/
#include "Types.hpp"
#include "cmdline.h"
#include "models/qwen/configuration_qwen.hpp"
#include "models/qwen/tokenization_qwen.hpp"
#include "models/qwen/modeling_qwen_xp_sdpa.hpp"
#include "backends/xnnpack/Utils/Logger.hpp"
#include "xnnpack/XnnpackBackend.hpp"

using namespace mllm;

int main(int argc, char **argv) {
mllm::xnnpack::Log::log_level = mllm::xnnpack::Log::LogLevel::ERROR;

cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/qwen-1.5-1.8b-fp32.mllm");
cmdParser.add<string>("billion", 'b', "[0.5B | 1.8B]", false, "1.8B");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);

string vocab_path = cmdParser.get<string>("vocab");
string merge_path = cmdParser.get<string>("merge");
string model_path = cmdParser.get<string>("model");
string model_billion = cmdParser.get<string>("billion");
int tokens_limit = cmdParser.get<int>("limits");
mllm::xnnpack::XnnpackBackend::xnn_threads = cmdParser.get<int>("thread");

auto tokenizer = QWenTokenizer(vocab_path, merge_path);
QWenConfig config(tokens_limit, model_billion, RoPEType::HFHUBROPE);
auto model = QWenForCausalLM(config);
model.to(BackendType::MLLM_XNNPACK);
model.load(model_path);

vector<string> in_strs = {
"Hello, who are you?",
"What can you do?",
"Please introduce Beijing University of Posts and Telecommunications.",
};
for (const auto &in_str : in_strs) {
auto input_str = tokenizer.apply_chat_template(in_str);
auto input_tensor = tokenizer.tokenize(input_str, "name", MLLM_XNNPACK);
std::cout << "[Q] " << in_str << std::endl;
std::cout << "[A] " << std::flush;

LlmTextGeneratorOpts opt{
.max_new_tokens = 100,
.do_sample = false,
.temperature = 0.3F,
.top_k = 50,
.top_p = 0.F,
};
model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
auto out_string = tokenizer.detokenize({out_token});
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
std::cout << output_string << std::flush;
return true;
});
std::cout << "\n";
}

return 0;
}
File renamed without changes.
25 changes: 23 additions & 2 deletions include/OpDefined.hpp
@@ -61,8 +61,19 @@ enum OpType {
MERGEOUTPUT,
SPLITINPUT,
IROPE,
-    OP_NUM,
+
+    // add in xnnpack
+    DIRECT,
+    DISPATCH,
+    SUBGRAPHSTART,
+    SUBGRAPHFINALIZE,
+    D2H,
+    XP_KVCACHE,
+    SDPA,
+
+    // new front-end
+    SUPERSILU,
+    OP_NUM
};

static const vector<string> OpNames = {
@@ -119,8 +130,18 @@
"MergeOutput",
"SplitInput",
"IRoPE",
-    "OP_NUM",
-};
+
+    // in xnnpack
+    "Direct",
+    "Dispatch",
+    "SubgraphStart",
+    "SubgraphFinalize",
+    "D2H",
+    "XP_KVCACHE",
+    "SDPA",
+    "SuperSiLU",
+    "OP_NUM"};
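A maintenance hazard in this diff is that `OpType` and `OpNames` must be edited in lockstep (note the `OP_NUM` sentinel moving to the end of both lists). A compile-time size guard can catch drift; the sketch below uses hypothetical toy names, not mllm's real tables:

```cpp
#include <array>
#include <cassert>
#include <cstddef>
#include <string_view>

// Toy version of OpType with a trailing count sentinel, mirroring OP_NUM.
enum MiniOpType { ADD, SOFTMAX, SDPA_OP, OP_COUNT };

// Sizing the array by the sentinel means a forgotten name shows up as an
// empty entry, and the static_assert documents the invariant explicitly.
constexpr std::array<std::string_view, OP_COUNT> kOpNames = {"Add", "Softmax", "SDPA"};

static_assert(kOpNames.size() == static_cast<std::size_t>(OP_COUNT),
              "OpNames must have exactly one entry per OpType");
```

Indexing the table with the enumerator then stays safe by construction.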

enum TensorFuncType {
FUNC_ADD,
4 changes: 3 additions & 1 deletion include/Types.hpp
@@ -27,7 +27,8 @@ typedef enum {
MLLM_DEFAULT,
MLLM_CPU,
MLLM_OPENCL,
-    MLLM_QNN
+    MLLM_QNN,
+    MLLM_XNNPACK,
} BackendType;

enum TensorStatus {
@@ -96,6 +97,7 @@ enum TensorType {
INPUT_TENSOR = 0, // used for input of the model
NORMAL_TENSOR,
GRAPH_OUTPUT, // used for output of a graph
OUTPUT_TENSOR,
};

enum Chl {
9 changes: 9 additions & 0 deletions scripts/run_test.sh
@@ -0,0 +1,9 @@
#!/bin/bash
for file in ../bin/*Test ../bin/*TEST; do
    if [ -x "$file" ]; then
        echo "Running $file..."
        "$file" # $file already includes the ../bin/ prefix
    else
        echo "Skipping non-executable $file..."
    fi
done
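The executable check in the loop can be exercised in isolation. This sketch runs the same pattern over a throwaway directory (`FooTest` and `notes.txt` are hypothetical stand-ins for built test binaries), invoking `$file` directly since the glob already carries the directory prefix:

```shell
#!/bin/sh
# Build a temporary directory with one executable and one plain file.
tmpdir=$(mktemp -d)
printf '#!/bin/sh\nexit 0\n' > "$tmpdir/FooTest"
chmod +x "$tmpdir/FooTest"
echo "not a test" > "$tmpdir/notes.txt"

ran=""
skipped=""
for file in "$tmpdir"/*; do
    if [ -x "$file" ]; then
        echo "Running $file..."
        "$file" # $file already holds the full path; no extra prefix needed
        ran="$ran $file"
    else
        echo "Skipping non-executable $file..."
        skipped="$skipped $file"
    fi
done
rm -rf "$tmpdir"
```

Re-prefixing the path (as in `"../bin/$file"`) would double the directory component and fail to find the binary.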
6 changes: 5 additions & 1 deletion src/Backend.cpp
@@ -9,6 +9,8 @@ namespace mllm {
extern void registerCPUBackendCreator();
#ifdef USE_QNN
extern void registerQNNBackendCreator();
#elif defined(MLLM_BUILD_XNNPACK_BACKEND)
extern void registerXNNBackendCreator();
#endif

static std::once_flag s_flag;
@@ -17,6 +19,8 @@ void registerBackend() {
registerCPUBackendCreator();
#ifdef USE_QNN
registerQNNBackendCreator();
#elif defined(MLLM_BUILD_XNNPACK_BACKEND)
registerXNNBackendCreator();
#endif
});
}
@@ -30,7 +34,7 @@ static std::unordered_map<BackendType, std::shared_ptr<BackendCreator>> &GetBack
}

const std::shared_ptr<BackendCreator> GetBackendCreator(BackendType type) {
-    if (type == MLLM_QNN) {
+    if (type == MLLM_QNN || type == MLLM_XNNPACK) {
Layer::use_layername_2_tensorname = false;
}
registerBackend();
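The registration path in Backend.cpp follows a once-guarded registry pattern: creators are inserted exactly once via `std::call_once`, then looked up by backend type. A self-contained approximation (the types here are simplified stand-ins, not mllm's real classes):

```cpp
#include <functional>
#include <memory>
#include <mutex>
#include <unordered_map>

// Simplified stand-ins for mllm's BackendType / BackendCreator.
enum BackendType { MLLM_CPU, MLLM_XNNPACK };

struct Backend { BackendType type; };

using BackendCreator = std::function<std::shared_ptr<Backend>()>;

std::unordered_map<BackendType, BackendCreator> &registry() {
    static std::unordered_map<BackendType, BackendCreator> map;
    return map;
}

void registerBackends() {
    // std::call_once guarantees one-time registration even with concurrent
    // callers, mirroring Backend.cpp's static s_flag.
    static std::once_flag flag;
    std::call_once(flag, [] {
        registry()[MLLM_CPU] = [] { return std::make_shared<Backend>(Backend{MLLM_CPU}); };
        registry()[MLLM_XNNPACK] = [] { return std::make_shared<Backend>(Backend{MLLM_XNNPACK}); };
    });
}

std::shared_ptr<Backend> createBackend(BackendType t) {
    registerBackends(); // idempotent; safe to call on every lookup
    auto it = registry().find(t);
    return it == registry().end() ? nullptr : it->second();
}
```

The lazy `registerBackends()` call in the lookup path is what lets callers skip explicit initialization, at the cost of one `call_once` check per creation.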
97 changes: 94 additions & 3 deletions src/Layer.hpp
@@ -5,10 +5,8 @@
#ifndef OPERATION_H
#define OPERATION_H

-#include <cassert>
#include <cstddef>
#include <cstdlib>
-#include <iostream>
#include <memory>
#include <utility>

@@ -17,7 +15,6 @@
#include "Op.hpp"
#include "ParamLoader.hpp"
#include "Backend.hpp"
-#include "Timing.hpp"

#include <Module.hpp>

@@ -60,6 +57,11 @@ class Layer {
return ts[0].get();
}

Tensor &operator()(Tensor &input0, Tensor &input1, Tensor &input2, Tensor &input3) {
auto ts = run({input0, input1, input2, input3}, 1);
return ts[0].get();
}

private:
std::string name_num_to_X(const std::string &input_string) {
std::regex pattern(R"(\.\d{1,3}\.)"); // Matches a 1- to 3-digit number between two dots
@@ -731,6 +733,21 @@ class Quantize final : public Layer {
}
};

class Direct final : public Layer {
public:
enum DirectType : uint32_t {
Normal = 0,
ExternalInput = 1,
ExternalOutput = 2,
KeepLive = 3,
};

Direct(DirectType t, const std::string &name) {
param_["DirectType"] = (float)t;
init(name, OpType::DIRECT);
}
};
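As the `Direct` constructor shows, `param_` stores every op attribute as a `float`, including enum selectors like `DirectType`. Small integral values round-trip through `float` exactly, which is what makes this work. A minimal sketch of the convention (the helper names are illustrative, not mllm's API):

```cpp
#include <cstdint>
#include <map>
#include <string>

// Same values as the DirectType enum introduced in Layer.hpp.
enum DirectType : uint32_t { Normal = 0, ExternalInput = 1, ExternalOutput = 2, KeepLive = 3 };

// Layer's OpParam convention: a string-keyed map of floats.
using OpParam = std::map<std::string, float>;

OpParam makeDirectParam(DirectType t) {
    OpParam p;
    p["DirectType"] = static_cast<float>(t); // enum encoded as float
    return p;
}

DirectType directTypeOf(const OpParam &p) {
    // Integers this small are represented exactly in float, so the
    // cast back through uint32_t recovers the original enumerator.
    return static_cast<DirectType>(static_cast<uint32_t>(p.at("DirectType")));
}
```

The float encoding keeps one uniform parameter map for all ops; it only breaks down for integers beyond float's 24-bit exact range, which op parameters here never approach.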

class Dequantize final : public Layer {
public:
explicit Dequantize(bool isNSHD, std::string name, bool isFP32 = true) {
@@ -744,6 +761,18 @@
}
};

class Dispatch final : public Layer {
public:
explicit Dispatch(const std::string &name) {
init(name, OpType::DISPATCH);
}

Tensor &operator()(Tensor &input) {
auto ts = run({input}, 1);
return ts[0].get();
}
};

class Add final : public Layer {
public:
explicit Add(std::string name) {
@@ -819,6 +848,18 @@ class View final : public Layer {
}
};

class SubgraphStart final : public Layer {
public:
explicit SubgraphStart(const std::string &name) {
init(name, OpType::SUBGRAPHSTART);
}

Tensor &operator()(Tensor &input) {
auto ts = run({input}, 1);
return ts[0].get();
}
};

class Transpose final : public Layer {
public:
explicit Transpose(std::vector<int> perm, std::string name) {
@@ -834,6 +875,56 @@
}
};

class SubgraphFinalize final : public Layer {
public:
explicit SubgraphFinalize(const std::string &name) {
init(name, OpType::SUBGRAPHFINALIZE);
}

Tensor &operator()(Tensor &input) {
auto ts = run({input}, 1);
return ts[0].get();
}
};

class Device2Host final : public Layer {
public:
explicit Device2Host(const std::string &name) {
init(name, OpType::D2H);
}

Tensor &operator()(Tensor &input) {
auto ts = run({input}, 1);
return ts[0].get();
}
};

class XP_KVCache final : public Layer {
public:
explicit XP_KVCache(int n_rep, int cache_max, std::string name) {
param_["n_rep"] = (float)n_rep;
param_["cache_max"] = (float)cache_max;
init(std::move(name), OpType::XP_KVCACHE);
}

Tensor &operator()(Tensor &input) {
auto ts = run({input}, 1);
return ts[0].get();
}
};

class ScaledDotProductAttention final : public Layer {
public:
explicit ScaledDotProductAttention(std::string name) {
init(std::move(name), OpType::SDPA);
}

// Q, K, V
Tensor &operator()(Tensor &Q, Tensor &K, Tensor &V) {
auto ts = run({Q, K, V}, 1); // Q, K, V
return ts[0].get();
}
};
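For validating an SDPA kernel's output (as commit a7471be does against a torch implementation), a naive scalar reference is handy. This single-head sketch assumes row-major `vector<vector<float>>` inputs and is a correctness baseline only, not the layout-aware (B, H, S, D) kernel from the PR:

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

using Mat = std::vector<std::vector<float>>;

// Naive single-head scaled dot-product attention:
// out = softmax(Q K^T / sqrt(d)) V, with a numerically stable
// row-wise softmax (max-subtraction before exp).
Mat sdpa(const Mat &Q, const Mat &K, const Mat &V) {
    const std::size_t s = Q.size(), d = Q[0].size(), dv = V[0].size();
    const float scale = 1.0f / std::sqrt(static_cast<float>(d));
    Mat out(s, std::vector<float>(dv, 0.0f));
    for (std::size_t i = 0; i < s; ++i) {
        std::vector<float> score(K.size());
        float maxv = -1e30f;
        for (std::size_t j = 0; j < K.size(); ++j) {
            float dot = 0.0f;
            for (std::size_t k = 0; k < d; ++k) dot += Q[i][k] * K[j][k];
            score[j] = dot * scale;
            maxv = std::max(maxv, score[j]);
        }
        float sum = 0.0f;
        for (float &x : score) { x = std::exp(x - maxv); sum += x; }
        for (std::size_t j = 0; j < K.size(); ++j)
            for (std::size_t k = 0; k < dv; ++k)
                out[i][k] += (score[j] / sum) * V[j][k];
    }
    return out;
}
```

With a strongly peaked query (e.g. one large dot product per row), each output row collapses to the corresponding V row, which gives an easy sanity check.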
// Only for QNN END

} // namespace mllm