From e00553cd19478d99ac3646242d2614693d1089ac Mon Sep 17 00:00:00 2001
From: xiaying
Date: Tue, 23 Apr 2024 13:54:38 +0800
Subject: [PATCH] [MNN:Bugfix] Fix bug for resize opt bug, support llama3 8b

---
 llm/include/llm.hpp                       | 13 ++++++++-
 llm/src/llm.cpp                           | 17 +++++++++++
 source/core/Pipeline.cpp                  | 32 ++++++++++++++-------
 source/core/Pipeline.hpp                  |  3 +-
 source/core/Schedule.cpp                  |  3 ++
 source/core/Schedule.hpp                  |  1 +
 source/core/Session.cpp                   |  1 +
 source/core/Tensor.cpp                    |  4 +--
 source/core/TensorUtils.hpp               |  6 ++--
 source/geometry/GeometryComputerUtils.cpp | 35 +++++++++--------------
 10 files changed, 75 insertions(+), 40 deletions(-)

diff --git a/llm/include/llm.hpp b/llm/include/llm.hpp
index 5b56fb481..d003ed106 100644
--- a/llm/include/llm.hpp
+++ b/llm/include/llm.hpp
@@ -228,7 +228,18 @@ class Llama2_7b : public Llm {
     virtual VARP gen_position_ids(int seq_len) override;
     virtual bool is_stop(int token_id) override;
 };
-
+class Llama3_8b : public Llama2_7b {
+public:
+    Llama3_8b() {
+        model_name_ = "Llama3_8b";
+        layer_nums_ = 32;
+        key_value_shape_ = {2, 1, 8, 0, 128};
+        hidden_size_ = 4096;
+    }
+private:
+    virtual std::vector<int> tokenizer(const std::string& query) override;
+    virtual bool is_stop(int token_id) override;
+};
 class Qwen2 : public Llama2_7b {
 public:
     Qwen2() {
diff --git a/llm/src/llm.cpp b/llm/src/llm.cpp
index 118c2c019..89c38fa1b 100644
--- a/llm/src/llm.cpp
+++ b/llm/src/llm.cpp
@@ -13,6 +13,7 @@
 #include
 #include
+#include "cpp/ExprDebug.hpp"
 #include "llm.hpp"
 #include "tokenizer.hpp"
@@ -86,6 +87,9 @@ Llm* Llm::createLLM(const std::string& path, std::string model_type, int forward
     } else if (model_type.find("yi") != std::string::npos) {
         llm = new Yi_6b;
         llm->model_name_ = "Yi_6b";
+    } else if (model_type.find("llama3") != std::string::npos) {
+        llm = new Llama3_8b;
+        llm->model_name_ = "Llama3_8b";
     }
     if (!llm) {
         std::cerr << "model type can't judge!" << std::endl;
@@ -229,6 +233,8 @@ void Llm::load(const std::string& model_dir) {
     config.backendConfig = &cpuBackendConfig;
     runtime_manager_.reset(Executor::RuntimeManager::createRuntimeManager(config));
     runtime_manager_->setHint(MNN::Interpreter::MEM_ALLOCATOR_TYPE, 0);
+//    runtime_manager_->setMode(MNN::Interpreter::Session_Debug);
+//    _initTensorStatic();
     {
         runtime_manager_->setCache(".tempcache");
     }
@@ -801,6 +807,17 @@ std::vector<int> Yi_6b::tokenizer(const std::string& query) {
 bool Yi_6b::is_stop(int token_id) {
     return token_id == 7 || token_id == 64001;
 }
+std::vector<int> Llama3_8b::tokenizer(const std::string& query) {
+    // <|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n+query+<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n
+    auto ids = tokenizer_encode(query);
+    ids.insert(ids.begin(), {128000, 128006, 882, 128007, 271});
+    ids.insert(ids.end(), {128009, 128006, 78191, 128007, 271});
+    return ids;
+}
+
+bool Llama3_8b::is_stop(int token_id) {
+    return token_id == 128001 || token_id == 128009;
+}
 // Llm end
 // Embedding start
diff --git a/source/core/Pipeline.cpp b/source/core/Pipeline.cpp
index 163b7a2a3..3be39b13a 100644
--- a/source/core/Pipeline.cpp
+++ b/source/core/Pipeline.cpp
@@ -706,16 +706,26 @@ static void _makeCopyOp(std::shared_ptr<BufferStorage>& copyOp) {
         copyOp->storage = builder.ReleaseRaw(copyOp->allocated_size, copyOp->offset);
     }
 }
-static ErrorCode _InsertCopy(Schedule::PipelineInfo& mInfo, std::map<Tensor*, std::shared_ptr<Tensor>>& mCacheConstTensors, std::map<std::pair<Tensor*, Backend*>, std::shared_ptr<Tensor>>& shapeFixConstCache, bool ownInput, bool permitCodegen) {
+static ErrorCode _InsertCopy(Schedule::PipelineInfo& mInfo, std::map<Tensor*, std::shared_ptr<Tensor>>& mCacheConstTensors, Pipeline::WrapTensorCache& shapeFixConstCache, bool ownInput, bool permitCodegen) {
     std::shared_ptr<BufferStorage> copyOp;
-    for (auto& iter : shapeFixConstCache) {
-        auto des = TensorUtils::getDescribe(iter.second.get());
-        if (des->usage == Tensor::InsideDescribe::CONSTANT && des->stageMask == 0) {
-            // If the tensor is not compute in shape-geometry stage, needn't recopy it
+    for (auto iterP = shapeFixConstCache.begin(); iterP != shapeFixConstCache.end();) {
+        auto& iter = *iterP;
+        if (iter.second.first.lock() == nullptr) {
+            // Has released, remove cache
+            iterP = shapeFixConstCache.erase(iterP);
             continue;
         }
-        TensorUtils::getDescribeOrigin(iter.second.get())->setBackend(nullptr);
-        TensorUtils::getDescribeOrigin(iter.second.get())->mem = nullptr;
+        auto des = iter.first.first;
+        bool needReset = true;
+        if (des->usage == Tensor::InsideDescribe::CONSTANT && ((des->stageMask & Tensor::InsideDescribe::CONTENT_NOT_CHANGE) != 0)) {
+            // If the tensor is not compute in shape-geometry stage, needn't recopy it
+            needReset = false;
+        }
+        if (needReset) {
+            TensorUtils::getDescribeOrigin(iter.second.second.get())->setBackend(nullptr);
+            TensorUtils::getDescribeOrigin(iter.second.second.get())->mem = nullptr;
+        }
+        iterP++;
     }
     for (auto& info : mInfo.second) {
         if (info.type == Schedule::CONSTANT) {
@@ -778,12 +788,12 @@ static ErrorCode _InsertCopy(Schedule::PipelineInfo& mInfo, std::map
-                            newTensor = titer->second.get();
+                            newTensor = titer->second.second.get();
                         } else {
                             std::shared_ptr<Tensor> tensor(new Tensor);
-                            shapeFixConstCache.insert(std::make_pair(std::make_pair(t, curBackend), tensor));
+                            shapeFixConstCache.insert(std::make_pair(std::make_pair(des, curBackend), std::make_pair(std::weak_ptr(TensorUtils::getDescribeOrigin(t)->mContent), tensor)));
                             newTensor = tensor.get();
                         }
                         iter.workInputs[v] = newTensor;
@@ -1067,7 +1077,7 @@ ErrorCode Pipeline::allocMemory(bool firstMalloc, bool forbidReplace) {
             }
             auto des = TensorUtils::getDescribe(t);
             auto usage = des->usage;
-            if (TensorUtils::getDescribeOrigin(t)->mContent->count() > 1 && usage != Tensor::InsideDescribe::CONSTANT) {
+            if (TensorUtils::getDescribeOrigin(t)->mContent.use_count() > 1 && usage != Tensor::InsideDescribe::CONSTANT) {
                 TensorUtils::getDescribeOrigin(t)->mem = nullptr;
                 auto res = TensorUtils::getDescribeOrigin(t)->getBackend()->onAcquireBuffer(t, Backend::STATIC);
                 if (!res) {
diff --git a/source/core/Pipeline.hpp b/source/core/Pipeline.hpp
index 6fcaf142e..c32701db5 100644
--- a/source/core/Pipeline.hpp
+++ b/source/core/Pipeline.hpp
@@ -62,6 +62,7 @@ class Pipeline : public NonCopyable {
     MNNForwardType getMainForwardType() const {
         return mInfo.first.cache.first->type();
     }
+    typedef std::map<std::pair<Tensor::InsideDescribe::NativeInsideDescribe*, Backend*>, std::pair<std::weak_ptr<Tensor::InsideDescribe::NativeInsideDescribe>, std::shared_ptr<Tensor>>> WrapTensorCache;
 private:
     ErrorCode _allocForTensor(int index, bool allocInput);
     void _copyInputs();
@@ -76,7 +77,7 @@
     // For gpu or other backend
     std::map<Tensor*, std::shared_ptr<Tensor>> mCacheConstTensors;
-    std::map<std::pair<Tensor*, Backend*>, std::shared_ptr<Tensor>> mWrapTensors;
+    WrapTensorCache mWrapTensors;
 #ifndef MNN_BUILD_MINI
     GeometryComputer::Context mContext;
     Runtime::CompilerType mUseGeometry;
diff --git a/source/core/Schedule.cpp b/source/core/Schedule.cpp
index 1c8cd0878..74b949941 100644
--- a/source/core/Schedule.cpp
+++ b/source/core/Schedule.cpp
@@ -81,6 +81,9 @@ bool Schedule::OpResizeCache::match(const std::vector<Tensor*>& inputs) {
 void Schedule::OpResizeCache::open() {
     mCanCache = true;
 }
+void Schedule::OpResizeCache::copyImmutable(const OpResizeCache& cache) {
+    mNeedCompareContent = cache.mNeedCompareContent;
+}
 void Schedule::OpResizeCache::insert(const std::vector<Tensor*>& inputs) {
     if (!mCanCache) {
diff --git a/source/core/Schedule.hpp b/source/core/Schedule.hpp
index 476f37343..e05c3133c 100644
--- a/source/core/Schedule.hpp
+++ b/source/core/Schedule.hpp
@@ -42,6 +42,7 @@ class MNN_PUBLIC Schedule {
         bool needComputeShape = true;
         bool needExecuteConst = false;
         void addContentIndex(int index);
+        void copyImmutable(const OpResizeCache& cache);
     private:
         struct ShapeInfo {
             int order;
diff --git a/source/core/Session.cpp b/source/core/Session.cpp
index 9537bee77..5998c9253 100644
--- a/source/core/Session.cpp
+++ b/source/core/Session.cpp
@@ -427,6 +427,7 @@ Session* Session::clone(RuntimeInfo&& runtime, std::shared_ptr
         if (nullptr != op->outputIndexes()) {
             auto data = op->outputIndexes()->data();
diff --git a/source/core/Tensor.cpp b/source/core/Tensor.cpp
index 5bf50a0b2..4165f1ab7 100644
--- a/source/core/Tensor.cpp
+++ b/source/core/Tensor.cpp
@@ -20,7 +20,7 @@ namespace MNN {
 Tensor::Tensor(int dimSize, DimensionType type) {
     MNN_ASSERT(dimSize <= MNN_MAX_TENSOR_DIM);
     mDescribe = new InsideDescribe;
-    mDescribe->mContent = new InsideDescribe::NativeInsideDescribe;
+    mDescribe->mContent.reset(new InsideDescribe::NativeInsideDescribe);
     auto nativeDescribe = mDescribe->mContent.get();
     mBuffer.dimensions = dimSize;
     mBuffer.type = halide_type_of<float>();
@@ -49,7 +49,7 @@ Tensor::Tensor(const Tensor* tensor, DimensionType type, bool allocMemory) {
     auto buffer = tensor->buffer();
     mDescribe = new InsideDescribe;
-    mDescribe->mContent = new InsideDescribe::NativeInsideDescribe;
+    mDescribe->mContent.reset(new InsideDescribe::NativeInsideDescribe);
     auto nativeDescribe = mDescribe->mContent.get();
     mBuffer.dimensions = buffer.dimensions;
     mBuffer.type = buffer.type;
diff --git a/source/core/TensorUtils.hpp b/source/core/TensorUtils.hpp
index 61d15776a..d8f8498ec 100644
--- a/source/core/TensorUtils.hpp
+++ b/source/core/TensorUtils.hpp
@@ -79,10 +79,10 @@ struct Tensor::InsideDescribe {
         GEOMETRY_STAGE = 1,
         CONVERTED_STAGE = 1 << 1,
         COMPUTE_SHAPE_STAGE = 1 << 2,
-        COMPUTE_CONTENT_STAGE = 1 << 3,
+        CONTENT_NOT_CHANGE = 1 << 3,
     };
     /** extra tensor info container */
-    struct NativeInsideDescribe : public RefCount {
+    struct NativeInsideDescribe {
     public:
         /** dimension format */
         MNN_DATA_FORMAT dimensionFormat = MNN_DATA_FORMAT_NC4HW4;
@@ -115,7 +115,7 @@ struct Tensor::InsideDescribe {
         // For isMutable = false Tensor , determine whether the content can be convert to main backend
         uint32_t stageMask = 0;
     };
-    SharedPtr<NativeInsideDescribe> mContent;
+    std::shared_ptr<NativeInsideDescribe> mContent;
     SharedPtr<Backend::MemObj> mem;
     inline Backend* getBackend() const {
         return backend;
diff --git a/source/geometry/GeometryComputerUtils.cpp b/source/geometry/GeometryComputerUtils.cpp
index a9377df7d..0dc328691 100644
--- a/source/geometry/GeometryComputerUtils.cpp
+++ b/source/geometry/GeometryComputerUtils.cpp
@@ -164,8 +164,8 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform(
                 auto type = des->memoryType;
                 MNN_ASSERT(type != Tensor::InsideDescribe::MEMORY_OUTSIDE);
                 MNN_ASSERT(type != Tensor::InsideDescribe::MEMORY_HOST);
-                if (TensorUtils::getDescribeOrigin(t)->mContent->count() > 1) {
-                    TensorUtils::getDescribeOrigin(t)->mContent = new Tensor::InsideDescribe::NativeInsideDescribe;
+                if (TensorUtils::getDescribeOrigin(t)->mContent.use_count() > 1) {
+                    TensorUtils::getDescribeOrigin(t)->mContent.reset(new Tensor::InsideDescribe::NativeInsideDescribe);
                     t->buffer().dim = TensorUtils::getDescribe(t)->dims;
                     TensorUtils::getDescribeOrigin(t)->setBackend(nullptr);
                     TensorUtils::getDescribeOrigin(t)->mem = nullptr;
@@ -210,13 +210,18 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform(
                 TensorUtils::getDescribe(t)->rasterCommand.reset();
                 TensorUtils::getDescribe(t)->stageMask |= Tensor::InsideDescribe::StageInfo::COMPUTE_SHAPE_STAGE;
                 // The content may be computed by geometry computer, which will not make execution
-                TensorUtils::getDescribe(t)->stageMask &= (~Tensor::InsideDescribe::StageInfo::COMPUTE_CONTENT_STAGE);
+                TensorUtils::getDescribe(t)->stageMask &= (~Tensor::InsideDescribe::StageInfo::CONTENT_NOT_CHANGE);
             }
         }
         info.computeCache.needComputeShape = needCompute;
         if (info.type != Schedule::CONSTANT) {
             continue;
         }
+        if (!needCompute) {
+            for (auto t : info.outputs) {
+                TensorUtils::getDescribe(t)->stageMask |= Tensor::InsideDescribe::StageInfo::CONTENT_NOT_CHANGE;
+            }
+        }
         if (_hasZeroShapeOutput(info)) {
             continue;
         }
@@ -292,7 +297,7 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform(
                         dirty = true;
                         break;
                     }
-                    if ((des->stageMask & Tensor::InsideDescribe::StageInfo::COMPUTE_CONTENT_STAGE) == 0) {
+                    if ((des->stageMask & Tensor::InsideDescribe::StageInfo::CONTENT_NOT_CHANGE) == 0) {
                         dirty = true;
                         break;
                     }
@@ -305,26 +310,12 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform(
                     return NOT_SUPPORT;
                 }
                 for (auto t : c.outputs) {
-                    TensorUtils::getDescribe(t)->stageMask &= (~Tensor::InsideDescribe::StageInfo::COMPUTE_CONTENT_STAGE);
+                    TensorUtils::getDescribe(t)->stageMask &= (~Tensor::InsideDescribe::StageInfo::CONTENT_NOT_CHANGE);
                 }
-            }
-        }
-    }
-    for (int i=0; i
-            if ((!des->isMutable) || des->group) {
-                continue;
+            } else {
+                for (auto t : c.outputs) {
+                    TensorUtils::getDescribe(t)->stageMask |= Tensor::InsideDescribe::StageInfo::CONTENT_NOT_CHANGE;
                 }
-            des->stageMask |= Tensor::InsideDescribe::StageInfo::COMPUTE_CONTENT_STAGE;
             }
         }
     }
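
Note: the core of the resize-optimization fix above is the new Pipeline::WrapTensorCache. Wrapped constant tensors are now keyed by their NativeInsideDescribe (plus the target Backend*) and paired with a std::weak_ptr to that describe, so _InsertCopy can erase cache entries whose source tensor has already been released and skip re-copying constants whose stageMask carries the renamed CONTENT_NOT_CHANGE bit. What follows is a minimal, self-contained sketch of that erase-expired-then-reset walk; the type and function names are illustrative stand-ins, not MNN's own API.

    #include <map>
    #include <memory>
    #include <utility>

    // Illustrative stand-ins for MNN's NativeInsideDescribe and a backend-side tensor copy.
    struct Describe { bool constant = false; bool contentUnchanged = false; };
    struct WrappedTensor {};

    using WrapCache = std::map<const Describe*,
                               std::pair<std::weak_ptr<Describe>, std::shared_ptr<WrappedTensor>>>;

    // Drop entries whose source describe has been released; otherwise keep the entry
    // and rebuild the wrapped copy unless it is a constant whose content did not change.
    inline void pruneWrapCache(WrapCache& cache) {
        for (auto it = cache.begin(); it != cache.end();) {
            if (it->second.first.expired()) {
                it = cache.erase(it);       // source gone: stale wrap, remove it
                continue;
            }
            const Describe* des = it->first;
            if (!(des->constant && des->contentUnchanged)) {
                it->second.second = std::make_shared<WrappedTensor>(); // force a fresh copy next run
            }
            ++it;
        }
    }

In the patch itself the reset does not reallocate: it clears the wrapped tensor's backend and memory (setBackend(nullptr), mem = nullptr) so the next resize re-copies it, and the cached value holds a std::shared_ptr<Tensor> for the wrap; the overall walk is the same.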