From 190b538fb220479ef22c8a8180f93be24a83ca73 Mon Sep 17 00:00:00 2001 From: SaeHie Park Date: Fri, 18 Oct 2024 06:21:48 +0900 Subject: [PATCH 01/46] [luci/svc] Fix Reshape shape inference (#14234) This will fix Reshape shape inference to refer newShape attribute. ONE-DCO-1.0-Signed-off-by: SaeHie Park --- .../luci/service/src/Nodes/CircleReshape.cpp | 35 +++++++++++++++---- .../service/src/Nodes/CircleReshape.test.cpp | 35 +++++++++++++++++++ 2 files changed, 64 insertions(+), 6 deletions(-) diff --git a/compiler/luci/service/src/Nodes/CircleReshape.cpp b/compiler/luci/service/src/Nodes/CircleReshape.cpp index 553e1eabd5d..025ad4912a1 100644 --- a/compiler/luci/service/src/Nodes/CircleReshape.cpp +++ b/compiler/luci/service/src/Nodes/CircleReshape.cpp @@ -117,12 +117,35 @@ loco::TensorShape Algorithm::visit(const luci::CircleReshape *node) } else { - auto shape_node = loco::must_cast(node->shape()); - assert(shape_node->rank() == 1); - // shape_node tensor values will provide new shape, like [2, 3, 4] - auto num_elements = shape_node->dim(0).value(); // above example will give 3 - shape_by_input.rank(num_elements); - is_static_shape = false; + // NOTE assumption is that `shape` and `newShape` having same value. 
+ // for non-existing `shape`, we can use `newShape` if it's valid + auto new_shape = node->newShape(); + auto rank = new_shape->rank(); + auto shape_dummy = dynamic_cast(node->shape()); + if (shape_dummy && rank > 0) + { + is_static_shape = true; + shape_by_input.rank(rank); + for (uint32_t i = 0; i < rank; ++i) + { + if (new_shape->dim(i) > 0) + shape_by_input.dim(i) = static_cast(new_shape->dim(i)); + else + { + is_static_shape = false; + shape_by_input.dim(i).unset(); + } + } + } + else + { + auto shape_node = loco::must_cast(node->shape()); + assert(shape_node->rank() == 1); + // shape_node tensor values will provide new shape, like [2, 3, 4] + auto num_elements = shape_node->dim(0).value(); // above example will give 3 + shape_by_input.rank(num_elements); + is_static_shape = false; + } } } diff --git a/compiler/luci/service/src/Nodes/CircleReshape.test.cpp b/compiler/luci/service/src/Nodes/CircleReshape.test.cpp index 4bb13edc2f9..653cb690d18 100644 --- a/compiler/luci/service/src/Nodes/CircleReshape.test.cpp +++ b/compiler/luci/service/src/Nodes/CircleReshape.test.cpp @@ -162,3 +162,38 @@ TEST(ShapeRuleTest, reshape_by_input_node) ASSERT_FALSE(output_shape.dim(0).known()); ASSERT_FALSE(output_shape.dim(1).known()); } + +TEST(ShapeRuleTest, reshape_by_newShape) +{ + auto g = loco::make_graph(); + auto node_reshape = g->nodes()->create(); + auto tensor_input = g->nodes()->create(); + auto shape_dummy = g->nodes()->create(); + + tensor_input->dtype(loco::DataType::S32); + tensor_input->shape({2, 3, 4}); + tensor_input->shape_status(luci::ShapeStatus::VALID); + + shape_dummy->dtype(loco::DataType::S32); + shape_dummy->shape({}); + shape_dummy->shape_status(luci::ShapeStatus::VALID); + + node_reshape->tensor(tensor_input); + node_reshape->shape(shape_dummy); + + // reshape to {2, 12} + node_reshape->newShape()->rank(2); + node_reshape->newShape()->dim(0) = 2; + node_reshape->newShape()->dim(1) = 12; + + loco::TensorShape output_shape; + luci::sinf::Rule 
shape_inf_rule; + + ASSERT_TRUE(shape_inf_rule.infer(node_reshape, output_shape)); + + ASSERT_EQ(2, output_shape.rank()); + ASSERT_TRUE(output_shape.dim(0).known()); + ASSERT_TRUE(output_shape.dim(1).known()); + ASSERT_EQ(2, output_shape.dim(0).value()); + ASSERT_EQ(12, output_shape.dim(1).value()); +} From 3d01a869f65e17e2c144c2233478fa545672c8b2 Mon Sep 17 00:00:00 2001 From: youngsik kim Date: Fri, 18 Oct 2024 14:54:10 +0900 Subject: [PATCH 02/46] [onert/nnfw_api] Add unit tests for RoPE op (#14219) This commit adds unit tests for RoPE ONE-DCO-1.0-Signed-off-by: youngsik kim --- tests/nnfw_api/lib/CircleGen.cc | 7 +++ tests/nnfw_api/lib/CircleGen.h | 1 + .../GenModelTests/one_op_tests/RoPE.test.cc | 60 +++++++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 tests/nnfw_api/src/GenModelTests/one_op_tests/RoPE.test.cc diff --git a/tests/nnfw_api/lib/CircleGen.cc b/tests/nnfw_api/lib/CircleGen.cc index 38559a678b2..80d1fcfc870 100644 --- a/tests/nnfw_api/lib/CircleGen.cc +++ b/tests/nnfw_api/lib/CircleGen.cc @@ -596,6 +596,13 @@ uint32_t CircleGen::addOperatorRmsNorm(const OperatorParams ¶ms, float epsil circle::BuiltinOptions_RmsNormOptions, options); } +uint32_t CircleGen::addOperatorRoPE(const OperatorParams ¶ms, circle::RoPEMode mode) +{ + auto options = circle::CreateRoPEOptions(_fbb, mode).Union(); + return addOperatorWithOptions(params, circle::BuiltinOperator_ROPE, + circle::BuiltinOptions_RoPEOptions, options); +} + // NOTE Please add addOperator functions ABOVE this lie // // % How to add a new addOperatorXXX fuction diff --git a/tests/nnfw_api/lib/CircleGen.h b/tests/nnfw_api/lib/CircleGen.h index 388e173ae8e..6297f415c69 100644 --- a/tests/nnfw_api/lib/CircleGen.h +++ b/tests/nnfw_api/lib/CircleGen.h @@ -202,6 +202,7 @@ class CircleGen uint32_t addOperatorReduce(const OperatorParams ¶ms, circle::BuiltinOperator reduce_op, bool keep_dims); uint32_t addOperatorRmsNorm(const OperatorParams ¶ms, float epsilon); + uint32_t 
addOperatorRoPE(const OperatorParams ¶ms, circle::RoPEMode mode); /** * @brief Create circle Reshape op * the second param new_shape can be optional just like circle::CreateReshapeOptionsDirect diff --git a/tests/nnfw_api/src/GenModelTests/one_op_tests/RoPE.test.cc b/tests/nnfw_api/src/GenModelTests/one_op_tests/RoPE.test.cc new file mode 100644 index 00000000000..156cc1c16fa --- /dev/null +++ b/tests/nnfw_api/src/GenModelTests/one_op_tests/RoPE.test.cc @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "GenModelTest.h" + +TEST_F(GenModelTest, OneOp_RoPE) +{ + CircleGen cgen; + uint32_t sin_table_buf = cgen.addBuffer(std::vector{0.5, 1.0, 1.0, 0.5}); + int sin_table = + cgen.addTensor({{1, 1, 1, 4}, circle::TensorType::TensorType_FLOAT32, sin_table_buf}); + uint32_t cos_table_buf = cgen.addBuffer(std::vector{1.0, 0.5, 0.5, 1.0}); + int cos_table = + cgen.addTensor({{1, 1, 1, 4}, circle::TensorType::TensorType_FLOAT32, cos_table_buf}); + int in = cgen.addTensor({{1, 1, 1, 4}, circle::TensorType::TensorType_FLOAT32}); + int out = cgen.addTensor({{1, 1, 1, 4}, circle::TensorType::TensorType_FLOAT32}); + + cgen.addOperatorRoPE({{in, sin_table, cos_table}, {out}}, circle::RoPEMode_GPT_NEOX); + cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->addTestCase(uniformTCD({{0, 1.0, 2.0, 3.0}}, {{-1.0, -2.5, 1.0, 3.5}})); + _context->setBackends({"cpu"}); + + SUCCEED(); +} + +TEST_F(GenModelTest, neg_OneOp_RoPE_InvalidShape) +{ + CircleGen cgen; + uint32_t sin_table_buf = cgen.addBuffer(std::vector{0.5, 1.0, 1.0, 0.5}); + int sin_table = + cgen.addTensor({{1, 1, 1, 4}, circle::TensorType::TensorType_FLOAT32, sin_table_buf}); + uint32_t cos_table_buf = cgen.addBuffer(std::vector{1.0, 0.5, 0.5, 1.0}); + int cos_table = + cgen.addTensor({{1, 1, 1, 4}, circle::TensorType::TensorType_FLOAT32, cos_table_buf}); + int in = cgen.addTensor({{1, 1, 1, 4}, circle::TensorType::TensorType_FLOAT32}); + int out = cgen.addTensor({{1, 1, 1, 3}, circle::TensorType::TensorType_FLOAT32}); + + cgen.addOperatorRoPE({{in, sin_table, cos_table}, {out}}, circle::RoPEMode_GPT_NEOX); + cgen.setInputsAndOutputs({in}, {out}); + + _context = std::make_unique(cgen.finish()); + _context->expectFailCompile(); + + SUCCEED(); +} From 760c5d50774950e97ed0d097de444d21ca02aeb8 Mon Sep 17 00:00:00 2001 From: SeungHui Youn <61981457+zetwhite@users.noreply.github.com> Date: Mon, 21 Oct 2024 15:44:12 +0900 Subject: [PATCH 03/46] [onert/train] Update 
TensorPlanner log (#14236) This PR update log in TensorPlanner. ONE-DCO-1.0-Signed-off-by: seunghui youn --- runtime/onert/backend/train/TensorPlanner.cc | 24 ++++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/runtime/onert/backend/train/TensorPlanner.cc b/runtime/onert/backend/train/TensorPlanner.cc index 8deb2112673..19d17d138dc 100644 --- a/runtime/onert/backend/train/TensorPlanner.cc +++ b/runtime/onert/backend/train/TensorPlanner.cc @@ -37,7 +37,7 @@ TensorPlanner::TensorPlanner(const ir::train::TrainableGraph &tgraph, void TensorPlanner::planNonConstTensors(TensorBuilder *tensor_builder) { - VERBOSE(BackendContext) << "Start planning non-constant tensors" << std::endl; + VERBOSE(TensorPlanner) << "Start planning non-constant tensors" << std::endl; const auto &training_usedefs = _tgraph.trainingUseDefs(); @@ -207,12 +207,12 @@ void TensorPlanner::planNonConstTensors(TensorBuilder *tensor_builder) defs_map.begin(), defs_map.end(), [](std::pair it) { return it.second == 0; })); - VERBOSE(BackendContext) << "Finish planning non-constant tensors" << std::endl; + VERBOSE(TensorPlanner) << "Finish planning non-constant tensors" << std::endl; } void TensorPlanner::planTrainableTensors(TensorBuilder *tensor_builder) { - VERBOSE(BackendContext) << "Start planning constant tensors" << std::endl; + VERBOSE(TensorPlanner) << "Start planning constant tensors" << std::endl; const auto &training_usedefs = _tgraph.trainingUseDefs(); @@ -272,12 +272,12 @@ void TensorPlanner::planTrainableTensors(TensorBuilder *tensor_builder) defs_map.begin(), defs_map.end(), [](std::pair it) { return it.second == 0; })); - VERBOSE(BackendContext) << "Finish planning constant tensors" << std::endl; + VERBOSE(TensorPlanner) << "Finish planning constant tensors" << std::endl; } void TensorPlanner::planBackPropTensors(TensorBuilder *tensor_builder) { - VERBOSE(BackendContext) << "Start planning back-propagated tensors" << std::endl; + VERBOSE(TensorPlanner) << 
"Start planning back-propagated tensors" << std::endl; std::unordered_map uses_map; std::unordered_map defs_map; @@ -409,12 +409,12 @@ void TensorPlanner::planBackPropTensors(TensorBuilder *tensor_builder) defs_map.begin(), defs_map.end(), [](std::pair it) { return it.second == 0; })); - VERBOSE(BackendContext) << "Finish planning back-propagated tensors" << std::endl; + VERBOSE(TensorPlanner) << "Finish planning back-propagated tensors" << std::endl; } void TensorPlanner::planGradientTensors(TensorBuilder *tensor_builder) { - VERBOSE(BackendContext) << "Start planning gradient tensors" << std::endl; + VERBOSE(TensorPlanner) << "Start planning gradient tensors" << std::endl; // TODO Use DisposableTensor instead of GradientTensor to plan them together if possible // Backward layers and the corresponding GradientApplier exist in the same back-propagated @@ -453,12 +453,12 @@ void TensorPlanner::planGradientTensors(TensorBuilder *tensor_builder) } } - VERBOSE(BackendContext) << "Finish planning gradient tensors" << std::endl; + VERBOSE(TensorPlanner) << "Finish planning gradient tensors" << std::endl; } void TensorPlanner::planDisposableBackPropTensors(TensorBuilder *tensor_builder) { - VERBOSE(BackendContext) << "Start planning disposable back-prop tensors" << std::endl; + VERBOSE(TensorPlanner) << "Start planning disposable back-prop tensors" << std::endl; for (const auto &op_index : _tgraph.essentialBackwardOrder()) { @@ -487,7 +487,7 @@ void TensorPlanner::planDisposableBackPropTensors(TensorBuilder *tensor_builder) } } - VERBOSE(BackendContext) << "Finish planning disposable back-prop tensors" << std::endl; + VERBOSE(TensorPlanner) << "Finish planning disposable back-prop tensors" << std::endl; } ir::OperandIndexSequence TensorPlanner::getOutgoingBackPropSeq(const ir::OperationIndex &op_index, @@ -521,6 +521,8 @@ ir::OperandIndexSequence TensorPlanner::getOutgoingBackPropSeq(const ir::Operati void TensorPlanner::planLayerScopeTensors(TensorBuilder 
*tensor_builder) { + VERBOSE(TensorPlanner) << "Start planning layer scope tensors" << std::endl; + // forwading order const auto f_order = _tgraph.topolSortOperations(); for (const auto &op_index : f_order) @@ -560,6 +562,8 @@ void TensorPlanner::planLayerScopeTensors(TensorBuilder *tensor_builder) tensor_builder->notifyLayerScopeLastUse(idx); } } + + VERBOSE(TensorPlanner) << "Finish planning layerscope tensors" << std::endl; } } // namespace train From b480c5614773b1c7c657e2d0125d5a8cd1daf177 Mon Sep 17 00:00:00 2001 From: SeungHui Youn <61981457+zetwhite@users.noreply.github.com> Date: Mon, 21 Oct 2024 15:44:34 +0900 Subject: [PATCH 04/46] [onert/train] Register LayerScopeTensor to registry (#14235) This PR registers LayerScopeTensor from each layer into tensor registry. ONE-DCO-1.0-Signed-off-by: seunghui youn -------------------------------------- draft : https://github.com/Samsung/ONE/pull/13486 --- runtime/onert/backend/train/BackendContext.cc | 38 ++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/runtime/onert/backend/train/BackendContext.cc b/runtime/onert/backend/train/BackendContext.cc index 446fc68c244..d0a634b1460 100644 --- a/runtime/onert/backend/train/BackendContext.cc +++ b/runtime/onert/backend/train/BackendContext.cc @@ -262,7 +262,43 @@ FunctionMap BackendContext::generateFunctionMap() void BackendContext::planLayerScopeTensors([[maybe_unused]] const FunctionMap &fn_map) { - // TODO: Register LayerScopeTensors + const auto &ops = trainable_graph()->operations(); + + auto register_tensors = [this](const ir::OperationIndex &op_idx, + std::optional &&tensors) { + if (not tensors.has_value()) + return; + + auto ls_tensors = tensors.value(); + for (auto i = 0u; i < ls_tensors.size(); ++i) + { + LayerScopeTensorIndex tensor_idx(op_idx, i); + _tensor_builder->registerLayerScopeTensor(tensor_idx, ls_tensors[i]); + + VERBOSE(BackendContext) << "(idx:" << tensor_idx << ") registered" << std::endl; + } + return; + }; + + for 
(auto &pair : fn_map) + { + const auto &op_idx = pair.first; + auto &fn_seq = pair.second; + + const ir::IOperation *op = &ops.at(op_idx); + const auto trainable_op = dynamic_cast(op); + assert(trainable_op != nullptr); + + if (not trainable_op->isRequiredForBackward()) + continue; + + VERBOSE(BackendContext) << "register layerscope tensor for " << trainable_op->name() + << std::endl; + + fn_seq->iterate([&](exec::train::ITrainableFunction &fn) { + register_tensors(op_idx, (&fn)->registerLayerScopeTensors()); + }); + } const auto ctx_data = data(); TensorPlanner tensor_planner{*ctx_data->tgraph.get(), ctx_data->external_operands}; From 4331e4af3d61bea7e1fd8cfc2c1388b543c56bd7 Mon Sep 17 00:00:00 2001 From: Evgenii Maltsev Date: Mon, 21 Oct 2024 17:11:09 +0400 Subject: [PATCH 05/46] [onert-micro] This PR adds makefile example for TizenRT (#14221) Makefile for TizenRT was added for the [issue](https://github.com/Samsung/ONE/issues/14184) ONE-DCO-1.0-Signed-off-by: Evgenii Maltsev e.maltsev@samsung.com --- onert-micro/examples/TizenRT/Makefile | 340 ++++++++++++++++++++++++++ 1 file changed, 340 insertions(+) create mode 100644 onert-micro/examples/TizenRT/Makefile diff --git a/onert-micro/examples/TizenRT/Makefile b/onert-micro/examples/TizenRT/Makefile new file mode 100644 index 00000000000..89a949da881 --- /dev/null +++ b/onert-micro/examples/TizenRT/Makefile @@ -0,0 +1,340 @@ +########################################################################### +# +# Copyright 2024 Samsung Electronics All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +# either express or implied. 
See the License for the specific +# language governing permissions and limitations under the License. +# +########################################################################### + +-include $(TOPDIR)/.config +-include $(TOPDIR)/Make.defs + +OBJEXT ?= .o +ASRCS = +CXXSRCS = + +CXXFLAGS += -Wno-shadow -Wno-pedantic -fno-permissive +CXXFLAGS += -pipe -std=c++14 +CXXFLAGS += -fno-exceptions -fcheck-new -fno-rtti +CXXFLAGS += -ffunction-sections -fdata-sections +CXXFLAGS += -Os +# for using quantized models disable this flag +CXXFLAGS += -DDIS_QUANT +CXXFLAGS += -Wno-maybe-uninitialized +CXXFLAGS += -Wno-missing-field-initializers +CXXFLAGS += -Wno-type-limits -Wno-undef + +CFLAGS += -Wno-strict-prototypes + +ONERTMICRO_SRC_DIR = $(TOPDIR)/../external/onert-micro/onert-micro/onert-micro/src +ONERTMICRO_INCLUDE_DIR = $(TOPDIR)/../external/onert-micro/onert-micro/onert-micro/include +ONERTMICRO_PAL_MCU_DIR = $(TOPDIR)/../external/onert-micro/onert-micro/onert-micro/include/pal/mcu +ONERTMICRO_PAL_CMSISNN_DIR = $(TOPDIR)/../external/onert-micro/onert-micro/onert-micro/include/pal/cmsisnn +ONERTMICRO_PAL_COMMON_DIR = $(TOPDIR)/../external/onert-micro/onert-micro/onert-micro/include/pal/common +FLATBUFFER_DIR = $(TOPDIR)/../external/onert-micro +SCHEMA_DIR = $(TOPDIR)/../external/onert-micro/externals/gen + +CXXFLAGS += -I$(SCHEMA_DIR) -I$(ONERTMICRO_INCLUDE_DIR) -I$(ONERTMICRO_SRC_DIR) -I$(FLATBUFFER_DIR) +CXXFLAGS += -I$(ONERTMICRO_PAL_COMMON_DIR) + +ifeq ($(CONFIG_EXTERNAL_CMSIS_NN), y) +CXXFLAGS += -I$(ONERTMICRO_PAL_CMSISNN_DIR) +else +CXXFLAGS += -I$(ONERTMICRO_PAL_MCU_DIR) +endif + +CXXSRCS += $(ONERTMICRO_SRC_DIR)/OMInterpreter.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/core/memory/OMMemoryManager.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/core/memory/OMRuntimeAllocator.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/core/reader/OMCircleReader.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/core/reader/OMTrainingConfigFileReader.cpp +CXXSRCS += 
$(ONERTMICRO_SRC_DIR)/core/reader/OMWeightOnlyFormatReader.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/core/OMDataType.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/core/OMKernelType.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/core/OMRuntimeContext.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/core/OMRuntimeGraph.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/core/OMRuntimeModule.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/core/OMRuntimeStorage.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/core/OMUtils.cpp + +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/OMKernelExecute.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/OMKernelExecutionBuilder.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/OMRuntimeKernel.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/OMUtils.cpp + +#Execute Kernels +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Abs.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Add.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/AddN.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/ArgCommon.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/ArgMax.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/ArgMin.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/AveragePool2D.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/BatchToSpaceND.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Concatenation.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Conv2D.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/ConvolutionCommon.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Cos.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/DepthwiseConv2D.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Dequantize.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Div.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Exp.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Equal.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/ExpandDims.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Floor.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/FloorDiv.cpp +CXXSRCS += 
$(ONERTMICRO_SRC_DIR)/execute/kernels/FloorMod.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/FullyConnected.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Gather.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/GatherND.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Greater.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/GreaterEqual.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/L2Normalize.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/L2Pool2D.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/LeakyRelu.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Less.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/LessEqual.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Log.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Logistic.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/LogSoftmax.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/MathCommon.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Maximum.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/MaxPool2D.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Minimum.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Mul.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Neg.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/NotEqual.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Pad.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/PoolingCommon.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/ReadKernelDataCommon.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Relu.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Relu6.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/ReluCommon.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Reshape.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/ReshapeCommon.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Round.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Rsqrt.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Sin.cpp 
+CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Slice.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Softmax.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/SpacesBatchesNDCommon.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/SpaceToBatchND.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/SpaceToDepth.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Split.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Sqrt.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Square.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/SquaredDifference.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/StridedSlice.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Sub.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Tanh.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Transpose.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/TransposeConv.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Unpack.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/While.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Ceil.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Cast.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Elu.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Fill.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/SplitV.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Quantize.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/SVDF.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/GRU.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/execute/kernels/Pack.cpp + +# Import Kernels +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Abs.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Add.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/AddN.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/ArgMax.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/ArgMin.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/AveragePool2D.cpp +CXXSRCS += 
$(ONERTMICRO_SRC_DIR)/import/kernels/BatchToSpaceND.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Concatenation.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Conv2D.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Cos.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/DepthwiseConv2D.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Dequantize.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Div.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Exp.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Equal.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/ExpandDims.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Floor.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/FloorDiv.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/FloorMod.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/FullyConnected.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Gather.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/GatherND.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Greater.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/GreaterEqual.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/L2Normalize.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/L2Pool2D.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/LeakyRelu.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Less.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/LessEqual.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Log.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Logistic.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/LogSoftmax.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Maximum.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/MaxPool2D.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Minimum.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Mul.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Neg.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/NotEqual.cpp +CXXSRCS += 
$(ONERTMICRO_SRC_DIR)/import/kernels/Pad.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Relu.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Relu6.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Reshape.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Round.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Rsqrt.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Sin.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Slice.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Softmax.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/SpaceToBatchND.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/SpaceToDepth.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Split.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Sqrt.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Square.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/SquaredDifference.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/StridedSlice.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Sub.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Tanh.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Transpose.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/TransposeConv.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Unpack.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/While.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Ceil.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Cast.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Elu.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Fill.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/SplitV.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Quantize.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/SVDF.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/GRU.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/kernels/Pack.cpp + +# Import Helpers +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/helpers/OMConfigureSISOKernel.cpp +CXXSRCS += 
$(ONERTMICRO_SRC_DIR)/import/helpers/OMConfigureTISOKernel.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/helpers/OMPadCommon.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/helpers/OMPoolingCommon.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/helpers/OMSpacesBatchesNDCommon.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/helpers/OMFloorCommon.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/helpers/OMArgCommon.cpp + +# Import some utils +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/OMExecutionPlanCreator.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/OMKernelConfiguration.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/OMKernelConfigureBuilder.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/OMDynamicShapesHandler.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/import/OMUtils.cpp + +# Optimize +CXXSRCS += $(ONERTMICRO_SRC_DIR)/optimize/pass/FindInplaceOpPass.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/optimize/OMOptimizer.cpp + +# TRAINING +CXXSRCS += $(ONERTMICRO_SRC_DIR)/OMTrainingInterpreter.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/core/OMTrainingRuntimeModule.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/core/train/OMTrainingHandler.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/core/train/OMTrainingStorage.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/core/train/OMCheckpointSaver.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/core/train/OMCheckpointLoader.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/train/OMBackpropExecute.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/train/OMBackpropExecutionBuilder.cpp +#Train kernels +CXXSRCS += $(ONERTMICRO_SRC_DIR)/train/kernels/GRU.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/train/kernels/StridedSlice.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/train/kernels/Conv2D.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/train/kernels/FullyConnected.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/train/kernels/MaxPool2D.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/train/kernels/Reshape.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/train/kernels/Softmax.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/train/losses_functions/CrossEntropy.cpp +CXXSRCS += 
$(ONERTMICRO_SRC_DIR)/train/losses_functions/MSE.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/train/losses_functions/SparseCrossEntropy.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/train/metrics/Accuracy.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/train/metrics/CrossEntropy.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/train/metrics/MAE.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/train/metrics/MSE.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/train/metrics/SparseCrossEntropyAccuracy.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/train/train_optimizers/Adam.cpp +CXXSRCS += $(ONERTMICRO_SRC_DIR)/train/train_optimizers/SGD.cpp + +CFLAGS += -Wno-maybe-uninitialized +CFLAGS += -Wno-missing-field-initializers +CFLAGS += -Wno-pointer-sign +CFLAGS += -Wno-type-limits -Wno-undef + +AOBJS = $(ASRCS:.S=$(OBJEXT)) +CXXOBJS = $(CXXSRCS:.cpp=$(OBJEXT)) + +SRCS = $(ASRCS) $(CXXSRCS) +OBJS = $(AOBJS) $(CXXOBJS) + +ifeq ($(CONFIG_WINDOWS_NATIVE),y) + BIN = ..\libexternal$(LIBEXT) +else +ifeq ($(WINTOOL),y) + BIN = ..\\libexternal$(LIBEXT) +else + BIN = ../libexternal$(LIBEXT) +endif +endif + +DEPPATH = --dep-path . + +# Common build + +VPATH = + +all: .built +.PHONY: depend clean distclean chkcxx + +chkcxx: +ifneq ($(CONFIG_HAVE_CXX),y) + @echo "" + @echo "In order to use this example, you toolchain must support must" + @echo "" + @echo " (1) Explicitly select CONFIG_HAVE_CXX to build in C++ support" + @echo " (2) Define CXX, CXXFLAGS, and COMPILEXX in the Make.defs file" + @echo " of the configuration that you are using." 
+ @echo "" + @exit 1 +endif + +$(AOBJS): %$(OBJEXT): %.S + $(call ASSEMBLE, $<, $@) + +$(CXXOBJS) $(MAINOBJ): %$(OBJEXT): %.cpp + $(call COMPILEXX, $<, $@) + +.built: chkcxx $(OBJS) + $(call ARCHIVE, $(BIN), $(OBJS)) + $(Q) touch .built + +.depend: Makefile $(SRCS) + $(Q) $(MKDEP) $(DEPPATH) "$(CXX)" -- $(CXXFLAGS) -- $(SRCS) >Make.dep + $(Q) touch $@ + +depend: .depend + +clean: + $(call DELFILE, .built) + $(call CLEAN) + $(foreach FILE, $(CXXOBJS), $(call DELFILE, $(FILE))) + +distclean: clean + $(call DELFILE, Make.dep) + $(call DELFILE, .depend) + +-include Make.dep From a92b6c19ced3ecb86705d25a8617011d18879a11 Mon Sep 17 00:00:00 2001 From: seongwoo chae Date: Tue, 22 Oct 2024 08:27:04 +0900 Subject: [PATCH 06/46] [fme-apply] Change phase strategy and find depth (#14240) This commit changes phase strategy and find depth. ONE-DCO-1.0-Signed-off-by: seongwoo --- compiler/fme-apply/src/FMEqualizer.cpp | 4 ++-- compiler/fme-apply/src/InsertScaleShift.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/compiler/fme-apply/src/FMEqualizer.cpp b/compiler/fme-apply/src/FMEqualizer.cpp index 3dcd79eb032..7f81a96a7fb 100644 --- a/compiler/fme-apply/src/FMEqualizer.cpp +++ b/compiler/fme-apply/src/FMEqualizer.cpp @@ -82,8 +82,8 @@ void FMEqualizer::equalize(loco::Graph *g, std::vector &p) phase.emplace_back(std::make_unique()); phase.emplace_back(std::make_unique()); - ProgressReporter prog(g, logo::PhaseStrategy::Restart); - logo::PhaseRunner phase_runner{g}; + ProgressReporter prog(g, logo::PhaseStrategy::Saturate); + logo::PhaseRunner phase_runner{g}; phase_runner.attach(&prog); phase_runner.run(phase); diff --git a/compiler/fme-apply/src/InsertScaleShift.cpp b/compiler/fme-apply/src/InsertScaleShift.cpp index 4aa040a5dff..27469a1327f 100644 --- a/compiler/fme-apply/src/InsertScaleShift.cpp +++ b/compiler/fme-apply/src/InsertScaleShift.cpp @@ -301,7 +301,7 @@ struct InsertScaleShiftVisitor final : public luci::CircleNodeMutableVisitorfront, 
support_depth); if (not front_node) { From ba739adf9218cc87f8f5c989a814ff4546ff39e8 Mon Sep 17 00:00:00 2001 From: seongwoo chae Date: Tue, 22 Oct 2024 09:46:13 +0900 Subject: [PATCH 07/46] [fme-detect] Support one more forwardable successors (#14241) This commit supports one more forwardable successors. ONE-DCO-1.0-Signed-off-by: seongwoo --- .../fme-detect/src/EqualizePatternFinder.cpp | 30 +++++++++++++------ 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/compiler/fme-detect/src/EqualizePatternFinder.cpp b/compiler/fme-detect/src/EqualizePatternFinder.cpp index b856f03a34d..369b3eba478 100644 --- a/compiler/fme-detect/src/EqualizePatternFinder.cpp +++ b/compiler/fme-detect/src/EqualizePatternFinder.cpp @@ -186,8 +186,11 @@ void match(luci::CircleNode *front, std::vector &res) throw std::invalid_argument("front"); auto front_fusability = fusability(front); - - for (auto succ : loco::succs(front)) + auto succs = loco::succs(front); + // TODO Support multiple successors. + if (succs.size() != 1) + return; + for (auto succ : succs) { // Check succ fusability auto back = loco::must_cast(succ); @@ -201,15 +204,24 @@ void match(luci::CircleNode *front, std::vector &res) auto f = forwardable(back); if (f.scale_forwardable) { - auto succ_succs = loco::succs(back); + auto back_succs = loco::succs(back); // Only support single successor for simplicity - if (succ_succs.size() != 1) + if (back_succs.size() != 1) continue; - auto next_succ = *succ_succs.begin(); - auto next_back = loco::must_cast(next_succ); - back_fusability = fusability(next_back); - back_fusability.pre_scale &= f.scale_forwardable; - back = next_back; + back = loco::must_cast(*back_succs.begin()); + back_fusability = fusability(back); + if (not back_fusability.pre_scale) + { + f = forwardable(back); + if (f.scale_forwardable) + { + back_succs = loco::succs(back); + if (back_succs.size() != 1) + continue; + back = loco::must_cast(*back_succs.begin()); + back_fusability = fusability(back); + } 
+ } } } From ff5201ffdaec0c17ad203dbbe18efe8aab629680 Mon Sep 17 00:00:00 2001 From: seongwoo chae Date: Tue, 22 Oct 2024 11:25:42 +0900 Subject: [PATCH 08/46] [fme-detect] Do not forward Gelu (#14242) This commit doesn't forward Gelu when finding a pattern. ONE-DCO-1.0-Signed-off-by: seongwoo --- compiler/fme-detect/src/EqualizePatternFinder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler/fme-detect/src/EqualizePatternFinder.cpp b/compiler/fme-detect/src/EqualizePatternFinder.cpp index 369b3eba478..543522ac163 100644 --- a/compiler/fme-detect/src/EqualizePatternFinder.cpp +++ b/compiler/fme-detect/src/EqualizePatternFinder.cpp @@ -173,7 +173,7 @@ Forwardable forwardable(luci::CircleNode *node) case luci::CircleOpcode::LEAKY_RELU: return {true, false}; case luci::CircleOpcode::GELU: - return {true, false}; + return {false, false}; default: return {false, false}; } From f280955d33b01b794c6bcc9f36c3bc9ec810d285 Mon Sep 17 00:00:00 2001 From: Evgenii Maltsev Date: Wed, 23 Oct 2024 06:54:05 +0400 Subject: [PATCH 09/46] [onert-micro] support weight quantized (int8) FullyConnected kernel (#14137) - FullyConnected kernel (input:FLOAT32 + weights:INT8) ONE-DCO-1.0-Signed-off-by: Evgenii Maltsev e.maltsev@samsung.com --- .../onert-micro/include/core/OMKernelData.h | 2 + .../pal/common/PALFullyConnectedCommon.h | 31 +++++-- .../FloatFullyConnectedKernel.h | 90 +++++++++++++++++++ .../src/execute/kernels/FullyConnected.cpp | 34 +++++-- .../kernels/tests/FullyConnected.test.cpp | 9 ++ .../src/import/kernels/FullyConnected.cpp | 64 +++++++++++-- 6 files changed, 212 insertions(+), 18 deletions(-) diff --git a/onert-micro/onert-micro/include/core/OMKernelData.h b/onert-micro/onert-micro/include/core/OMKernelData.h index d0ab251777e..64bf96f0521 100644 --- a/onert-micro/onert-micro/include/core/OMKernelData.h +++ b/onert-micro/onert-micro/include/core/OMKernelData.h @@ -186,6 +186,8 @@ struct FullyConnectedParams int32_t weights_offset; int32_t 
output_offset; int32_t output_multiplier; + const float *weights_scales; + bool is_channel_wise_quant; int output_shift; // uint8_t, etc, activation params. int32_t quantized_activation_min; diff --git a/onert-micro/onert-micro/include/pal/common/PALFullyConnectedCommon.h b/onert-micro/onert-micro/include/pal/common/PALFullyConnectedCommon.h index e0cd74cf8f5..69908232510 100644 --- a/onert-micro/onert-micro/include/pal/common/PALFullyConnectedCommon.h +++ b/onert-micro/onert-micro/include/pal/common/PALFullyConnectedCommon.h @@ -76,12 +76,11 @@ OMStatus FullyConnected(const core::FullyConnectedParams ¶ms, const InputTyp return Ok; } -template <> -OMStatus inline FullyConnected(const core::FullyConnectedParams ¶ms, - const float *input_data, - const core::OMRuntimeShape &filter_shape, - const float *filter_data, const float *bias_data, - const core::OMRuntimeShape &output_shape, float *output_data) +template +OMStatus inline FullyConnected(const core::FullyConnectedParams ¶ms, const float *input_data, + const core::OMRuntimeShape &filter_shape, + const WeightType *filter_data, const float *bias_data, + const core::OMRuntimeShape &output_shape, float *output_data) { const float output_activation_min = params.float_activation_min; const float output_activation_max = params.float_activation_max; @@ -93,12 +92,24 @@ OMStatus inline FullyConnected(const core::FullyConnectedParams ¶ms, for (int b = 0; b < batches; ++b) { + const float *weight_scale_ptr = params.weights_scales; for (int out_c = 0; out_c < output_depth; ++out_c) { float total = 0.f; for (int d = 0; d < accum_depth; ++d) { - total += input_data[b * accum_depth + d] * filter_data[out_c * accum_depth + d]; + auto input_value = input_data[b * accum_depth + d]; + if (std::is_same::value) + { + total += input_value * filter_data[out_c * accum_depth + d]; + } + else + { + const float filter_scale = *weight_scale_ptr; + const float filter_value = + static_cast(filter_data[out_c * accum_depth + d]) * filter_scale; 
+ total += input_value * filter_value; + } } float bias_value = 0.0f; if (bias_data) @@ -107,6 +118,12 @@ OMStatus inline FullyConnected(const core::FullyConnectedParams ¶ms, } output_data[out_c + output_depth * b] = std::min(std::max(total + bias_value, output_activation_min), output_activation_max); + + if (std::is_same::value) + { + if (params.is_channel_wise_quant) + weight_scale_ptr++; + } } } return Ok; diff --git a/onert-micro/onert-micro/include/test_models/fully_connected/FloatFullyConnectedKernel.h b/onert-micro/onert-micro/include/test_models/fully_connected/FloatFullyConnectedKernel.h index 00442fa939a..6f5f62db939 100644 --- a/onert-micro/onert-micro/include/test_models/fully_connected/FloatFullyConnectedKernel.h +++ b/onert-micro/onert-micro/include/test_models/fully_connected/FloatFullyConnectedKernel.h @@ -96,6 +96,82 @@ const std::vector reference_output_data = {263.84323, 260.84323, 259.8432 } // namespace fully_connected_float +namespace fully_connected_float_weights_quantized_int8 +{ + +/* + * FullyConnected Kernel: + * Input - float32 + * Weight - int8 + * Bias - float32 + * Out - float32 + * + * Input(1, 4) Weight(4, 4) Bias(4) + * \ | / + * \ | / + * FullyConnected + * | + * Output(1, 4) + */ + +const unsigned char test_kernel_model_circle[] = { + 0x20, 0x00, 0x00, 0x00, 0x43, 0x49, 0x52, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x12, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x00, 0x00, 0x14, 0x00, + 0x12, 0x00, 0x00, 0x00, 0xd8, 0x00, 0x00, 0x00, 0xf4, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0xe0, 0x02, 0x00, 0x00, + 0xc8, 0x02, 0x00, 0x00, 0x24, 0x02, 0x00, 0x00, 0xc0, 0x01, 0x00, 0x00, 0x90, 0x01, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0xcc, 0xff, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x00, 
0x00, 0x4f, 0x4e, 0x45, 0x5f, 0x6f, 0x70, 0x5f, 0x74, 0x61, 0x62, 0x6c, 0x65, + 0x00, 0x00, 0x00, 0x00, 0x22, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x04, 0x00, 0x08, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x4f, 0x4e, 0x45, 0x5f, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x5f, 0x74, 0x61, 0x62, 0x6c, 0x65, + 0x00, 0x00, 0x00, 0x00, 0x62, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6f, 0x75, 0x74, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, + 0x6e, 0x6e, 0x70, 0x61, 0x63, 0x6b, 0x61, 0x67, 0x65, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x0c, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09, 0x09, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, + 0x10, 0x00, 0x14, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x7c, 0x01, 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, + 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x14, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, + 0x07, 0x00, 0x10, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x18, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x90, 0xfe, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0xe0, 0xfe, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x6f, 0x75, 0x74, 0x00, 0xc4, 0xfe, 0xff, 0xff, + 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0xff, 0xff, 0xff, + 0x34, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x00, 0x00, 0xa6, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x40, 0xc0, + 0x00, 0x00, 0x80, 0x40, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, + 0x18, 0x00, 0x08, 0x00, 0x07, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x09, 0x94, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x77, 0x65, 0x69, 0x67, 0x68, 0x74, 0x00, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x55, 0x7f, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x7f, 0x00, + 0x00, 0x00, 0x00, 0x7f, 0x0c, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x06, 0x83, 0xc1, 0x3c, 0x04, 0x02, 0x01, 0x3d, + 0x85, 0x42, 0x21, 0x3d, 0x06, 0x83, 0x41, 0x3d, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 
0x00, + 0x02, 0x00, 0x00, 0x00, 0x69, 0x6e, 0x00, 0x00, 0xf0, 0xff, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00}; + +const std::vector input_data = {17.491695, 15.660671, 4.7347794, -15.796822}; + +const std::vector reference_output_data = {-19.529659, 60.642685, 20.673897, -90.780930}; + +} // namespace fully_connected_float_weights_quantized_int8 + class TestDataFloatFullyConnected : public TestDataFullyConnectedBase { public: @@ -109,6 +185,20 @@ class TestDataFloatFullyConnected : public TestDataFullyConnectedBase ~TestDataFloatFullyConnected() override = default; }; +class TestDataFloatWQInt8FullyConnected : public TestDataFullyConnectedBase +{ +public: + TestDataFloatWQInt8FullyConnected() + { + _input_data = fully_connected_float_weights_quantized_int8::input_data; + _reference_output_data = fully_connected_float_weights_quantized_int8::reference_output_data; + _test_kernel_model_circle = + fully_connected_float_weights_quantized_int8::test_kernel_model_circle; + } + + ~TestDataFloatWQInt8FullyConnected() override = default; +}; + } // namespace test_model } // namespace onert_micro diff --git a/onert-micro/onert-micro/src/execute/kernels/FullyConnected.cpp b/onert-micro/onert-micro/src/execute/kernels/FullyConnected.cpp index 981e93df324..89d3482a3fc 100644 --- a/onert-micro/onert-micro/src/execute/kernels/FullyConnected.cpp +++ b/onert-micro/onert-micro/src/execute/kernels/FullyConnected.cpp @@ -147,11 +147,35 @@ onert_micro::execute::execute_kernel_CircleFullyConnected(const OMExecuteArgs &e if (status != Ok) return status; - status = - pal::FullyConnected(params, core::utils::castInputData(input_data), - OMRuntimeShape(weight), core::utils::castInputData(weight_data), - core::utils::castInputData(bias_data), OMRuntimeShape(output), - core::utils::castOutputData(output_data)); + switch (weight->type()) + { + case circle::TensorType_FLOAT32: + { + + status = 
pal::FullyConnected( + params, core::utils::castInputData(input_data), OMRuntimeShape(weight), + core::utils::castInputData(weight_data), + core::utils::castInputData(bias_data), OMRuntimeShape(output), + core::utils::castOutputData(output_data)); + } + break; + case circle::TensorType_INT8: + { + // weight quantized INT8 mode + params.weights_scales = + reinterpret_cast(weight->quantization()->scale()->data()); + params.is_channel_wise_quant = weight->quantization()->scale()->size() > 1; + + status = pal::FullyConnected( + params, core::utils::castInputData(input_data), OMRuntimeShape(weight), + core::utils::castInputData(weight_data), + core::utils::castInputData(bias_data), OMRuntimeShape(output), + core::utils::castOutputData(output_data)); + } + break; + default: + assert(false && "Unsupported hybrid weight type"); + } } break; #endif // DIS_FLOAT diff --git a/onert-micro/onert-micro/src/execute/kernels/tests/FullyConnected.test.cpp b/onert-micro/onert-micro/src/execute/kernels/tests/FullyConnected.test.cpp index 5085341b761..a61e9cda715 100644 --- a/onert-micro/onert-micro/src/execute/kernels/tests/FullyConnected.test.cpp +++ b/onert-micro/onert-micro/src/execute/kernels/tests/FullyConnected.test.cpp @@ -41,6 +41,15 @@ TEST_F(FullyConnectedTest, Float_P) EXPECT_THAT(output_data_vector, test_data_kernel.get_output_data_by_index(0)); } +// test hybrid kernel input:float32 + weight:int8 +TEST_F(FullyConnectedTest, FloatWQInt8_P) +{ + onert_micro::test_model::TestDataFloatWQInt8FullyConnected test_data_kernel; + std::vector output_data_vector = + onert_micro::execute::testing::checkKernel(1, &test_data_kernel); + EXPECT_THAT(output_data_vector, test_data_kernel.get_output_data_by_index(0)); +} + TEST_F(FullyConnectedTest, S8_P) { onert_micro::test_model::TestDataS8FullyConnected test_data_kernel; diff --git a/onert-micro/onert-micro/src/import/kernels/FullyConnected.cpp b/onert-micro/onert-micro/src/import/kernels/FullyConnected.cpp index e7bd5a4b71a..f9e401e9dbd 
100644 --- a/onert-micro/onert-micro/src/import/kernels/FullyConnected.cpp +++ b/onert-micro/onert-micro/src/import/kernels/FullyConnected.cpp @@ -40,6 +40,7 @@ constexpr uint32_t outputTensorIdx = 0; OMStatus onert_micro::import::configure_kernel_CircleFullyConnected(const OMConfigureArgs &config_args) { + OMRuntimeContext &runtime_context = config_args.runtime_context; uint16_t op_index = config_args.kernel_index; OMRuntimeStorage &runtime_storage = config_args.runtime_storage; @@ -50,7 +51,6 @@ onert_micro::import::configure_kernel_CircleFullyConnected(const OMConfigureArgs const circle::Tensor *input = runtime_kernel.inputs[inputTensorIdx]; const circle::Tensor *weight = runtime_kernel.inputs[weightTensorIdx]; const circle::Tensor *bias = runtime_kernel.inputs[biasTensorIdx]; - const circle::Tensor *output = runtime_kernel.outputs[outputTensorIdx]; assert(input != nullptr); @@ -60,13 +60,65 @@ onert_micro::import::configure_kernel_CircleFullyConnected(const OMConfigureArgs OMStatus status = Ok; - if ((input->type() == circle::TensorType_FLOAT32 && - weight->type() != circle::TensorType_FLOAT32) or - (input->type() == circle::TensorType_INT8 && weight->type() != circle::TensorType_INT8) or - (input->type() == circle::TensorType_INT16 && weight->type() != circle::TensorType_INT16)) +#ifndef DIS_FLOAT + if (weight->type() == circle::TensorType_FLOAT32) { - return UnsupportedType; + + status = utils::checkCondition(input->type() == circle::TensorType_FLOAT32 and + output->type() == circle::TensorType_FLOAT32 and + (!bias or bias->type() == circle::TensorType_FLOAT32)); + if (status != Ok) + return status; + } +#endif // DIS_FLOAT +#ifndef DIS_QUANT + if (weight->type() == circle::TensorType_UINT8) + { + + status = utils::checkCondition(input->type() == circle::TensorType_UINT8 and + output->type() == circle::TensorType_UINT8 and + (!bias or bias->type() == circle::TensorType_INT32)); + if (status != Ok) + return status; } + else if (weight->type() == 
circle::TensorType_INT8) + { + status = utils::checkCondition(input->type() == circle::TensorType_INT8 or + input->type() == circle::TensorType_FLOAT32); + if (status != Ok) + return status; + + status = utils::checkCondition(output->type() == circle::TensorType_INT8 or + output->type() == circle::TensorType_FLOAT32); + if (status != Ok) + return status; + + status = utils::checkCondition(!bias or bias->type() == circle::TensorType_INT32 or + bias->type() == circle::TensorType_INT64 or + bias->type() == circle::TensorType_FLOAT32); + if (status != Ok) + return status; + + if (input->type() == circle::TensorType_FLOAT32) + { + // hybrid mode + // Check it is channel wise quantization + status = utils::checkCondition(weight->quantization() != nullptr and + weight->quantization()->scale() != nullptr); + if (status != Ok) + return status; + } + } + else if (weight->type() == circle::TensorType_INT16) + { + + status = utils::checkCondition(input->type() == circle::TensorType_INT16 and + output->type() == circle::TensorType_INT16 and + (!bias or bias->type() == circle::TensorType_INT32)); + if (status != Ok) + return status; + } +#endif // DIS_QUANT core::OMRuntimeShape weight_shape(weight); core::OMRuntimeShape bias_shape(bias); From a1612dab59f652c7d2a63c2e3c0f16ff195b0e06 Mon Sep 17 00:00:00 2001 From: seongwoo chae Date: Thu, 24 Oct 2024 12:41:51 +0900 Subject: [PATCH 10/46] [fme-apply] Apply abs to weight (#14243) This commit applies abs to weight. 
ONE-DCO-1.0-Signed-off-by: seongwoo --- compiler/fme-apply/src/InsertScaleShift.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/compiler/fme-apply/src/InsertScaleShift.cpp b/compiler/fme-apply/src/InsertScaleShift.cpp index 27469a1327f..904128ed95f 100644 --- a/compiler/fme-apply/src/InsertScaleShift.cpp +++ b/compiler/fme-apply/src/InsertScaleShift.cpp @@ -125,7 +125,8 @@ bool calculate_smooth_quant_scale(luci::CircleNode *node, EqualizePattern *p) cur = i; for (uint32_t j = 0; j < norm_dim; j++) { - weight_max.at(i) = std::max(weight_max.at(i), weight->at(cur)); + weight_max.at(i) = + std::max(weight_max.at(i), std::abs(weight->at(cur))); cur += weight_I; } } @@ -166,7 +167,8 @@ bool calculate_smooth_quant_scale(luci::CircleNode *node, EqualizePattern *p) cur = i; for (uint32_t j = 0; j < weight_O; j++) { - weight_max.at(i) = std::max(weight_max.at(i), weight->at(cur)); + weight_max.at(i) = + std::max(weight_max.at(i), std::abs(weight->at(cur))); cur += weight_I; } } From 9d5e68dcffd86d59695def1cccbd2fecfe920498 Mon Sep 17 00:00:00 2001 From: seongwoo chae Date: Thu, 24 Oct 2024 12:42:24 +0900 Subject: [PATCH 11/46] [dalgona] Add bool type (#14248) This commit adds bool type in numpyArray function. 
ONE-DCO-1.0-Signed-off-by: seongwoo --- compiler/dalgona/src/Utils.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/compiler/dalgona/src/Utils.cpp b/compiler/dalgona/src/Utils.cpp index a5b0bb52985..bcd97038bde 100644 --- a/compiler/dalgona/src/Utils.cpp +++ b/compiler/dalgona/src/Utils.cpp @@ -68,6 +68,8 @@ py::array numpyArray(const Tensor *tensor) return py::array_t(shape, tensor->data()); case loco::DataType::U8: return py::array_t(shape, tensor->data()); + case loco::DataType::BOOL: + return py::array_t(shape, tensor->data()); default: throw std::runtime_error("Unsupported data type"); } From e24c111c0e55811eabce01232664ba6c403b71e6 Mon Sep 17 00:00:00 2001 From: seongwoo chae Date: Thu, 24 Oct 2024 12:43:07 +0900 Subject: [PATCH 12/46] [dalgona] Consider no bias in FullyConnected (#14249) This commit considers FullyConnected with no bias. ONE-DCO-1.0-Signed-off-by: seongwoo --- compiler/dalgona/src/PostOperatorHook.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/compiler/dalgona/src/PostOperatorHook.h b/compiler/dalgona/src/PostOperatorHook.h index 00c5d461c0c..0020a457596 100644 --- a/compiler/dalgona/src/PostOperatorHook.h +++ b/compiler/dalgona/src/PostOperatorHook.h @@ -187,12 +187,17 @@ class PostOperatorHook final : public luci::CircleNodeVisitor POST_OPERATOR_HOOK_PROLOGUE(FullyConnected) auto fused_act = node->fusedActivationFunction(); - + py::dict bias; + // bias is optional + if (inputs.size() == 3) + { + bias = inputs[2]; + } pySafeCall(hook, node->name(), // name inputs[0], // input inputs[1], // weights - inputs[2], // bias + bias, // bias output, // output toString(fused_act) // fused activation ); From 9c49d99d4a165d702b1a46ccd8076ebfdd2594a8 Mon Sep 17 00:00:00 2001 From: seongwoo chae Date: Thu, 24 Oct 2024 12:44:04 +0900 Subject: [PATCH 13/46] [fme-apply] Consider no bias FullyConnected (#14250) This commit considers FullyConnected with no bias. 
ONE-DCO-1.0-Signed-off-by: seongwoo --- .../fme-apply/src/pass/FusePostScalePass.cpp | 46 ++++++++++++------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/compiler/fme-apply/src/pass/FusePostScalePass.cpp b/compiler/fme-apply/src/pass/FusePostScalePass.cpp index 6097eda32e2..a8519c10edd 100644 --- a/compiler/fme-apply/src/pass/FusePostScalePass.cpp +++ b/compiler/fme-apply/src/pass/FusePostScalePass.cpp @@ -248,7 +248,7 @@ struct FusePostScale final : public luci::CircleNodeMutableVisitor auto param = loco::must_cast(post_scale->inputs(1)); // FIX_PostScale_UNLESS auto filter = loco::must_cast(node->weights()); - auto bias = loco::must_cast(node->bias()); + luci::CircleConst *bias = dynamic_cast(node->bias()); uint32_t filter_o = filter->dim(0).value(); uint32_t filter_i = filter->dim(1).value(); @@ -259,26 +259,34 @@ struct FusePostScale final : public luci::CircleNodeMutableVisitor throw std::runtime_error("Mismatch between scale size and filter output channel size: " + std::to_string(filter_o) + " != " + std::to_string(param_size)); } - const auto bias_size = bias->size(); - if (bias_size != param_size) + if (bias) { - throw std::runtime_error("Mismatch between scale size and bias size: " + - std::to_string(bias_size) + " != " + std::to_string(param_size)); + const auto bias_size = bias->size(); + if (bias_size != param_size) + { + throw std::runtime_error("Mismatch between scale size and bias size: " + + std::to_string(bias_size) + " != " + std::to_string(param_size)); + } } auto cloned_fc = luci::clone_node(node, node->graph()); assert(cloned_fc != nullptr); // FIX_CALLER_UNLESS auto fused_fc = loco::must_cast(cloned_fc); auto fused_filter = luci::clone(filter); - auto fused_bias = luci::clone(bias); fused_fc->name(node->name() + "_fused_" + random_str()); fused_filter->name(filter->name() + "_fused_" + random_str()); - fused_bias->name(bias->name() + "_fused_" + random_str()); add_origin(fused_fc, luci::get_origin(node)); 
add_origin(fused_filter, luci::get_origin(filter)); - add_origin(fused_bias, luci::get_origin(bias)); + + luci::CircleConst *fused_bias = nullptr; + if (bias) + { + fused_bias = luci::clone(bias); + fused_bias->name(bias->name() + "_fused_" + random_str()); + add_origin(fused_bias, luci::get_origin(bias)); + } // Multiply param to weights for (uint32_t o = 0; o < filter_o; o++) @@ -294,17 +302,21 @@ struct FusePostScale final : public luci::CircleNodeMutableVisitor } } - // Multiply param to bias - for (uint32_t c = 0; c < filter_o; ++c) - { - float scale = param->at(c); - fused_bias->at(c) = - fused_bias->at(c) * scale; - } - fused_fc->input(node->input()); fused_fc->weights(fused_filter); - fused_fc->bias(fused_bias); + fused_fc->bias(node->bias()); + + if (bias) + { + // Multiply param to bias + for (uint32_t c = 0; c < filter_o; ++c) + { + float scale = param->at(c); + fused_bias->at(c) = + fused_bias->at(c) * scale; + } + fused_fc->bias(fused_bias); + } loco::replace(post_scale).with(fused_fc); From 1ba697000029bad486a492b1b78efceeffeb69bc Mon Sep 17 00:00:00 2001 From: seockho-kim Date: Thu, 24 Oct 2024 16:31:52 +0900 Subject: [PATCH 14/46] [compute/cker] Fix RMSNorm shape assert error (#14247) * [compute/cker] Fix RMSNorm shape assert error This commit fixes shape assert error when running model including RMSNorm operation. ONE-DCO-1.0-Signed-off-by: Seockho Kim seockho.kim@samsung.com * [compute/cker] Add RMSNorm unittests Unit test added for RMSNorm to test rank 3 input. 
ONE-DCO-1.0-Signed-off-by: Seockho Kim seockho.kim@samsung.com --- compute/cker/include/cker/operation/RmsNorm.h | 6 ++--- compute/cker/src/RmsNorm.test.cc | 25 ++++++++++++++++++- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/compute/cker/include/cker/operation/RmsNorm.h b/compute/cker/include/cker/operation/RmsNorm.h index dee3f618428..19eb3981e15 100644 --- a/compute/cker/include/cker/operation/RmsNorm.h +++ b/compute/cker/include/cker/operation/RmsNorm.h @@ -68,9 +68,9 @@ inline void RmsNorm(const RmsNormParams ¶ms, const Shape &input_shape, const } else if (input_shape.DimensionsCount() == 3) { - const int32_t heights = MatchingDim(input_shape, 1, output_shape, 0); - const int32_t widths = MatchingDim(input_shape, 2, output_shape, 1); - const int32_t channels = MatchingDim(input_shape, 3, output_shape, 2); + const int32_t heights = MatchingDim(input_shape, 0, output_shape, 0); + const int32_t widths = MatchingDim(input_shape, 1, output_shape, 1); + const int32_t channels = MatchingDim(input_shape, 2, output_shape, 2); for (int32_t height = 0; height < heights; height++) { diff --git a/compute/cker/src/RmsNorm.test.cc b/compute/cker/src/RmsNorm.test.cc index 926524d5860..6b84b49ff62 100644 --- a/compute/cker/src/RmsNorm.test.cc +++ b/compute/cker/src/RmsNorm.test.cc @@ -43,7 +43,7 @@ TEST(CKer_Operation, RmsNorm) EXPECT_NEAR(output[i], expected_output[i], 1e-5f); } - // Default gamma + // rank 4 { std::vector input = {0, 1, 2, 3, 4, 5, 6, 7}; nnfw::cker::Shape input_shape{1, 2, 2, 2}; @@ -65,6 +65,29 @@ TEST(CKer_Operation, RmsNorm) for (size_t i = 0; i < expected_output.size(); ++i) EXPECT_NEAR(output[i], expected_output[i], 1e-5f); } + + // rank 3 + { + std::vector input = {0, 1, 2, 3, 4, 5, 6, 7}; + nnfw::cker::Shape input_shape{2, 2, 2}; + + std::vector expected_output = {0, 1.412802, 0.784404, 1.176606, + 0.883431, 1.104288, 0.920347, 1.073738}; + std::vector output(expected_output.size()); + nnfw::cker::Shape output_shape{2, 2, 2}; + + 
std::vector gamma = {1, 1}; + nnfw::cker::Shape gamma_shape{2}; + + nnfw::cker::RmsNormParams param; + param.epsilon = 0.001f; + + nnfw::cker::RmsNorm(param, input_shape, input.data(), gamma_shape, gamma.data(), output_shape, + output.data()); + + for (size_t i = 0; i < expected_output.size(); ++i) + EXPECT_NEAR(output[i], expected_output[i], 1e-5f); + } } TEST(CKer_Operation, neg_RmsNormWrongInputDims) From 153edec38dbc91dd3354bda7fb1e237d24928865 Mon Sep 17 00:00:00 2001 From: chunseoklee Date: Thu, 24 Oct 2024 19:25:08 +0900 Subject: [PATCH 15/46] [res] Add recipe for Decomposed GRU (#14251) This PR adds recipe for decomposed GRU(This recipe will be used for validating --fuse_gru pass in circle2circle) ONE-DCO-1.0-Signed-off-by: Artem Balyshev ONE-DCO-1.0-Signed-off-by: Chunseok Lee --- .../Net_Decomposed_GRU_000/test.recipe | 857 ++++++++++++++++++ 1 file changed, 857 insertions(+) create mode 100644 res/TensorFlowLiteRecipes/Net_Decomposed_GRU_000/test.recipe diff --git a/res/TensorFlowLiteRecipes/Net_Decomposed_GRU_000/test.recipe b/res/TensorFlowLiteRecipes/Net_Decomposed_GRU_000/test.recipe new file mode 100644 index 00000000000..2d8c7bfe783 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_Decomposed_GRU_000/test.recipe @@ -0,0 +1,857 @@ +operand { + name: "TensorArrayV2_1" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 2 + } + filler { + tag: "explicit" + arg: "0" + arg: "0" + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "time" + type: INT32 + shape { + } + filler { + tag: "explicit" + arg: "0" + } + quant { + quantized_dimension: 0 + } + is_variable: false +} + +operand { + name: "sequential/gru/zeros" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + filler { + tag: "explicit" + arg: "0" + arg: "0" + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "transpose1" + type: FLOAT32 + shape { + dim: 2 + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false 
+} +operand { + name: "while" + type: INT32 + shape { + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "while1" + type: INT32 + shape { + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "while2" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "while3" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "while4" + type: FLOAT32 + shape { + dim: 2 + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "StatefulPartitionedCall:0" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operation { + type: "While" + input: "time" + input: "time" + input: "TensorArrayV2_1" + input: "sequential/gru/zeros" + input: "transpose1" + output: "while" + output: "while1" + output: "while2" + output: "while3" + output: "while4" + while_options { + cond_subgraph_index: 1 + body_subgraph_index: 2 + } +} +input: "transpose1" +output: "while2" +# This is cond subgraph +graph { + operand { + name: "arg0" + type: INT32 + shape { + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "arg1" + type: INT32 + shape { + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "arg2" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "arg3" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "arg4" + type: FLOAT32 + shape { + dim: 2 + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "strided_slice1" + type: INT32 + shape { + } + filler { + tag: "explicit" + arg: "2" + } 
+ quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/Less" + type: BOOL + shape { + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operation { + type: "Less" + input: "arg1" + input: "strided_slice1" + output: "while/Less" + } + input: "arg0" + input: "arg1" + input: "arg2" + input: "arg3" + input: "arg4" + output: "while/Less" +} + +# This is body subgraph +graph { + operand { + name: "arg0" + type: INT32 + shape { + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "arg1" + type: INT32 + shape { + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "arg2" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "arg3" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "arg4" + type: FLOAT32 + shape { + dim: 2 + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/TensorArrayV2Write/TensorListSetItem" + type: INT32 + shape { + dim: 3 + } + filler { + tag: "explicit" + arg: "1" + arg: "1" + arg: "2" + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/MatMul_11" + type: FLOAT32 + shape { + dim: 6 + dim: 2 + } + filler { + tag: "explicit" + arg: "-0.192813" + arg: "-0.458227" + arg: "-0.178845" + arg: "-0.275436" + arg: "0.704787" + arg: "0.187431" + arg: "-0.280711" + arg: "-0.406058" + arg: "-0.415622" + arg: "0.675278" + arg: "0.427286" + arg: "-0.241141" + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/MatMul" + type: FLOAT32 + shape { + dim: 6 + dim: 2 + } + filler { + tag: "explicit" + arg: "0.807328" + arg: "-0.521874" + arg: "0.116675" + arg: "0.331105" + arg: "0.277033" + arg: "0.237678" + arg: "0.129396" + arg: "0.171752" + 
arg: "-0.15585" + arg: "0.813781" + arg: "-0.26672" + arg: "-0.230285" + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/add_4/y" + type: INT32 + shape { + } + filler { + tag: "explicit" + arg: "1" + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/sub/x" + type: FLOAT32 + shape { + } + filler { + tag: "explicit" + arg: "1" + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/add_4" + type: INT32 + shape { + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/MatMul_12" + type: FLOAT32 + shape { + dim: 1 + dim: 6 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/split_1" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/split_11" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/split_12" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/TensorArrayV2Read/TensorListGetItem;time" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/MatMul1" + type: FLOAT32 + shape { + dim: 1 + dim: 6 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/split" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/split1" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/split2" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: 
"while/add" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/Sigmoid" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/mul_1" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/sub" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/add_1" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/Sigmoid_1" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/mul" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/add_2" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/Tanh" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/mul_2" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/add_3" + type: FLOAT32 + shape { + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/TensorArrayV2Write/TensorListSetItem1" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 2 + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operand { + name: "while/add_5" + type: INT32 + shape { + } + quant { + quantized_dimension: 0 + } + is_variable: false + } + operation { + type: "Add" + input: "arg1" + input: "while/add_4/y" + output: "while/add_4" + 
add_options { + activation: NONE + } + } + operation { + type: "FullyConnected" + input: "arg3" + input: "while/MatMul_11" + input: "" + output: "while/MatMul_12" + fullyconnected_options { + activation: NONE + keep_num_dims: false + } + } + operation { + type: "Split" + input: "while/add_4/y" + input: "while/MatMul_12" + output: "while/split_1" + output: "while/split_11" + output: "while/split_12" + split_options { + num_splits: 3 + } + } + operation { + type: "Gather" + input: "arg4" + input: "arg1" + output: "while/TensorArrayV2Read/TensorListGetItem;time" + gather_options { + axis: 0 + } + } + operation { + type: "FullyConnected" + input: "while/TensorArrayV2Read/TensorListGetItem;time" + input: "while/MatMul" + input: "" + output: "while/MatMul1" + fullyconnected_options { + activation: NONE + keep_num_dims: false + } + } + operation { + type: "Split" + input: "while/add_4/y" + input: "while/MatMul1" + output: "while/split" + output: "while/split1" + output: "while/split2" + split_options { + num_splits: 3 + } + } + operation { + type: "Add" + input: "while/split" + input: "while/split_1" + output: "while/add" + add_options { + activation: NONE + } + } + operation { + type: "Logistic" + input: "while/add" + output: "while/Sigmoid" + } + operation { + type: "Mul" + input: "while/Sigmoid" + input: "arg3" + output: "while/mul_1" + mul_options { + activation: NONE + } + } + operation { + type: "Sub" + input: "while/sub/x" + input: "while/Sigmoid" + output: "while/sub" + sub_options { + activation: NONE + } + } + operation { + type: "Add" + input: "while/split1" + input: "while/split_11" + output: "while/add_1" + add_options { + activation: NONE + } + } + operation { + type: "Logistic" + input: "while/add_1" + output: "while/Sigmoid_1" + } + operation { + type: "Mul" + input: "while/Sigmoid_1" + input: "while/split_12" + output: "while/mul" + mul_options { + activation: NONE + } + } + operation { + type: "Add" + input: "while/split2" + input: "while/mul" + output: 
"while/add_2" + add_options { + activation: NONE + } + } + operation { + type: "Tanh" + input: "while/add_2" + output: "while/Tanh" + } + operation { + type: "Mul" + input: "while/sub" + input: "while/Tanh" + output: "while/mul_2" + mul_options { + activation: NONE + } + } + operation { + type: "Add" + input: "while/mul_1" + input: "while/mul_2" + output: "while/add_3" + add_options { + activation: NONE + } + } + operation { + type: "Reshape" + input: "while/add_3" + input: "while/TensorArrayV2Write/TensorListSetItem" + output: "while/TensorArrayV2Write/TensorListSetItem1" + } + operation { + type: "Add" + input: "arg0" + input: "while/add_4/y" + output: "while/add_5" + add_options { + activation: NONE + } + } + input: "arg0" + input: "arg1" + input: "arg2" + input: "arg3" + input: "arg4" + output: "while/add_5" + output: "while/add_4" + output: "while/TensorArrayV2Write/TensorListSetItem1" + output: "while/add_3" + output: "arg4" +} From d20420b8c129d44c5c75291b103fea9a2cf98996 Mon Sep 17 00:00:00 2001 From: chunseoklee Date: Mon, 28 Oct 2024 10:27:54 +0900 Subject: [PATCH 16/46] [res] test.rule for Decomposed GRU recipe (#14253) - Add rule.test for dredd test of fuse_gru pass ONE-DCO-1.0-Signed-off-by: Chunseok Lee --- .../Net_Decomposed_GRU_000/test.rule | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 res/TensorFlowLiteRecipes/Net_Decomposed_GRU_000/test.rule diff --git a/res/TensorFlowLiteRecipes/Net_Decomposed_GRU_000/test.rule b/res/TensorFlowLiteRecipes/Net_Decomposed_GRU_000/test.rule new file mode 100644 index 00000000000..65c834f29dd --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_Decomposed_GRU_000/test.rule @@ -0,0 +1,11 @@ +# To check whether tfl model is converted to circle GRU op + +RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1 + +RULE "GRU_EXIST" $(op_count GRU) '=' 1 +RULE "NO_WHILE" $(op_count WHILE) '=' 0 +RULE "NO_SPLIT" $(op_count SPLIT) '=' 0 +RULE "NO_LOGISTIC" $(op_count LOGISTIC) '=' 0 +RULE "NO_TANH" 
$(op_count TANH) '=' 0 +RULE "NO_GATHER" $(op_count GATHER) '=' 0 +RULE "NO_LESS" $(op_count LESS) '=' 0 From 7c5ecc59b347f2682b27dd037e4144bd02604835 Mon Sep 17 00:00:00 2001 From: chunseoklee Date: Mon, 28 Oct 2024 11:14:40 +0900 Subject: [PATCH 17/46] [luci-interpreter] GRU kernel (#14254) - Enable CircleGRU operation in luci-interpreter - Overall implemenatation is borrowed from onert-micro 2.0 ONE-DCO-1.0-Signed-off-by: Chunseok Lee --- .../pal/linux/KernelsToBuild.lst | 1 + compiler/luci-interpreter/pal/linux/PALGRU.h | 173 ++++++++++++++++++ .../luci-interpreter/src/core/KernelParams.h | 7 + compiler/luci-interpreter/src/kernels/GRU.cpp | 82 +++++++++ compiler/luci-interpreter/src/kernels/GRU.h | 53 ++++++ .../luci-interpreter/src/kernels/GRU.test.cpp | 170 +++++++++++++++++ .../luci-interpreter/src/loader/nodes/GRU.cpp | 48 +++++ 7 files changed, 534 insertions(+) create mode 100644 compiler/luci-interpreter/pal/linux/PALGRU.h create mode 100644 compiler/luci-interpreter/src/kernels/GRU.cpp create mode 100644 compiler/luci-interpreter/src/kernels/GRU.h create mode 100644 compiler/luci-interpreter/src/kernels/GRU.test.cpp create mode 100644 compiler/luci-interpreter/src/loader/nodes/GRU.cpp diff --git a/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst b/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst index df47427a3df..989163524f3 100644 --- a/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst +++ b/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst @@ -27,6 +27,7 @@ REGISTER_KERNEL(Gather) REGISTER_KERNEL(Gelu) REGISTER_KERNEL(Greater) REGISTER_KERNEL(GreaterEqual) +REGISTER_KERNEL(GRU) REGISTER_KERNEL(HardSwish) REGISTER_KERNEL(If) REGISTER_KERNEL(InstanceNorm) diff --git a/compiler/luci-interpreter/pal/linux/PALGRU.h b/compiler/luci-interpreter/pal/linux/PALGRU.h new file mode 100644 index 00000000000..2d2e74d81f3 --- /dev/null +++ b/compiler/luci-interpreter/pal/linux/PALGRU.h @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2024 Samsung 
Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_GRU_H +#define LUCI_INTERPRETER_PAL_GRU_H + +#include +#include "PALreference_ops.h" +namespace luci_interpreter_pal +{ + +// tflite's Logistic does not provide inplace Logistic kernel +void Logistic(const int flat_size, const float *input_data, float *output_data) +{ + const float cutoff_upper = 16.619047164916992188f; + const float cutoff_lower = -9.f; + + // Rational for using approximation in reference kernel. + // 0. This approximation gives enough precision for float. + // 1. This works around an issue on an embedded chipset where exp() does not + // return correctly as expected - exp(x) should return inf when overflown + // not 1.701417 IEEE 754 defines representation for inf. + // 2. This will speed up calculation and is matching the behavior in the + // optimized kernels. 
(check the definition of scalar_logistic_op) + + for (int i = 0; i < flat_size; i++) + { + float val = input_data[i]; + float result; + if (val > cutoff_upper) + { + result = 1.0f; + } + else if (val < cutoff_lower) + { + result = std::exp(val); + } + else + { + result = 1.f / (1.f + std::exp(-val)); + } + output_data[i] = result; + } +} + +void calculateGRU(const float *input_data, const float *weight_input_data, + const float *weight_hidden_data, const float *bias_input_data, + const float *bias_hidden_data, float *output_data, + const tflite::RuntimeShape &input_shape, const tflite::RuntimeShape &output_shape, + const tflite::RuntimeShape &weight_input_shape, + const tflite::RuntimeShape &weight_hidden_shape, float *output_input_data, + float *output_hidden_data, const tflite::RuntimeShape &output_shape_fc) +{ + tflite::FullyConnectedParams op_params{}; + // As FC nodes doesn't have any activations inside GRU, let' use just numeric limits + op_params.float_activation_min = std::numeric_limits::lowest(); + op_params.float_activation_max = std::numeric_limits::max(); + + // FC Input + tflite::RuntimeShape bias_input_shape{weight_input_shape.Dims(0)}; + tflite::reference_ops::FullyConnected(op_params, output_shape, output_data, weight_input_shape, + weight_input_data, bias_input_shape, bias_input_data, + output_shape_fc, output_input_data); + + // FC Hidden + tflite::RuntimeShape bias_hidden_shape{weight_hidden_shape.Dims(0)}; + // Note: input for this FC node will be saved without intermediate buffer + tflite::reference_ops::FullyConnected(op_params, input_shape, input_data, weight_hidden_shape, + weight_hidden_data, bias_hidden_shape, bias_hidden_data, + output_shape_fc, output_hidden_data); + + int num_elements = output_shape_fc.Dims(1) / 3; + + float *second_hidden_part = output_hidden_data + num_elements; + float *second_input_part = output_input_data + num_elements; + + float *third_hidden_part = second_hidden_part + num_elements; + float *third_input_part = 
second_input_part + num_elements; + + // Calculate Left part + for (int i = 0; i < num_elements; ++i) + { + output_input_data[i] += output_hidden_data[i]; + } + + Logistic(num_elements, output_input_data, output_input_data); + + // Calculate most left mul + float *most_left_part_final = output_input_data; + float *first_part = output_input_data; + for (int i = 0; i < num_elements; ++i) + { + output_data[i] *= most_left_part_final[i]; + first_part[i] = 1.0f - first_part[i]; + } + + // Calc second part + for (int i = 0; i < num_elements; ++i) + { + second_hidden_part[i] += second_input_part[i]; + } + + Logistic(num_elements, second_hidden_part, second_hidden_part); + + for (int i = 0; i < num_elements; ++i) + { + second_hidden_part[i] *= third_input_part[i]; + second_hidden_part[i] += third_hidden_part[i]; + } + + for (int i = 0; i < num_elements; ++i) + { + if (second_hidden_part[i] > 19) + { + second_hidden_part[i] = 1; + } + else if (second_hidden_part[i] < -19) + { + second_hidden_part[i] = -1; + } + else + { + second_hidden_part[i] = std::tanh(second_hidden_part[i]); + } + } + + for (int i = 0; i < num_elements; ++i) + { + second_hidden_part[i] *= first_part[i]; + output_data[i] += second_hidden_part[i]; + } +} + +void GRU(const float *input_data, const float *weight_input_data, const float *weight_hidden_data, + const float *bias_input_data, const float *bias_hidden_data, + const float *hidden_state_data, float *output_data, float *output_input_data, + float *output_hidden_data, const tflite::RuntimeShape &input_shape, + const tflite::RuntimeShape &output_shape, const tflite::RuntimeShape &weight_input_shape, + const tflite::RuntimeShape &weight_hidden_shape) +{ + const int32_t time = input_shape.Dims(0); + + tflite::RuntimeShape output_shape_fc(2); + output_shape_fc.SetDim(0, 1); + output_shape_fc.SetDim(1, weight_hidden_shape.Dims(0)); + + std::memcpy(output_data, hidden_state_data, output_shape.FlatSize() * sizeof(float)); + + for (int i = 0; i < time; ++i) 
+ { + calculateGRU(input_data, weight_input_data, weight_hidden_data, bias_input_data, + bias_hidden_data, output_data, input_shape, output_shape, weight_input_shape, + weight_hidden_shape, output_input_data, output_hidden_data, output_shape_fc); + input_data += input_shape.Dims(2); + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_GRU_H diff --git a/compiler/luci-interpreter/src/core/KernelParams.h b/compiler/luci-interpreter/src/core/KernelParams.h index cc6a83e08c7..d7ea8629370 100644 --- a/compiler/luci-interpreter/src/core/KernelParams.h +++ b/compiler/luci-interpreter/src/core/KernelParams.h @@ -111,6 +111,13 @@ struct GeluParams bool approximate; }; +struct GRUParams +{ + Activation fused_act_function = Activation::NONE; + bool return_sequences = false; + bool time_major = false; +}; + struct InstanceNormParams { float epsilon; diff --git a/compiler/luci-interpreter/src/kernels/GRU.cpp b/compiler/luci-interpreter/src/kernels/GRU.cpp new file mode 100644 index 00000000000..2419e0bb779 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/GRU.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernels/GRU.h" + +#include "kernels/Utils.h" + +#include "PALFullyConnected.h" +#include "PALGRU.h" + +namespace luci_interpreter +{ +namespace kernels +{ +GRU::GRU(const Tensor *input, const Tensor *hidden_hidden, const Tensor *hidden_hidden_bias, + const Tensor *hidden_input, const Tensor *hidden_input_bias, const Tensor *state, + Tensor *output, const GRUParams ¶ms) + : KernelWithParams( + {input, hidden_hidden, hidden_hidden_bias, hidden_input, hidden_input_bias, state}, {output}, + params) +{ +} + +void GRU::configure() +{ + auto hidden_hidden_shape = getTensorShape(hidden_hidden()); + auto hidden_input_shape = getTensorShape(hidden_input()); + LUCI_INTERPRETER_CHECK(hidden_hidden_shape.Dims(0) == hidden_input_shape.Dims(0)); + + output()->resize(state()->shape()); + + LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type()); +} + +void GRU::execute() const +{ + switch (input()->element_type()) + { + case DataType::FLOAT32: + evalFloat(); + break; + default: + throw std::runtime_error("luci-GRU Unsupported data type."); + } +} + +void GRU::evalFloat() const +{ + uint8_t *output_hidden_data; + uint8_t *output_input_data; + + // allocate output datas above + output_hidden_data = new uint8_t[getTensorShape(hidden_hidden()).FlatSize() * sizeof(float)]; + output_input_data = new uint8_t[getTensorShape(hidden_input()).FlatSize() * sizeof(float)]; + + luci_interpreter_pal::GRU( + getTensorData(input()), getTensorData(hidden_input()), + getTensorData(hidden_hidden()), getTensorData(hidden_input_bias()), + getTensorData(hidden_hidden_bias()), getTensorData(state()), + getTensorData(output()), reinterpret_cast(output_input_data), + reinterpret_cast(output_hidden_data), getTensorShape(input()), + getTensorShape(output()), getTensorShape(hidden_input()), getTensorShape(hidden_hidden())); + + delete output_hidden_data; + delete output_input_data; +} + +} // namespace kernels +} // namespace luci_interpreter diff --git 
a/compiler/luci-interpreter/src/kernels/GRU.h b/compiler/luci-interpreter/src/kernels/GRU.h new file mode 100644 index 00000000000..ac5ec085b26 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/GRU.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_KERNELS_GRU_H +#define LUCI_INTERPRETER_KERNELS_GRU_H + +#include "core/Kernel.h" +#include "core/KernelParams.h" + +namespace luci_interpreter +{ +namespace kernels +{ + +class GRU : public KernelWithParams +{ +public: + GRU(const Tensor *input, const Tensor *hidden_hidden, const Tensor *hidden_hidden_bias, + const Tensor *hidden_input, const Tensor *hidden_input_bias, const Tensor *state, + Tensor *output, const GRUParams ¶ms); + + const Tensor *input() const { return _inputs[0]; } + const Tensor *hidden_hidden() const { return _inputs[1]; } + const Tensor *hidden_hidden_bias() const { return _inputs[2]; } + const Tensor *hidden_input() const { return _inputs[3]; } + const Tensor *hidden_input_bias() const { return _inputs[4]; } + const Tensor *state() const { return _inputs[5]; } + Tensor *output() const { return _outputs[0]; } + + void configure() override; + void execute() const override; + +private: + void evalFloat() const; +}; + +} // namespace kernels +} // namespace luci_interpreter + +#endif // LUCI_INTERPRETER_KERNELS_ROPE_H diff --git 
a/compiler/luci-interpreter/src/kernels/GRU.test.cpp b/compiler/luci-interpreter/src/kernels/GRU.test.cpp new file mode 100644 index 00000000000..586286b9189 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/GRU.test.cpp @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernels/GRU.h" +#include "kernels/TestUtils.h" +#include "luci_interpreter/TestMemoryManager.h" + +namespace luci_interpreter +{ +namespace kernels +{ +namespace +{ + +using namespace testing; + +class GRUTest : public ::testing::Test +{ +protected: + void SetUp() override { _memory_manager = std::make_unique(); } + + std::unique_ptr _memory_manager; +}; + +TEST_F(GRUTest, floatTest) +{ + Shape input_shape{2, 1, 2}; + std::vector input_data{0.98045033, 0.39546537, 0.5209594, 0.72873044}; + + Shape ref_output_shape{1, 1, 2}; + std::vector ref_output_data{0.22777566, -0.1976251}; + + Shape hidden_hidden_shape{6, 2}; + std::vector hidden_hidden_data{ + 0.8073279857635498, -0.5218740105628967, 0.1166749969124794, 0.33110499382019043, + 0.2770330011844635, 0.23767800629138947, 0.1293960064649582, 0.17175200581550598, + -0.15584999322891235, 0.8137810230255127, -0.2667199969291687, -0.23028500378131866}; + Shape hidden_input_shape{6, 2}; + std::vector hidden_input_data{ + -0.1928129941225052, -0.4582270085811615, -0.17884500324726105, -0.27543601393699646, + 0.704787015914917, 
0.1874309927225113, -0.28071099519729614, -0.40605801343917847, + -0.4156219959259033, 0.6752780079841614, 0.4272859990596771, -0.24114100635051727}; + + Shape state_shape{1, 2}; + std::vector state_data{0.0, 0.0}; + + Tensor input_tensor = + makeInputTensor(input_shape, input_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + + Tensor hidden_hidden_tensor = makeInputTensor( + hidden_hidden_shape, hidden_hidden_data, _memory_manager.get()); + + Tensor hidden_input_tensor = makeInputTensor( + hidden_input_shape, hidden_input_data, _memory_manager.get()); + + Tensor state_tensor = + makeInputTensor(state_shape, state_data, _memory_manager.get()); + + GRUParams params{}; + + GRU kernel(&input_tensor, &hidden_hidden_tensor, nullptr, &hidden_input_tensor, nullptr, + &state_tensor, &output_tensor, params); + kernel.configure(); + _memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData(output_tensor), + ::testing::ElementsAreArray(ref_output_data)); +} + +TEST_F(GRUTest, Unmatched_io_type_NEG) +{ + Shape input_shape{2, 1, 2}; + std::vector input_data{0.98045033, 0.39546537, 0.5209594, 0.72873044}; + + Shape ref_output_shape{1, 1, 2}; + std::vector ref_output_data{0.22777566, -0.1976251}; + + Shape hidden_hidden_shape{6, 2}; + std::vector hidden_hidden_data{ + 0.8073279857635498, -0.5218740105628967, 0.1166749969124794, 0.33110499382019043, + 0.2770330011844635, 0.23767800629138947, 0.1293960064649582, 0.17175200581550598, + -0.15584999322891235, 0.8137810230255127, -0.2667199969291687, -0.23028500378131866}; + Shape hidden_input_shape{6, 2}; + std::vector hidden_input_data{ + -0.1928129941225052, -0.4582270085811615, -0.17884500324726105, -0.27543601393699646, + 0.704787015914917, 0.1874309927225113, -0.28071099519729614, -0.40605801343917847, + -0.4156219959259033, 0.6752780079841614, 0.4272859990596771, -0.24114100635051727}; + + Shape state_shape{1, 2}; + std::vector 
state_data{0.0, 0.0}; + + Tensor input_tensor = + makeInputTensor(input_shape, input_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::U32); + + Tensor hidden_hidden_tensor = makeInputTensor( + hidden_hidden_shape, hidden_hidden_data, _memory_manager.get()); + + Tensor hidden_input_tensor = makeInputTensor( + hidden_input_shape, hidden_input_data, _memory_manager.get()); + + Tensor state_tensor = + makeInputTensor(state_shape, state_data, _memory_manager.get()); + + GRUParams params{}; + + GRU kernel(&input_tensor, &hidden_hidden_tensor, nullptr, &hidden_input_tensor, nullptr, + &state_tensor, &output_tensor, params); + + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(GRUTest, Unmatched_weight_size_NEG) +{ + Shape input_shape{2, 1, 2}; + std::vector input_data{0.98045033, 0.39546537, 0.5209594, 0.72873044}; + + Shape ref_output_shape{1, 1, 2}; + std::vector ref_output_data{0.22777566, -0.1976251}; + + Shape hidden_hidden_shape{1, 2}; + std::vector hidden_hidden_data{-0.2667199969291687, -0.23028500378131866}; + Shape hidden_input_shape{6, 2}; + std::vector hidden_input_data{ + -0.1928129941225052, -0.4582270085811615, -0.17884500324726105, -0.27543601393699646, + 0.704787015914917, 0.1874309927225113, -0.28071099519729614, -0.40605801343917847, + -0.4156219959259033, 0.6752780079841614, 0.4272859990596771, -0.24114100635051727}; + + Shape state_shape{1, 2}; + std::vector state_data{0.0, 0.0}; + + Tensor input_tensor = + makeInputTensor(input_shape, input_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + + Tensor hidden_hidden_tensor = makeInputTensor( + hidden_hidden_shape, hidden_hidden_data, _memory_manager.get()); + + Tensor hidden_input_tensor = makeInputTensor( + hidden_input_shape, hidden_input_data, _memory_manager.get()); + + Tensor state_tensor = + makeInputTensor(state_shape, state_data, _memory_manager.get()); + + GRUParams params{}; + + GRU kernel(&input_tensor, 
&hidden_hidden_tensor, nullptr, &hidden_input_tensor, nullptr, + &state_tensor, &output_tensor, params); + + EXPECT_ANY_THROW(kernel.configure()); +} + +} // namespace +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/loader/nodes/GRU.cpp b/compiler/luci-interpreter/src/loader/nodes/GRU.cpp new file mode 100644 index 00000000000..f6e5ddbc000 --- /dev/null +++ b/compiler/luci-interpreter/src/loader/nodes/GRU.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Builders.h" + +#include "kernels/GRU.h" + +namespace luci_interpreter +{ + +std::unique_ptr build_kernel_CircleGRU(const luci::CircleNode *circle_node, + KernelBuilderHelper &helper) +{ + const auto *node = loco::must_cast(circle_node); + assert(node->arity() == 6); + + const Tensor *input = helper.getInputTensor(node->input()); + const Tensor *hidden_hidden = helper.getInputTensor(node->hidden_hidden()); + const Tensor *hidden_hidden_bias = helper.getInputTensor(node->hidden_hidden_bias()); + const Tensor *hidden_input = helper.getInputTensor(node->hidden_input()); + const Tensor *hidden_input_bias = helper.getInputTensor(node->hidden_input_bias()); + const Tensor *state = helper.getInputTensor(node->state()); + + Tensor *output = helper.getOutputTensor(node); + + GRUParams params{}; + params.fused_act_function = node->fusedActivationFunction(); + params.return_sequences = node->returnSequences(); + params.time_major = node->timeMajor(); + + return std::make_unique(input, hidden_hidden, hidden_hidden_bias, hidden_input, + hidden_input_bias, state, output, params); +} + +} // namespace luci_interpreter From 1c7bbb9cab70df42ea1b320bc383188ff76f77f9 Mon Sep 17 00:00:00 2001 From: Hyukjin Jeong Date: Mon, 28 Oct 2024 11:18:30 +0900 Subject: [PATCH 18/46] [record-minmax] Introduce DataSetIterator (#14258) This introduces a base class for iterator. 
ONE-DCO-1.0-Signed-off-by: Hyukjin Jeong --- compiler/record-minmax/include/DataBuffer.h | 37 ++++++++++++++++ .../record-minmax/include/DataSetIterator.h | 43 +++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 compiler/record-minmax/include/DataBuffer.h create mode 100644 compiler/record-minmax/include/DataSetIterator.h diff --git a/compiler/record-minmax/include/DataBuffer.h b/compiler/record-minmax/include/DataBuffer.h new file mode 100644 index 00000000000..1cd51a9e0ae --- /dev/null +++ b/compiler/record-minmax/include/DataBuffer.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __RECORD_MINMAX_DATA_BUFFER_H__ +#define __RECORD_MINMAX_DATA_BUFFER_H__ + +#include +#include + +#include + +namespace record_minmax +{ + +struct DataBuffer +{ + loco::DataType dtype = loco::DataType::Unknown; + std::vector shape; + std::vector data; +}; + +} // namespace record_minmax + +#endif // __RECORD_MINMAX_DATA_BUFFER_H__ diff --git a/compiler/record-minmax/include/DataSetIterator.h b/compiler/record-minmax/include/DataSetIterator.h new file mode 100644 index 00000000000..4b55a65418f --- /dev/null +++ b/compiler/record-minmax/include/DataSetIterator.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __RECORD_MINMAX_DATASET_ITERATOR_H__ +#define __RECORD_MINMAX_DATASET_ITERATOR_H__ + +#include "DataBuffer.h" + +#include + +namespace record_minmax +{ + +// Base class for dataset iterator +class DataSetIterator +{ +public: + virtual bool hasNext() const = 0; + + virtual std::vector next() = 0; + + // Revisit this interface later + virtual bool check_type_shape() const = 0; + + virtual ~DataSetIterator() = default; +}; + +} // namespace record_minmax + +#endif // __RECORD_MINMAX_DATASET_ITERATOR_H__ From 067baf84aeeec1fe747b0e8513d1992cd1c1ea14 Mon Sep 17 00:00:00 2001 From: SaeHie Park Date: Mon, 28 Oct 2024 13:34:35 +0900 Subject: [PATCH 19/46] [luci/import] model_data as const (#14260) This will revise importModule method to accept const argument. 
Signed-off-by: SaeHie Park --- compiler/luci/import/include/luci/ImporterEx.h | 2 +- compiler/luci/import/src/ImporterEx.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/compiler/luci/import/include/luci/ImporterEx.h b/compiler/luci/import/include/luci/ImporterEx.h index e00712c724c..52d808a92d1 100644 --- a/compiler/luci/import/include/luci/ImporterEx.h +++ b/compiler/luci/import/include/luci/ImporterEx.h @@ -50,7 +50,7 @@ class ImporterEx final // embedded-import-value-test uses constant data from file(actually ROM) // so unloading file will break the precondition // TODO remove this after embedded-import-value-test has moved to onert-micro - std::unique_ptr importModule(std::vector &model_data) const; + std::unique_ptr importModule(const std::vector &model_data) const; private: const GraphBuilderSource *_source = nullptr; diff --git a/compiler/luci/import/src/ImporterEx.cpp b/compiler/luci/import/src/ImporterEx.cpp index 88b125f1657..1a67f7a7610 100644 --- a/compiler/luci/import/src/ImporterEx.cpp +++ b/compiler/luci/import/src/ImporterEx.cpp @@ -65,9 +65,9 @@ std::unique_ptr ImporterEx::importVerifyModule(const std::string &input_ return importer.importModule(data_data, data_size); } -std::unique_ptr ImporterEx::importModule(std::vector &model_data) const +std::unique_ptr ImporterEx::importModule(const std::vector &model_data) const { - auto data_data = reinterpret_cast(model_data.data()); + auto data_data = reinterpret_cast(model_data.data()); auto data_size = model_data.size(); Importer importer(_source); From b443c68cd1fb8fa08b34d0c997774614aacef58c Mon Sep 17 00:00:00 2001 From: Hyukjin Jeong Date: Mon, 28 Oct 2024 13:41:16 +0900 Subject: [PATCH 20/46] [record-minmax] Introduce RandomIterator (#14259) * [record-minmax] Introduce RandomIterator This introduces an iterator for random data. 
ONE-DCO-1.0-Signed-off-by: Hyukjin Jeong * Update CMakeLists.txt --- compiler/record-minmax/CMakeLists.txt | 1 + .../record-minmax/include/RandomIterator.h | 52 +++++++ compiler/record-minmax/include/Utils.h | 36 +++++ compiler/record-minmax/src/RandomIterator.cpp | 145 ++++++++++++++++++ compiler/record-minmax/src/Utils.cpp | 49 ++++++ compiler/record-minmax/tests/Utils.test.cpp | 75 +++++++++ 6 files changed, 358 insertions(+) create mode 100644 compiler/record-minmax/include/RandomIterator.h create mode 100644 compiler/record-minmax/include/Utils.h create mode 100644 compiler/record-minmax/src/RandomIterator.cpp create mode 100644 compiler/record-minmax/src/Utils.cpp create mode 100644 compiler/record-minmax/tests/Utils.test.cpp diff --git a/compiler/record-minmax/CMakeLists.txt b/compiler/record-minmax/CMakeLists.txt index 3feca330ad0..6755de36eda 100644 --- a/compiler/record-minmax/CMakeLists.txt +++ b/compiler/record-minmax/CMakeLists.txt @@ -57,6 +57,7 @@ endif("${CMAKE_SIZEOF_VOID_P}" STREQUAL "8" AND FALSE) # record-minmax is executable, so we do not link it to the test. # Instead, we use TEST_SOURCES to specify sources uesd for tests. set(TEST_SOURCES + "src/Utils.cpp" "src/RecordFunction.cpp" "src/MinMaxComputer.cpp") diff --git a/compiler/record-minmax/include/RandomIterator.h b/compiler/record-minmax/include/RandomIterator.h new file mode 100644 index 00000000000..2a58f588fb7 --- /dev/null +++ b/compiler/record-minmax/include/RandomIterator.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __RECORD_MINMAX_RANDOM_ITERATOR_H__ +#define __RECORD_MINMAX_RANDOM_ITERATOR_H__ + +#include "DataBuffer.h" +#include "DataSetIterator.h" + +#include +#include + +#include +#include + +namespace record_minmax +{ + +class RandomIterator final : public DataSetIterator +{ +public: + RandomIterator(luci::Module *module); + + bool hasNext() const override; + + std::vector next() override; + + bool check_type_shape() const override; + +private: + std::mt19937 _gen; + std::vector _input_nodes; + uint32_t _curr_idx = 0; + uint32_t _num_data = 0; +}; + +} // namespace record_minmax + +#endif // __RECORD_MINMAX_RANDOM_ITERATOR_H__ diff --git a/compiler/record-minmax/include/Utils.h b/compiler/record-minmax/include/Utils.h new file mode 100644 index 00000000000..7c74fe25e1e --- /dev/null +++ b/compiler/record-minmax/include/Utils.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __RECORD_MINMAX_UTILS_H__ +#define __RECORD_MINMAX_UTILS_H__ + +#include + +#include +#include + +namespace record_minmax +{ + +// Return total number of elements of the node's output tensor +uint32_t numElements(const luci::CircleNode *node); + +// Return the node's output tensor size in bytes +size_t getTensorSize(const luci::CircleNode *node); + +} // namespace record_minmax + +#endif // __RECORD_MINMAX_UTILS_H__ diff --git a/compiler/record-minmax/src/RandomIterator.cpp b/compiler/record-minmax/src/RandomIterator.cpp new file mode 100644 index 00000000000..fbe9fa0886e --- /dev/null +++ b/compiler/record-minmax/src/RandomIterator.cpp @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "RandomIterator.h" +#include "DataBuffer.h" +#include "Utils.h" + +#include +#include + +#include +#include + +namespace +{ + +std::vector genRandomData(std::mt19937 &gen, uint32_t num_elements, float min, float max) +{ + std::uniform_real_distribution dist(min, max); + std::vector input_data(num_elements); + + // Write random data + { + auto const generator = [&gen, &dist]() { return static_cast(dist(gen)); }; + std::generate(begin(input_data), end(input_data), generator); + } + + return input_data; +} + +template +std::vector genRandomIntData(std::mt19937 &gen, uint32_t num_elements, T min, T max) +{ + std::uniform_int_distribution dist(min, max); + std::vector input_data(num_elements); + + // Write random data + { + auto const generator = [&gen, &dist]() { return dist(gen); }; + std::generate(begin(input_data), end(input_data), generator); + } + + return input_data; +} + +} // namespace + +namespace record_minmax +{ + +RandomIterator::RandomIterator(luci::Module *module) +{ + assert(module); // FIX_CALLER_UNLESS + + std::random_device rd; + std::mt19937 _gen(rd()); + + auto input_nodes = loco::input_nodes(module->graph()); + for (auto input_node : input_nodes) + { + const auto cnode = loco::must_cast(input_node); + _input_nodes.emplace_back(cnode); + } + + // Hardcoded + _num_data = 3; +} + +bool RandomIterator::hasNext() const { return _curr_idx < _num_data; } + +std::vector RandomIterator::next() +{ + std::vector res; + + for (auto input_node : _input_nodes) + { + DataBuffer buf; + + const auto dtype = input_node->dtype(); + const auto num_elements = numElements(input_node); + + buf.data.resize(getTensorSize(input_node)); + + switch (dtype) + { + case loco::DataType::FLOAT32: + { + const auto input_data = genRandomData(_gen, num_elements, -5, 5); + const auto data_size = input_data.size() * sizeof(float); + assert(buf.data.size() == data_size); + memcpy(buf.data.data(), input_data.data(), data_size); + break; + } + case loco::DataType::S32: + 
{ + const auto input_data = genRandomIntData(_gen, num_elements, 0, 100); + const auto data_size = input_data.size() * sizeof(int32_t); + assert(buf.data.size() == data_size); + memcpy(buf.data.data(), input_data.data(), data_size); + break; + } + case loco::DataType::S64: + { + const auto input_data = genRandomIntData(_gen, num_elements, 0, 100); + const auto data_size = input_data.size() * sizeof(int64_t); + assert(buf.data.size() == data_size); + memcpy(buf.data.data(), input_data.data(), data_size); + break; + } + case loco::DataType::BOOL: + { + const auto input_data = genRandomIntData(_gen, num_elements, 0, 1); + const auto data_size = input_data.size() * sizeof(uint8_t); + assert(buf.data.size() == data_size); + memcpy(buf.data.data(), input_data.data(), data_size); + break; + } + default: + throw std::runtime_error("Unsupported datatype"); + } + + res.emplace_back(buf); + } + + _curr_idx++; // move to the next index + + return res; +} + +bool RandomIterator::check_type_shape() const { return false; } + +} // namespace record_minmax diff --git a/compiler/record-minmax/src/Utils.cpp b/compiler/record-minmax/src/Utils.cpp new file mode 100644 index 00000000000..dd8fa7a05fe --- /dev/null +++ b/compiler/record-minmax/src/Utils.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Utils.h" + +#include +#include + +namespace record_minmax +{ + +uint32_t numElements(const luci::CircleNode *node) +{ + assert(node); // FIX_CALLER_UNLESS + + uint32_t num_elements = 1; + for (uint32_t i = 0; i < node->rank(); i++) + { + if (not node->dim(i).known()) + throw std::runtime_error("Unknown dimension found in " + node->name()); + + num_elements *= node->dim(i).value(); + } + + return num_elements; +} + +size_t getTensorSize(const luci::CircleNode *node) +{ + assert(node); // FIX_CALLER_UNLESS + + uint32_t elem_size = luci::size(node->dtype()); + return numElements(node) * elem_size; +} + +} // namespace record_minmax diff --git a/compiler/record-minmax/tests/Utils.test.cpp b/compiler/record-minmax/tests/Utils.test.cpp new file mode 100644 index 00000000000..6bac8fc6cb8 --- /dev/null +++ b/compiler/record-minmax/tests/Utils.test.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Utils.h" + +#include + +#include + +using namespace record_minmax; + +TEST(UtilsTest, num_elements) +{ + luci::CircleAdd node; + node.rank(3); + node.dim(0).set(1); + node.dim(1).set(2); + node.dim(2).set(3); + node.dtype(loco::DataType::FLOAT32); + + EXPECT_EQ(6, numElements(&node)); +} + +TEST(UtilsTest, num_elements_NEG) +{ + luci::CircleAdd node; + node.rank(3); + node.dim(0).set(1); + node.dim(1).set(2); + node.dim(2).set(3); + node.dtype(loco::DataType::FLOAT32); + + node.dim(0).unset(); + + EXPECT_ANY_THROW(numElements(&node)); +} + +TEST(UtilsTest, get_tensor_size) +{ + luci::CircleAdd node; + node.rank(3); + node.dim(0).set(1); + node.dim(1).set(2); + node.dim(2).set(3); + node.dtype(loco::DataType::FLOAT32); + + EXPECT_EQ(24, getTensorSize(&node)); +} + +TEST(UtilsTest, get_tensor_size_NEG) +{ + luci::CircleAdd node; + node.rank(3); + node.dim(0).set(1); + node.dim(1).set(2); + node.dim(2).set(3); + node.dtype(loco::DataType::FLOAT32); + + node.dim(0).unset(); + + EXPECT_ANY_THROW(getTensorSize(&node)); +} From 094bd4a4c2950fb363c7d6fc9cb4481f094e6a54 Mon Sep 17 00:00:00 2001 From: Hyukjin Jeong Date: Mon, 28 Oct 2024 14:37:09 +0900 Subject: [PATCH 21/46] [record-minmax] Introduce DirectoryIterator (#14262) This introduces an iterator for directory format. 
ONE-DCO-1.0-Signed-off-by: Hyukjin Jeong --- .../record-minmax/include/DirectoryIterator.h | 53 +++++++++ compiler/record-minmax/include/Utils.h | 3 + .../record-minmax/src/DirectoryIterator.cpp | 102 ++++++++++++++++++ compiler/record-minmax/src/Utils.cpp | 17 +++ 4 files changed, 175 insertions(+) create mode 100644 compiler/record-minmax/include/DirectoryIterator.h create mode 100644 compiler/record-minmax/src/DirectoryIterator.cpp diff --git a/compiler/record-minmax/include/DirectoryIterator.h b/compiler/record-minmax/include/DirectoryIterator.h new file mode 100644 index 00000000000..9a68c74bd22 --- /dev/null +++ b/compiler/record-minmax/include/DirectoryIterator.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __RECORD_MINMAX_DIRECTORY_ITERATOR_H__ +#define __RECORD_MINMAX_DIRECTORY_ITERATOR_H__ + +#include "DataBuffer.h" +#include "DataSetIterator.h" + +#include +#include + +#include +#include +#include + +namespace record_minmax +{ + +class DirectoryIterator final : public DataSetIterator +{ +public: + DirectoryIterator(const std::string &dir_path, luci::Module *module); + + bool hasNext() const override; + + std::vector next() override; + + bool check_type_shape() const override; + +private: + std::vector _entries; + uint32_t _curr_idx = 0; + std::string _dir_path; + std::vector _input_nodes; +}; + +} // namespace record_minmax + +#endif // __RECORD_MINMAX_DIRECTORY_ITERATOR_H__ diff --git a/compiler/record-minmax/include/Utils.h b/compiler/record-minmax/include/Utils.h index 7c74fe25e1e..d7333add27a 100644 --- a/compiler/record-minmax/include/Utils.h +++ b/compiler/record-minmax/include/Utils.h @@ -31,6 +31,9 @@ uint32_t numElements(const luci::CircleNode *node); // Return the node's output tensor size in bytes size_t getTensorSize(const luci::CircleNode *node); +// Read data from file into buffer with specified size in bytes +void readDataFromFile(const std::string &filename, std::vector &data, size_t data_size); + } // namespace record_minmax #endif // __RECORD_MINMAX_UTILS_H__ diff --git a/compiler/record-minmax/src/DirectoryIterator.cpp b/compiler/record-minmax/src/DirectoryIterator.cpp new file mode 100644 index 00000000000..555a123b16b --- /dev/null +++ b/compiler/record-minmax/src/DirectoryIterator.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DirectoryIterator.h" +#include "DataBuffer.h" +#include "Utils.h" + +#include + +#include +#include +#include // For memcpy + +#include + +namespace record_minmax +{ + +DirectoryIterator::DirectoryIterator(const std::string &dir_path, luci::Module *module) + : _dir_path(dir_path) +{ + auto dir = opendir(dir_path.c_str()); + if (not dir) + throw std::runtime_error("Cannot open directory. Please check \"" + _dir_path + + "\" is a directory.\n"); + + dirent *entry = nullptr; + while ((entry = readdir(dir))) + { + if (entry->d_type != DT_REG) + continue; + + _entries.emplace_back(entry); + } + + auto input_nodes = loco::input_nodes(module->graph()); + for (auto input_node : input_nodes) + { + const auto cnode = loco::must_cast(input_node); + _input_nodes.emplace_back(cnode); + } +} + +bool DirectoryIterator::hasNext() const { return _curr_idx < _entries.size(); } + +std::vector DirectoryIterator::next() +{ + auto entry = _entries.at(_curr_idx++); + assert(entry); // FIX_ME_UNLESS + + // Get total input size + uint32_t total_input_size = 0; + for (auto input : _input_nodes) + { + const auto *input_node = loco::must_cast(input); + total_input_size += getTensorSize(input_node); + } + + const std::string filename = entry->d_name; + + // Read data from file to buffer + // Assumption: For a multi-input model, the binary file should have inputs concatenated in the + // same order with the input index. 
+ std::vector input_data(total_input_size); + readDataFromFile(_dir_path + "/" + filename, input_data, total_input_size); + + std::vector res; + + uint32_t offset = 0; + for (auto input_node : _input_nodes) + { + DataBuffer buf; + + const auto input_size = getTensorSize(input_node); + + buf.data.resize(input_size); + memcpy(buf.data.data(), input_data.data() + offset, input_size); + + offset += input_size; + + res.emplace_back(buf); + } + + return res; +} + +bool DirectoryIterator::check_type_shape() const { return false; } + +} // namespace record_minmax diff --git a/compiler/record-minmax/src/Utils.cpp b/compiler/record-minmax/src/Utils.cpp index dd8fa7a05fe..76c3dc22ed0 100644 --- a/compiler/record-minmax/src/Utils.cpp +++ b/compiler/record-minmax/src/Utils.cpp @@ -19,6 +19,10 @@ #include #include +#include +#include +#include + namespace record_minmax { @@ -46,4 +50,17 @@ size_t getTensorSize(const luci::CircleNode *node) return numElements(node) * elem_size; } +void readDataFromFile(const std::string &filename, std::vector &data, size_t data_size) +{ + assert(data.size() == data_size); // FIX_CALLER_UNLESS + + std::ifstream fs(filename, std::ifstream::binary); + if (fs.fail()) + throw std::runtime_error("Cannot open file \"" + filename + "\".\n"); + if (fs.read(data.data(), data_size).fail()) + throw std::runtime_error("Failed to read data from file \"" + filename + "\".\n"); + if (fs.peek() != EOF) + throw std::runtime_error("Input tensor size mismatches with \"" + filename + "\".\n"); +} + } // namespace record_minmax From 043e9e2f7a8c7b21b28f742d2871e8297b7d2901 Mon Sep 17 00:00:00 2001 From: Hyukjin Jeong Date: Mon, 28 Oct 2024 14:37:20 +0900 Subject: [PATCH 22/46] [record-minmax] Introduce HDF5Iterator (#14261) This introduces an iterator for HDF5 format. 
ONE-DCO-1.0-Signed-off-by: Hyukjin Jeong --- compiler/record-minmax/include/HDF5Iterator.h | 54 ++++++++++ compiler/record-minmax/src/HDF5Iterator.cpp | 100 ++++++++++++++++++ 2 files changed, 154 insertions(+) create mode 100644 compiler/record-minmax/include/HDF5Iterator.h create mode 100644 compiler/record-minmax/src/HDF5Iterator.cpp diff --git a/compiler/record-minmax/include/HDF5Iterator.h b/compiler/record-minmax/include/HDF5Iterator.h new file mode 100644 index 00000000000..a810aaa3f26 --- /dev/null +++ b/compiler/record-minmax/include/HDF5Iterator.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __RECORD_MINMAX_HDF5_ITERATOR_H__ +#define __RECORD_MINMAX_HDF5_ITERATOR_H__ + +#include "DataBuffer.h" +#include "DataSetIterator.h" + +#include +#include +#include + +#include +#include + +namespace record_minmax +{ + +class HDF5Iterator final : public DataSetIterator +{ +public: + HDF5Iterator(const std::string &file_path, luci::Module *module); + + bool hasNext() const override; + + std::vector next() override; + + bool check_type_shape() const override; + +private: + dio::hdf5::HDF5Importer _importer; + std::vector _input_nodes; + bool _is_raw_data = false; + uint32_t _curr_idx = 0; + uint32_t _num_data = 0; +}; + +} // namespace record_minmax + +#endif // __RECORD_MINMAX_HDF5_ITERATOR_H__ diff --git a/compiler/record-minmax/src/HDF5Iterator.cpp b/compiler/record-minmax/src/HDF5Iterator.cpp new file mode 100644 index 00000000000..aaadc674b63 --- /dev/null +++ b/compiler/record-minmax/src/HDF5Iterator.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "HDF5Iterator.h" +#include "DataBuffer.h" +#include "Utils.h" + +#include + +#include +#include + +namespace record_minmax +{ + +HDF5Iterator::HDF5Iterator(const std::string &file_path, luci::Module *module) + : _importer(file_path) +{ + try + { + _importer.importGroup("value"); + + _is_raw_data = _importer.isRawData(); + + _num_data = _importer.numData(); + } + catch (const H5::Exception &e) + { + H5::Exception::printErrorStack(); + throw std::runtime_error("HDF5 error occurred during initialization."); + } + + auto input_nodes = loco::input_nodes(module->graph()); + for (auto input_node : input_nodes) + { + const auto cnode = loco::must_cast(input_node); + _input_nodes.emplace_back(cnode); + } +} + +bool HDF5Iterator::hasNext() const { return _curr_idx < _num_data; } + +std::vector HDF5Iterator::next() +{ + std::vector res; + + try + { + for (int32_t input_idx = 0; input_idx < _importer.numInputs(_curr_idx); input_idx++) + { + DataBuffer buf; + + const auto input_node = _input_nodes.at(input_idx); + const auto input_size = getTensorSize(input_node); + buf.data.resize(input_size); + + if (check_type_shape()) + { + _importer.readTensor(_curr_idx, input_idx, &buf.dtype, &buf.shape, buf.data.data(), + input_size); + } + else + { + _importer.readTensor(_curr_idx, input_idx, buf.data.data(), input_size); + } + + res.emplace_back(buf); + } + } + catch (const H5::Exception &e) + { + H5::Exception::printErrorStack(); + throw std::runtime_error("HDF5 error occurred during iteration."); + } + + _curr_idx++; // move to the next index + + return res; +} + +bool HDF5Iterator::check_type_shape() const +{ + // If it's raw data, we don't need to check type and shape + return not _is_raw_data; +} + +} // namespace record_minmax From ad94b9bd10704b56db99360f946778cbe79a2dd6 Mon Sep 17 00:00:00 2001 From: Hyukjin Jeong Date: Mon, 28 Oct 2024 16:29:36 +0900 Subject: [PATCH 23/46] [record-minmax] Introduce ListFileIterator (#14263) * [record-minmax] Introduce 
ListFileIterator This introduces an iterator for list format. ONE-DCO-1.0-Signed-off-by: Hyukjin Jeong * Update comments --- .../record-minmax/include/ListFileIterator.h | 51 ++++++ .../record-minmax/src/ListFileIterator.cpp | 153 ++++++++++++++++++ 2 files changed, 204 insertions(+) create mode 100644 compiler/record-minmax/include/ListFileIterator.h create mode 100644 compiler/record-minmax/src/ListFileIterator.cpp diff --git a/compiler/record-minmax/include/ListFileIterator.h b/compiler/record-minmax/include/ListFileIterator.h new file mode 100644 index 00000000000..969863b7391 --- /dev/null +++ b/compiler/record-minmax/include/ListFileIterator.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __RECORD_MINMAX_LIST_FILE_ITERATOR_H__ +#define __RECORD_MINMAX_LIST_FILE_ITERATOR_H__ + +#include "DataBuffer.h" +#include "DataSetIterator.h" + +#include +#include + +#include +#include + +namespace record_minmax +{ + +class ListFileIterator final : public DataSetIterator +{ +public: + ListFileIterator(const std::string &input_path, luci::Module *module); + + bool hasNext() const override; + + std::vector next() override; + + bool check_type_shape() const override; + +private: + std::vector _lines; + uint32_t _curr_idx = 0; + std::vector _input_nodes; +}; + +} // namespace record_minmax + +#endif // __RECORD_MINMAX_LIST_FILE_ITERATOR_H__ diff --git a/compiler/record-minmax/src/ListFileIterator.cpp b/compiler/record-minmax/src/ListFileIterator.cpp new file mode 100644 index 00000000000..b03a308f8ec --- /dev/null +++ b/compiler/record-minmax/src/ListFileIterator.cpp @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ListFileIterator.h" +#include "DataBuffer.h" +#include "Utils.h" + +#include + +#include +#include +#include // For std::stringstream + +namespace +{ + +// Return a string with no whitespace from both ends +std::string trim(std::string s) +{ + // Trim left side + s.erase(s.begin(), + std::find_if(s.begin(), s.end(), [](unsigned char ch) { return !std::isspace(ch); })); + + // Trim right side + s.erase( + std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) { return !std::isspace(ch); }).base(), + s.end()); + + return s; +} + +// Return a vector of strings after splitting by space +std::vector parse_line(const std::string &line) +{ + auto trimmed = trim(line); + std::stringstream ss(trimmed); + + std::vector res; + + std::string filename; + while (getline(ss, filename, ' ')) + { + res.emplace_back(filename); + } + return res; +} + +} // namespace + +namespace record_minmax +{ + +ListFileIterator::ListFileIterator(const std::string &input_path, luci::Module *module) +{ + std::ifstream input_file(input_path); + if (input_file.fail()) + throw std::runtime_error("Cannot open file \"" + input_path + "\".\n"); + + auto input_nodes = loco::input_nodes(module->graph()); + for (auto input_node : input_nodes) + { + const auto cnode = loco::must_cast(input_node); + _input_nodes.emplace_back(cnode); + } + + std::string record; + while (getline(input_file, record)) + { + _lines.emplace_back(record); + } + + if (_lines.size() == 0) + throw std::runtime_error("The input data file does not contain any record."); +} + +bool ListFileIterator::hasNext() const { return _curr_idx < _lines.size(); } + +std::vector ListFileIterator::next() +{ + const auto line = _lines.at(_curr_idx++); + + const auto file_names = parse_line(line); + + std::vector res; + + // Space-separated input files are written in a single line + // This is the recommended way to write the list file + if (file_names.size() == _input_nodes.size()) + { + for (uint32_t i = 0; i < file_names.size(); 
i++) + { + DataBuffer buf; + { + const auto file_name = file_names.at(i); + const auto input_node = _input_nodes.at(i); + const auto input_size = getTensorSize(input_node); + + buf.data.resize(input_size); + + readDataFromFile(file_name, buf.data, input_size); + } + + res.emplace_back(buf); + } + } + else + { + // Must have a single file in one line (inputs are concatenated) + if (file_names.size() != 1) + throw std::runtime_error( + "Wrong number of inputs are given. Model has " + std::to_string(_input_nodes.size()) + + " inputs, but list file gives " + std::to_string(file_names.size()) + " inputs."); + + // Read data from file to buffer + // Assumption: For a multi-input model, the binary file should have inputs concatenated in the + // same order with the input index. + // NOTE This is a legacy way to support multiple inputs. + DataBuffer buf; + { + // Get total input size + uint32_t total_input_size = 0; + for (auto input_node : _input_nodes) + { + total_input_size += getTensorSize(input_node); + } + + buf.data.resize(total_input_size); + + readDataFromFile(file_names.at(0), buf.data, total_input_size); + } + + res.emplace_back(buf); + } + + return res; +} + +bool ListFileIterator::check_type_shape() const { return false; } + +} // namespace record_minmax From a97f907ccce0af37890ded9875939a29ded1213c Mon Sep 17 00:00:00 2001 From: Hyukjin Jeong Date: Mon, 28 Oct 2024 17:41:15 +0900 Subject: [PATCH 24/46] [record-minmax] Use unified profile function (#14266) This uses unified profile function. 
ONE-DCO-1.0-Signed-off-by: Hyukjin Jeong --- compiler/record-minmax/driver/Driver.cpp | 21 +++- compiler/record-minmax/include/RecordMinMax.h | 31 ++++- compiler/record-minmax/include/Utils.h | 5 + compiler/record-minmax/src/RecordMinMax.cpp | 119 ++++++++++++++++-- compiler/record-minmax/src/Utils.cpp | 12 ++ compiler/record-minmax/tests/Utils.test.cpp | 38 ++++++ 6 files changed, 210 insertions(+), 16 deletions(-) diff --git a/compiler/record-minmax/driver/Driver.cpp b/compiler/record-minmax/driver/Driver.cpp index 8f25ebff5e8..d472b5e5622 100644 --- a/compiler/record-minmax/driver/Driver.cpp +++ b/compiler/record-minmax/driver/Driver.cpp @@ -43,6 +43,8 @@ int entry(const int argc, char **argv) { using namespace record_minmax; + using DataSetFormat = RecordMinMax::DataSetFormat; + LOGGER(l); arser::Arser arser( @@ -160,6 +162,8 @@ int entry(const int argc, char **argv) { auto input_data_path = arser.get("--input_data"); + rmm.setInputDataPath(input_data_path); + // TODO: support parallel record from file and dir input data format if (num_threads > 1 and not(input_data_format == "h5") and not(input_data_format == "hdf5")) { @@ -170,11 +174,18 @@ int entry(const int argc, char **argv) { // Profile min/max while executing the H5 data if (num_threads == 1) - rmm.profileData(input_data_path); + { + rmm.setDataSetFormat(DataSetFormat::H5); + } else { INFO(l) << "Using parallel recording" << std::endl; rmm.profileDataInParallel(input_data_path); + + // Save profiled values to the model + rmm.saveModel(output_model_path); + + return EXIT_SUCCESS; } } // input_data is a text file having a file path in each line. 
@@ -188,13 +199,13 @@ int entry(const int argc, char **argv) else if (input_data_format == "list" || input_data_format == "filelist") { // Profile min/max while executing the list of Raw data - rmm.profileRawData(input_data_path); + rmm.setDataSetFormat(DataSetFormat::LIST_FILE); } else if (input_data_format == "directory" || input_data_format == "dir") { // Profile min/max while executing all files under the given directory // The contents of each file is same as the raw data in the 'list' type - rmm.profileRawDataDirectory(input_data_path); + rmm.setDataSetFormat(DataSetFormat::DIRECTORY); } else { @@ -205,9 +216,11 @@ int entry(const int argc, char **argv) else { // Profile min/max while executing random input data - rmm.profileDataWithRandomInputs(); + rmm.setDataSetFormat(DataSetFormat::RANDOM); } + rmm.profileData(); + // Save profiled values to the model rmm.saveModel(output_model_path); diff --git a/compiler/record-minmax/include/RecordMinMax.h b/compiler/record-minmax/include/RecordMinMax.h index 758e8a92436..b11898c9640 100644 --- a/compiler/record-minmax/include/RecordMinMax.h +++ b/compiler/record-minmax/include/RecordMinMax.h @@ -20,6 +20,7 @@ #include #include +#include "DataSetIterator.h" #include "MinMaxObserver.h" #include "MinMaxComputer.h" @@ -36,6 +37,15 @@ using WholeOutput = std::vector; class RecordMinMax { public: + enum DataSetFormat + { + UNKNOWN, // To check if format is set properly + RANDOM, + H5, + DIRECTORY, + LIST_FILE, + }; + explicit RecordMinMax(uint32_t num_threads, std::unique_ptr &&minmax_computer) : _threads_size(num_threads), _minmax_computer(std::move(minmax_computer)) { @@ -47,16 +57,28 @@ class RecordMinMax void initialize(const std::string &input_model_path); - // TODO Refactor profile functions - void profileData(const std::string &input_data_path); + void setDataSetFormat(DataSetFormat format) + { + assert(format != DataSetFormat::UNKNOWN); // FIX_CALLER UNLESS + _data_set_format = format; + } + + DataSetFormat 
getDataSetFormat() const { return _data_set_format; } + + void setInputDataPath(const std::string &input_data_path) { _input_data_path = input_data_path; } + + void profileData(); void profileDataInParallel(const std::string &input_data_path); +// TODO Remove unused code +#if 0 void profileRawData(const std::string &input_data_path); void profileRawDataDirectory(const std::string &input_data_path); void profileDataWithRandomInputs(void); +#endif void saveModel(const std::string &output_model_path); @@ -71,6 +93,8 @@ class RecordMinMax return _observers[0].get(); } + std::unique_ptr createIterator(); + WholeOutput importH5Data(const std::string &input_data_path); std::unique_ptr _module; @@ -81,6 +105,9 @@ class RecordMinMax uint32_t _threads_size = 0; std::unique_ptr _minmax_computer; + + DataSetFormat _data_set_format = UNKNOWN; + std::string _input_data_path; }; } // namespace record_minmax diff --git a/compiler/record-minmax/include/Utils.h b/compiler/record-minmax/include/Utils.h index d7333add27a..a979b9f0fb3 100644 --- a/compiler/record-minmax/include/Utils.h +++ b/compiler/record-minmax/include/Utils.h @@ -34,6 +34,11 @@ size_t getTensorSize(const luci::CircleNode *node); // Read data from file into buffer with specified size in bytes void readDataFromFile(const std::string &filename, std::vector &data, size_t data_size); +// Throw exception if input has one of the following conditions. +// 1. Have unknown dimension +// 2. 
Number of elements is 0 +void checkInputDimension(const luci::CircleInput *input); + } // namespace record_minmax #endif // __RECORD_MINMAX_UTILS_H__ diff --git a/compiler/record-minmax/src/RecordMinMax.cpp b/compiler/record-minmax/src/RecordMinMax.cpp index 5f7f65417a6..1c646b11b9d 100644 --- a/compiler/record-minmax/src/RecordMinMax.cpp +++ b/compiler/record-minmax/src/RecordMinMax.cpp @@ -16,23 +16,19 @@ #include "RecordMinMax.h" #include "MinMaxObserver.h" +#include "DataSetIterator.h" +#include "HDF5Iterator.h" +#include "RandomIterator.h" +#include "DirectoryIterator.h" +#include "ListFileIterator.h" +#include "Utils.h" -#include #include #include #include -#include #include -#include -#include -#include -#include -#include -#include #include -#include -#include using Shape = std::vector; using DataType = loco::DataType; @@ -40,6 +36,8 @@ using DataType = loco::DataType; namespace { +// TODO Remove unused code +#if 0 // Return a string with no whitespace from both ends std::string trim(std::string s) { @@ -69,6 +67,7 @@ std::vector parse_line(const std::string &line) } return res; } +#endif // Max h5 file size for parallel recording in bytes = 1 GB const long h5_max_size_bytes = 1000000000; @@ -81,6 +80,8 @@ long getH5FileSize(const std::string &input_data_path) return in_file.tellg(); } +// TODO Remove unused code +#if 0 uint32_t numElements(const luci::CircleNode *node) { uint32_t num_elements = 1; @@ -153,6 +154,7 @@ template size_t getTensorSize(const NodeT *node) tensor_size *= node->dim(i).value(); return tensor_size; } +#endif /** * @brief verifyTypeShape checks the type and the shape of CircleInput @@ -207,6 +209,9 @@ void RecordMinMax::initialize(const std::string &input_model_path) } } +// TODO Remove unused code +#if 0 + // input_data_path is a path to the directory // The directory should contain binary files each of which is a raw data, // ready to be consumed by the input circle model without any modification @@ -368,6 +373,7 @@ void 
RecordMinMax::profileRawData(const std::string &input_data_path) _minmax_computer->update_qparam(getObserver()->minMaxData()->getMap()); } +#endif WholeOutput RecordMinMax::importH5Data(const std::string &input_data_path) { @@ -427,6 +433,95 @@ WholeOutput RecordMinMax::importH5Data(const std::string &input_data_path) } } +std::unique_ptr RecordMinMax::createIterator() +{ + assert(_data_set_format != DataSetFormat::UNKNOWN); // FIX_CALLER_UNLESS + + std::unique_ptr iterator; + switch (_data_set_format) + { + case DataSetFormat::H5: + assert(not _input_data_path.empty()); // FIX_CALLER_UNLESS + iterator = std::make_unique(_input_data_path, _module.get()); + break; + case DataSetFormat::RANDOM: + iterator = std::make_unique(_module.get()); + break; + case DataSetFormat::DIRECTORY: + iterator = std::make_unique(_input_data_path, _module.get()); + break; + case DataSetFormat::LIST_FILE: + iterator = std::make_unique(_input_data_path, _module.get()); + break; + default: + throw std::runtime_error("Unsupported dataset format"); + } + + assert(iterator.get() != nullptr); // FIX_ME_UNLESS + + return iterator; +} + +void RecordMinMax::profileData() +{ + assert(getDataSetFormat() != DataSetFormat::UNKNOWN); // FIX_CALLER_UNLESS + + const auto input_nodes = loco::input_nodes(_module->graph()); + for (auto input_node : input_nodes) + { + const auto *input_cnode = loco::must_cast(input_node); + checkInputDimension(input_cnode); + } + + const auto num_inputs = input_nodes.size(); + + auto iter = createIterator(); + + bool check_type_shape = iter->check_type_shape(); + + if (not iter->hasNext()) + throw std::runtime_error("The input data file does not contain any record."); + + uint32_t record_idx = 0; + while (iter->hasNext()) + { + const auto &record = iter->next(); + + if (num_inputs != record.size()) + throw std::runtime_error("Wrong number of inputs."); + + std::cout << "Recording " << record_idx << "'th data" << std::endl; + + // Write input data to interpreter + for 
(uint32_t input_idx = 0; input_idx < num_inputs; input_idx++) + { + const auto *input_node = loco::must_cast(input_nodes[input_idx]); + assert(input_node->index() == input_idx); + + const auto input_data = record.at(input_idx); + + if (check_type_shape) + { + // Check the type and the shape of the input data is valid + verifyTypeShape(input_node, input_data.dtype, input_data.shape); + } + + getInterpreter()->writeInputTensor(input_node, input_data.data.data(), + input_data.data.size()); + } + + getInterpreter()->interpret(); + + record_idx++; + } + + std::cout << "Recording finished. Number of recorded data: " << record_idx << std::endl; + + _minmax_computer->update_qparam(getObserver()->minMaxData()->getMap()); +} + +// TODO Remove unused code +#if 0 void RecordMinMax::profileData(const std::string &input_data_path) { try @@ -491,6 +586,7 @@ void RecordMinMax::profileData(const std::string &input_data_path) _minmax_computer->update_qparam(getObserver()->minMaxData()->getMap()); } +#endif void RecordMinMax::profileDataInParallel(const std::string &input_data_path) { @@ -578,6 +674,8 @@ void RecordMinMax::profileDataInParallel(const std::string &input_data_path) _minmax_computer->update_qparam(main_min_max_map.getMap()); } +// TODO Remove unused code +#if 0 void RecordMinMax::profileDataWithRandomInputs(void) { // We use three randomly-generated records @@ -648,6 +746,7 @@ void RecordMinMax::profileDataWithRandomInputs(void) _minmax_computer->update_qparam(getObserver()->minMaxData()->getMap()); } +#endif void RecordMinMax::saveModel(const std::string &output_model_path) { diff --git a/compiler/record-minmax/src/Utils.cpp b/compiler/record-minmax/src/Utils.cpp index 76c3dc22ed0..116878b6891 100644 --- a/compiler/record-minmax/src/Utils.cpp +++ b/compiler/record-minmax/src/Utils.cpp @@ -26,6 +26,18 @@ namespace record_minmax { +void checkInputDimension(const luci::CircleInput *input) +{ + assert(input); // FIX_CALLER_UNLESS + + for (uint32_t i = 0; i < input->rank(); 
i++) + if (!input->dim(i).known()) + throw std::runtime_error(input->name() + " has unknown dimension"); + + if (numElements(input) == 0) + throw std::runtime_error(input->name() + " is a zero-sized input"); +} + uint32_t numElements(const luci::CircleNode *node) { assert(node); // FIX_CALLER_UNLESS diff --git a/compiler/record-minmax/tests/Utils.test.cpp b/compiler/record-minmax/tests/Utils.test.cpp index 6bac8fc6cb8..7ae7da40959 100644 --- a/compiler/record-minmax/tests/Utils.test.cpp +++ b/compiler/record-minmax/tests/Utils.test.cpp @@ -73,3 +73,41 @@ TEST(UtilsTest, get_tensor_size_NEG) EXPECT_ANY_THROW(getTensorSize(&node)); } + +TEST(UtilsTest, check_input_dimension) +{ + luci::CircleInput node; + node.rank(3); + node.dim(0).set(1); + node.dim(1).set(2); + node.dim(2).set(3); + node.dtype(loco::DataType::FLOAT32); + + EXPECT_NO_THROW(checkInputDimension(&node)); +} + +TEST(UtilsTest, check_input_dimension_unknown_dim_NEG) +{ + luci::CircleInput node; + node.rank(3); + node.dim(0).set(1); + node.dim(1).set(2); + node.dim(2).set(3); + node.dtype(loco::DataType::FLOAT32); + + node.dim(0).unset(); + + EXPECT_ANY_THROW(checkInputDimension(&node)); +} + +TEST(UtilsTest, check_input_dimension_zero_dim_NEG) +{ + luci::CircleInput node; + node.rank(3); + node.dim(0).set(1); + node.dim(1).set(2); + node.dim(2).set(0); + node.dtype(loco::DataType::FLOAT32); + + EXPECT_ANY_THROW(checkInputDimension(&node)); +} From 2fc073c9634fcba6e4b65861fd07897219d686d6 Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Mon, 28 Oct 2024 22:39:44 +0100 Subject: [PATCH 25/46] [caffegen] Minor doc fix (#14269) This corrects typos and formatting. 
ONE-DCO-1.0-Signed-off-by: Piotr Fusik --- compiler/caffegen/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/compiler/caffegen/README.md b/compiler/caffegen/README.md index c322721b3cc..4be0c8a3908 100644 --- a/compiler/caffegen/README.md +++ b/compiler/caffegen/README.md @@ -5,16 +5,16 @@ ## How caffegen works Some of commands in `caffegen` use standard input for reading data and standard output for exporting result. -In this case, we strongly recommand you to use pipe, not copy & paste the content of file itself. +In this case, we strongly recommend you to use pipe, not copy & paste the content of file itself. Otherwise, `caffegen` use arguments to pass some directories. ## Supported command -Basically, caffgen command is used as `caffegen [COMMAND]` and there are four `COMMAND` types. +Basically, caffegen command is used as `caffegen [COMMAND]` and there are four `COMMAND` types: - init : initialize parameters using prototxt. - - encode : make a binary file(caffemodel) using initialized data - - decode : decode a binary file(caffemodel) and reproduce the initialized data + - encode : make a binary file (caffemodel) using initialized data + - decode : decode a binary file (caffemodel) and reproduce the initialized data - merge : copy the trained weights from a caffemodel into a prototxt file ## How to use each command From 1167c7824235c5aa613240be07300aabf6b9ae7f Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Mon, 28 Oct 2024 22:40:27 +0100 Subject: [PATCH 26/46] [circle2circle] Minor doc fix (#14270) This adds a missing article. 
ONE-DCO-1.0-Signed-off-by: Piotr Fusik --- compiler/circle2circle/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler/circle2circle/README.md b/compiler/circle2circle/README.md index 3e94d25402e..6c26a794fbd 100644 --- a/compiler/circle2circle/README.md +++ b/compiler/circle2circle/README.md @@ -1,3 +1,3 @@ # circle2circle -_circle2circle_ provides Circle optimizations as executable tool +_circle2circle_ provides Circle optimizations as an executable tool From 819f2edd057cf3863a7604efa237ef1d4c1e61b9 Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Mon, 28 Oct 2024 22:41:23 +0100 Subject: [PATCH 27/46] [kuma] Fix a typo (#14271) This corrects a doc typo. ONE-DCO-1.0-Signed-off-by: Piotr Fusik --- compiler/kuma/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler/kuma/README.md b/compiler/kuma/README.md index 7e5123968b2..f9a99145781 100644 --- a/compiler/kuma/README.md +++ b/compiler/kuma/README.md @@ -4,4 +4,4 @@ _kuma_ is a collection of offline memory allocators. ## What does "kuma" mean? -_kuma_ originates from _cooma_ which is an abbreviation of **C**ollection **O**f **O**ffline **M**emory **A**lloators. +_kuma_ originates from _cooma_ which is an abbreviation of **C**ollection **O**f **O**ffline **M**emory **A**llocators. From 1a3007cc345cafa9a554e0e1dfc25a274b3d52ea Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Mon, 28 Oct 2024 22:48:52 +0100 Subject: [PATCH 28/46] [angkor] Minor doc fix (#14268) This corrects a typo and formatting. 
ONE-DCO-1.0-Signed-off-by: Piotr Fusik --- compiler/angkor/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler/angkor/README.md b/compiler/angkor/README.md index f761b874060..54907bc55d3 100644 --- a/compiler/angkor/README.md +++ b/compiler/angkor/README.md @@ -2,11 +2,11 @@ ## Purpose -_angkor_ is a `nncc` core library +_angkor_ is an `nncc` core library ## How to use -_angkor_ implements abstract data type(ADT) for feature, kernel, tensor. +_angkor_ implements abstract data type (ADT) for feature, kernel, tensor. There are layout, shape information and enumerator and so on. To use some of these things, just insert `include`! From 00dc8a9c543ef2836f30e2eb6a4a6a98de69b4bb Mon Sep 17 00:00:00 2001 From: Hyukjin Jeong Date: Tue, 29 Oct 2024 09:55:58 +0900 Subject: [PATCH 29/46] [record-minmax] Tidy RecordMinMax (#14267) This tidies RecordMinMax source code. - Remove unused code - Use proper test name ONE-DCO-1.0-Signed-off-by: Hyukjin Jeong --- compiler/record-minmax/CMakeLists.txt | 8 +- compiler/record-minmax/include/RecordMinMax.h | 9 - compiler/record-minmax/src/RecordMinMax.cpp | 417 ------------------ 3 files changed, 4 insertions(+), 430 deletions(-) diff --git a/compiler/record-minmax/CMakeLists.txt b/compiler/record-minmax/CMakeLists.txt index 6755de36eda..db02fee7625 100644 --- a/compiler/record-minmax/CMakeLists.txt +++ b/compiler/record-minmax/CMakeLists.txt @@ -64,7 +64,7 @@ set(TEST_SOURCES file(GLOB_RECURSE TESTS "tests/*.test.cpp") nnas_find_package(GTest REQUIRED) -GTest_AddTest(record_minmax_function_test ${TESTS} ${TEST_SOURCES}) -target_include_directories(record_minmax_function_test PRIVATE include) -target_link_libraries(record_minmax_function_test luci_lang) -target_link_libraries(record_minmax_function_test nncc_coverage) +GTest_AddTest(record_minmax_unittest ${TESTS} ${TEST_SOURCES}) +target_include_directories(record_minmax_unittest PRIVATE include) +target_link_libraries(record_minmax_unittest luci_lang) 
+target_link_libraries(record_minmax_unittest nncc_coverage) diff --git a/compiler/record-minmax/include/RecordMinMax.h b/compiler/record-minmax/include/RecordMinMax.h index b11898c9640..e6b289f361b 100644 --- a/compiler/record-minmax/include/RecordMinMax.h +++ b/compiler/record-minmax/include/RecordMinMax.h @@ -71,15 +71,6 @@ class RecordMinMax void profileDataInParallel(const std::string &input_data_path); -// TODO Remove unused code -#if 0 - void profileRawData(const std::string &input_data_path); - - void profileRawDataDirectory(const std::string &input_data_path); - - void profileDataWithRandomInputs(void); -#endif - void saveModel(const std::string &output_model_path); private: diff --git a/compiler/record-minmax/src/RecordMinMax.cpp b/compiler/record-minmax/src/RecordMinMax.cpp index 1c646b11b9d..9069a8adfaa 100644 --- a/compiler/record-minmax/src/RecordMinMax.cpp +++ b/compiler/record-minmax/src/RecordMinMax.cpp @@ -36,39 +36,6 @@ using DataType = loco::DataType; namespace { -// TODO Remove unused code -#if 0 -// Return a string with no whitespace from both ends -std::string trim(std::string s) -{ - // Trim left side - s.erase(s.begin(), - std::find_if(s.begin(), s.end(), [](unsigned char ch) { return !std::isspace(ch); })); - - // Trim right side - s.erase( - std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) { return !std::isspace(ch); }).base(), - s.end()); - - return s; -} - -std::vector parse_line(const std::string &line) -{ - auto trimmed = trim(line); - std::stringstream ss(trimmed); - - std::vector res; - - std::string filename; - while (getline(ss, filename, ' ')) - { - res.emplace_back(filename); - } - return res; -} -#endif - // Max h5 file size for parallel recording in bytes = 1 GB const long h5_max_size_bytes = 1000000000; @@ -80,82 +47,6 @@ long getH5FileSize(const std::string &input_data_path) return in_file.tellg(); } -// TODO Remove unused code -#if 0 -uint32_t numElements(const luci::CircleNode *node) -{ - uint32_t num_elements = 1; 
- for (uint32_t i = 0; i < node->rank(); i++) - num_elements *= node->dim(i).value(); - - return num_elements; -} - -// Throw exception if input has one of the following conditions. -// 1. Have unknown dimension -// 2. Number of elements is 0 -void checkInputDimension(const luci::CircleInput *input) -{ - for (uint32_t i = 0; i < input->rank(); i++) - if (!input->dim(i).known()) - throw std::runtime_error(input->name() + " has unknown dimension"); - - if (numElements(input) == 0) - throw std::runtime_error(input->name() + " is a zero-sized input"); -} - -void readDataFromFile(const std::string &filename, std::vector &data, size_t data_size) -{ - assert(data.size() == data_size); // FIX_CALLER_UNLESS - - std::ifstream fs(filename, std::ifstream::binary); - if (fs.fail()) - throw std::runtime_error("Cannot open file \"" + filename + "\".\n"); - if (fs.read(data.data(), data_size).fail()) - throw std::runtime_error("Failed to read data from file \"" + filename + "\".\n"); - if (fs.peek() != EOF) - throw std::runtime_error("Input tensor size mismatches with \"" + filename + "\".\n"); -} - -std::vector genRandomBoolData(std::mt19937 &gen, uint32_t num_elements) -{ - std::uniform_int_distribution<> dist(0, 1); - std::vector input_data(num_elements); - - // Write random data - for (auto &iter : input_data) - iter = static_cast(dist(gen)); - - return input_data; -} - -template -std::vector genRandomIntData(std::mt19937 &gen, uint32_t num_elements, T min, T max) -{ - std::uniform_int_distribution dist(min, max); - std::vector input_data(num_elements); - - // Write random data - { - auto const generator = [&gen, &dist]() { return dist(gen); }; - std::generate(begin(input_data), end(input_data), generator); - } - - return input_data; -} - -/** - * @brief getTensorSize will return size in bytes - */ -template size_t getTensorSize(const NodeT *node) -{ - uint32_t tensor_size = luci::size(node->dtype()); - for (uint32_t i = 0; i < node->rank(); ++i) - tensor_size *= 
node->dim(i).value(); - return tensor_size; -} -#endif - /** * @brief verifyTypeShape checks the type and the shape of CircleInput * This throws an exception if type or shape does not match @@ -209,172 +100,6 @@ void RecordMinMax::initialize(const std::string &input_model_path) } } -// TODO Remove unused code -#if 0 - -// input_data_path is a path to the directory -// The directory should contain binary files each of which is a raw data, -// ready to be consumed by the input circle model without any modification -// TODO reduce duplicate codes with profileRawData -void RecordMinMax::profileRawDataDirectory(const std::string &input_data_path) -{ - struct dirent *entry = nullptr; - DIR *dp = nullptr; - - dp = opendir(input_data_path.c_str()); - if (not dp) - throw std::runtime_error("Cannot open directory. Please check \"" + input_data_path + - "\" is a directory.\n"); - - uint32_t num_records = 0; - const auto input_nodes = loco::input_nodes(_module->graph()); - - // Get total input size - uint32_t total_input_size = 0; - for (auto input : input_nodes) - { - const auto *input_node = loco::must_cast(input); - checkInputDimension(input_node); - total_input_size += getTensorSize(input_node); - } - - while ((entry = readdir(dp))) - { - // Skip if the entry is not a regular file - if (entry->d_type != DT_REG) - continue; - - const std::string filename = entry->d_name; - std::cout << "Recording " << num_records << "'th data" << std::endl; - - // Read data from file to buffer - // Assumption: For a multi-input model, the binary file should have inputs concatenated in the - // same order with the input index. 
- std::vector input_data(total_input_size); - readDataFromFile(input_data_path + "/" + filename, input_data, total_input_size); - - // Write data from buffer to interpreter - uint32_t offset = 0; - for (auto input : input_nodes) - { - const auto *input_node = loco::must_cast(input); - const auto input_size = getTensorSize(input_node); - getInterpreter()->writeInputTensor(input_node, input_data.data() + offset, input_size); - - offset += input_size; - } - - getInterpreter()->interpret(); - - num_records++; - } - - closedir(dp); - - if (num_records == 0) - throw std::runtime_error("The input data file does not contain any record."); - - std::cout << "Recording finished. Number of recorded data: " << num_records << std::endl; - - _minmax_computer->update_qparam(getObserver()->minMaxData()->getMap()); -} - -// input_data_path is a text file which specifies the representative data -// The text file should contain absolute file path per line. -// The pointed file should be a binary file containing one representative data, -// ready to be consumed by the input circle model without any modification -// NOTE If a model has multiple inputs, the binary file should have inputs concatenated in the same -// order with the input index of the circle model. 
-void RecordMinMax::profileRawData(const std::string &input_data_path) -{ - std::ifstream input_file(input_data_path); - if (input_file.fail()) - throw std::runtime_error("Cannot open file \"" + input_data_path + "\".\n"); - - std::string record; - uint32_t num_records = 0; - const auto input_nodes = loco::input_nodes(_module->graph()); - - // Get total input size - uint32_t total_input_size = 0; - for (auto input : input_nodes) - { - const auto *input_node = loco::must_cast(input); - checkInputDimension(input_node); - total_input_size += getTensorSize(input_node); - } - - while (getline(input_file, record)) - { - std::cout << "Recording " << num_records << "'th data" << std::endl; - - auto file_names = parse_line(record); - - // Have multiple files in one line - if (file_names.size() == input_nodes.size()) - { - std::vector> input_data; - for (uint32_t i = 0; i < file_names.size(); i++) - { - const auto file_name = file_names[i]; - const auto input_node = loco::must_cast(input_nodes[i]); - const auto input_size = getTensorSize(input_node); - - input_data.emplace_back(input_size); - - // Read data from file - readDataFromFile(file_name, input_data[i], input_size); - - // Write data from buffer to interpreter - getInterpreter()->writeInputTensor(input_node, input_data[i].data(), input_size); - } - - getInterpreter()->interpret(); - - num_records++; - } - else - { - // Must have a single file in one line (inputs are concatenated) - if (file_names.size() != 1) - throw std::runtime_error( - "Wrong number of inputs are given. Model has " + std::to_string(input_nodes.size()) + - " inputs, but list file gives " + std::to_string(file_names.size()) + " inputs."); - - // clang-format off - // Read data from file to buffer - // Assumption: For a multi-input model, the binary file should have inputs concatenated in the - // same order with the input index. 
- std::vector input_data(total_input_size); - readDataFromFile(record, input_data, total_input_size); - - // Write data from buffer to interpreter - uint32_t offset = 0; - for (auto input : input_nodes) - { - const auto *input_node = loco::must_cast(input); - const auto input_size = getTensorSize(input_node); - getInterpreter()->writeInputTensor(input_node, input_data.data() + offset, input_size); - - offset += input_size; - } - - getInterpreter()->interpret(); - - num_records++; - // clang-format on - } - } - - if (num_records == 0) - throw std::runtime_error("The input data file does not contain any record."); - - std::cout << "Recording finished. Number of recorded data: " << num_records << std::endl; - - _minmax_computer->update_qparam(getObserver()->minMaxData()->getMap()); -} -#endif - WholeOutput RecordMinMax::importH5Data(const std::string &input_data_path) { try @@ -520,74 +245,6 @@ void RecordMinMax::profileData() _minmax_computer->update_qparam(getObserver()->minMaxData()->getMap()); } -// TODO Remove unused code -#if 0 -void RecordMinMax::profileData(const std::string &input_data_path) -{ - try - { - dio::hdf5::HDF5Importer importer(input_data_path); - importer.importGroup("value"); - - bool is_raw_data = importer.isRawData(); - - const auto num_records = importer.numData(); - if (num_records == 0) - throw std::runtime_error("The input data file does not contain any record."); - - const auto input_nodes = loco::input_nodes(_module->graph()); - const auto num_inputs = input_nodes.size(); - - for (int32_t record_idx = 0; record_idx < num_records; record_idx++) - { - if (num_inputs != static_cast(importer.numInputs(record_idx))) - throw std::runtime_error("Wrong number of inputs."); - - std::cout << "Recording " << record_idx << "'th data" << std::endl; - - for (uint32_t input_idx = 0; input_idx < num_inputs; input_idx++) - { - const auto *input_node = loco::must_cast(input_nodes[input_idx]); - assert(input_node->index() == input_idx); - 
checkInputDimension(input_node); - std::vector input_data(getTensorSize(input_node)); - - if (!is_raw_data) - { - DataType dtype; - Shape shape; - importer.readTensor(record_idx, input_idx, &dtype, &shape, input_data.data(), - input_data.size()); - - // Check the type and the shape of the input data is valid - verifyTypeShape(input_node, dtype, shape); - } - else - { - // Skip type/shape check for raw data - importer.readTensor(record_idx, input_idx, input_data.data(), input_data.size()); - } - - // TODO: Input data is copied twice (file -> buffer (input_data) -> interpreter inputs) - // We can redcue the copy by directly writing data from file to interpreter inputs - getInterpreter()->writeInputTensor(input_node, input_data.data(), input_data.size()); - } - - getInterpreter()->interpret(); - } - - std::cout << "Recording finished. Number of recorded data: " << num_records << std::endl; - } - catch (const H5::Exception &e) - { - H5::Exception::printErrorStack(); - throw std::runtime_error("HDF5 error occurred."); - } - - _minmax_computer->update_qparam(getObserver()->minMaxData()->getMap()); -} -#endif - void RecordMinMax::profileDataInParallel(const std::string &input_data_path) { LOGGER(l); @@ -674,80 +331,6 @@ void RecordMinMax::profileDataInParallel(const std::string &input_data_path) _minmax_computer->update_qparam(main_min_max_map.getMap()); } -// TODO Remove unused code -#if 0 -void RecordMinMax::profileDataWithRandomInputs(void) -{ - // We use three randomly-generated records - const uint32_t num_records = 3; - - const auto input_nodes = loco::input_nodes(_module->graph()); - const auto num_inputs = input_nodes.size(); - - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution<> dist(-5, 5); - - for (uint32_t record_idx = 0; record_idx < num_records; record_idx++) - { - std::cout << "Recording " << record_idx << "'th data" << std::endl; - - for (uint32_t input_idx = 0; input_idx < num_inputs; input_idx++) - { - const auto 
*input_node = loco::must_cast(input_nodes[input_idx]); - assert(input_node->index() == input_idx); - checkInputDimension(input_node); - - const auto num_elements = numElements(input_node); - - // TODO Support more input data types - assert(input_node->dtype() == loco::DataType::FLOAT32 || - input_node->dtype() == loco::DataType::BOOL || - input_node->dtype() == loco::DataType::S32 || - input_node->dtype() == loco::DataType::S64); - - if (input_node->dtype() == DataType::FLOAT32) - { - std::vector input_data(num_elements); - - // Write random data - for (auto &iter : input_data) - iter = static_cast(dist(gen)); - - // TODO: Input data is copied twice (file -> buffer (input_data) -> interpreter inputs) - // We can redcue the copy by directly writing data from file to interpreter inputs - getInterpreter()->writeInputTensor(input_node, input_data.data(), - input_data.size() * sizeof(float)); - } - else if (input_node->dtype() == DataType::BOOL) - { - auto input_data = genRandomBoolData(gen, num_elements); - getInterpreter()->writeInputTensor(input_node, input_data.data(), - input_data.size() * sizeof(uint8_t)); - } - else if (input_node->dtype() == DataType::S32) - { - auto input_data = genRandomIntData(gen, num_elements, 0, 100); - getInterpreter()->writeInputTensor(input_node, input_data.data(), - input_data.size() * sizeof(int32_t)); - } - else if (input_node->dtype() == DataType::S64) - { - auto input_data = genRandomIntData(gen, num_elements, 0, 100); - getInterpreter()->writeInputTensor(input_node, input_data.data(), - input_data.size() * sizeof(int64_t)); - } - } - - getInterpreter()->interpret(); - } - - std::cout << "Recording finished. 
Number of recorded data: " << num_records << std::endl; - - _minmax_computer->update_qparam(getObserver()->minMaxData()->getMap()); -} -#endif - void RecordMinMax::saveModel(const std::string &output_model_path) { // Export to output Circle file From b6416b7e8050a1b44953d4aa1f080c9ab55c1553 Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Tue, 29 Oct 2024 10:25:18 +0100 Subject: [PATCH 30/46] [mir] Minor doc fix (#14276) This removes trailing whitespace and adds a missing article. ONE-DCO-1.0-Signed-off-by: Piotr Fusik --- compiler/mir/Readme.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/compiler/mir/Readme.md b/compiler/mir/Readme.md index 9fb1348a569..76175d5568f 100644 --- a/compiler/mir/Readme.md +++ b/compiler/mir/Readme.md @@ -21,7 +21,7 @@ special attributes specific to different operation types. Mir has a protobuf serializer/deserializer for shapes and tensors (see `mir.proto` schema). For list of currently supported operations, see `mir/ops/operations.lst.h`. - + ### How to use Can be included as a `CMake` target. @@ -29,8 +29,8 @@ Can be included as a `CMake` target. * Expand serialization * Add More to readme - + ### Dependencies -Mir depends on `adtitas` library, which provides the `small_vector` data type. - +Mir depends on the `adtitas` library, which provides the `small_vector` data type. + From 878b4c34a5c36e1314d47140285cc719f365ac5d Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Tue, 29 Oct 2024 10:27:55 +0100 Subject: [PATCH 31/46] [tf2tfliteV2] Minor doc fix (#14283) This fixes typos and formatting. ONE-DCO-1.0-Signed-off-by: Piotr Fusik --- compiler/tf2tfliteV2/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/compiler/tf2tfliteV2/README.md b/compiler/tf2tfliteV2/README.md index 0a90735cbd1..3364253e059 100644 --- a/compiler/tf2tfliteV2/README.md +++ b/compiler/tf2tfliteV2/README.md @@ -3,11 +3,11 @@ _tf2tfliteV2_ is a TensorFlow to TensorFlow Lite model Converter. 
## Where does V2 come from? -Even though we alreay have _tf2tflite_, we cannot cover all opeartors in TensorFlow. To expand coverage, we introduce _tf2tfliteV2_ which uses `TensorFlow Lite Converter`(by Google) internally. +Even though we already have _tf2tflite_, we cannot cover all operators in TensorFlow. To expand coverage, we introduce _tf2tfliteV2_ which internally uses `TensorFlow Lite Converter`(by Google). ## Prerequisite -- Frozen graph from TensorFlow 1.13.1 in binary(`*.pb`) or text(`*.pbtxt`) format -- Desired version of TensorFlow(You can use python virtualenv, docker, etc.) +- Frozen graph from TensorFlow 1.13.1 in binary (`*.pb`) or text (`*.pbtxt`) format +- Desired version of TensorFlow (you can use python virtualenv, docker, etc.) ## Example ``` @@ -42,7 +42,7 @@ python tf2tfliteV2.py \ > --output_arrays=output,output:1,output:2 ``` -## optional argument +## Optional arguments ``` -h, --help show this help message and exit --v1 Use TensorFlow Lite Converter 1.x From a6402337144d468f990269bdf3b94fb552d5801b Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Tue, 29 Oct 2024 10:43:43 +0100 Subject: [PATCH 32/46] [loco] Minor doc fix (#14272) This corrects a typo and a missing article. ONE-DCO-1.0-Signed-off-by: Piotr Fusik --- compiler/loco/doc/LEP_000_Dialect_Service.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler/loco/doc/LEP_000_Dialect_Service.md b/compiler/loco/doc/LEP_000_Dialect_Service.md index f6f6dc80922..852d9df0456 100644 --- a/compiler/loco/doc/LEP_000_Dialect_Service.md +++ b/compiler/loco/doc/LEP_000_Dialect_Service.md @@ -64,7 +64,7 @@ struct GraphOutputIndexQueryService : public DialectService This proposal extends ``Dialect`` class with ``service`` method. -Each dialect SHOULD return a valid pointer on ``service`` method call if it implements that service. Otherwise, it SHOULD return a null pointer otherwise. 
+Each dialect SHOULD return a valid pointer on ``service`` method call if it implements that service. Otherwise, it SHOULD return a null pointer. **WARNING** It is impossible to use ``get``. ``get`` is currently reserved for singleton accessor. @@ -106,7 +106,7 @@ std::vector output_nodes(loco::Graph *g) ### How to register a service -Each dialect should invoke protected ``service`` method during its construction. +Each dialect should invoke the protected ``service`` method during its construction. ```cxx AwesomeDialect::AwesomeDialect() { From 7b5a0c29ba796215c30d5ae8d000ccba31ab0fcc Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Tue, 29 Oct 2024 10:44:21 +0100 Subject: [PATCH 33/46] [locomotiv] Minor doc fix (#14274) This corrects formatting and grammar. ONE-DCO-1.0-Signed-off-by: Piotr Fusik --- compiler/locomotiv/README.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/compiler/locomotiv/README.md b/compiler/locomotiv/README.md index 9569f6ea3a7..fbbb4e618ae 100644 --- a/compiler/locomotiv/README.md +++ b/compiler/locomotiv/README.md @@ -2,7 +2,7 @@ _locomotiv_ is a reference interpreter for _loco_ IR. # Purpose -- _locomotiv_ would serve as code level specification and reference implementation for loco IR. +- _locomotiv_ would serve as code level specification and a reference implementation for loco IR. - _locomotiv_ is required for loco-related tools to be tested. # Sample code to use locomotiv library @@ -60,31 +60,31 @@ case loco::DataType::FLOAT32: 4. Test new node execution at `locomotiv/src/Node/TheNode.test.cpp` if possible. ### Note on internal data layout rule -For each domain(see `loco::Domain`), `locomotiv` has fixed layout rule on how to store its data in memory. +For each domain (see `loco::Domain`), `locomotiv` has fixed layout rule on how to store its data in memory. 
- Feature is represented as NHWC layout - - That is number of batch(N), height(H), width(W) and channel depth(C) + - That is number of batch (N), height (H), width (W) and channel depth (C) - Filter is represented as NHWC layout - - That is number of filter(N), height(H), width(W) and input channel depth(C) + - That is number of filter (N), height (H), width (W) and input channel depth (C) - DepthwiseFilter is represented as HWCM layout - - That is height(H), width(W), input channel depth(C) and depth multiplier(M) + - That is height (H), width (W), input channel depth (C) and depth multiplier (M) - Matrix is represented as HW layout - - That is height(H), width(W) + - That is height (H), width (W) ### Notes on step 3 - Mocking Tensorflow lite `reference_op.h` might be a good place to start. -- `execute()` can be called multiple time. It just recalculates and updates annotated data. So it should `erase_annot_data()` before newly `annot_data()`. +- `execute()` can be called multiple times. It just recalculates and updates annotated data. So it should `erase_annot_data()` before newly `annot_data()`. - Most node execution behaviour would be implemented for each data type. - `execute()` should throw runtime error on invalid cases. Some of these cases are explained: - Invalid argument node - - e.g.) Pull -> MaxPool2D is invalid as MaxPool2D requires feature map as its argument. + - e.g. Pull -> MaxPool2D is invalid as MaxPool2D requires feature map as its argument. - Lack of argument data - - e.g.) Given 'Pull -> Push' graph. On execution of Push, if no NodeData annotated to Pull, it is invalid. + - e.g. Given 'Pull -> Push' graph. On execution of Push, if no NodeData annotated to Pull, it is invalid. - Mismatch of argument shapes - - e.g.) Addition between 2x2 and 3x3 tensor is invalid - - e.g.) MaxPool2D expects its ifm to be 4D feature, otherwise invalid. + - e.g. Addition between 2x2 and 3x3 tensor is invalid + - e.g. 
MaxPool2D expects its ifm to be 4D feature, otherwise invalid. - Mismatch between node's own information and inferred information - Some node already have attributes like shape or data type. If inferred information is different with existing node's, it is invalid. ### Recommendation on step 4 (test) - If the node has no arguments, create a node object and `NodeExecution::run()` on it. Check whether it operates correctly. -- If the node has N(>= 1) arguments, make N pull node inputs, source them to the node to be tested. FeatureEncode or FilterEncode node may be required inbetween depending on the node's argument type. Then annotate N pull nodes with its data, `NodeExecution::run()` on the node to test, and check whether it operates correctly. +- If the node has N (>= 1) arguments, make N pull node inputs, source them to the node to be tested. FeatureEncode or FilterEncode node may be required inbetween depending on the node's argument type. Then annotate N pull nodes with its data, `NodeExecution::run()` on the node to test, and check whether it operates correctly. From 213282390b5b70a386827767af67b9eb6871a25b Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Tue, 29 Oct 2024 10:44:50 +0100 Subject: [PATCH 34/46] [moco-log] Minor doc fix (#14278) This adds a missing article. ONE-DCO-1.0-Signed-off-by: Piotr Fusik --- compiler/moco-log/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler/moco-log/README.md b/compiler/moco-log/README.md index d8289ab9b10..9f89fc865d1 100644 --- a/compiler/moco-log/README.md +++ b/compiler/moco-log/README.md @@ -1,3 +1,3 @@ # moco-log -_moco-log_ is a logging framework for _moco_ compiler framework. +_moco-log_ is a logging framework for the _moco_ compiler framework. 
From acf0e0f73b217f463581f51824a3cc6e2ee508d2 Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Tue, 29 Oct 2024 10:46:52 +0100 Subject: [PATCH 35/46] [nnc] Minor doc fixes (#14280) This removes an unneeded table of contents and fixes some typos. ONE-DCO-1.0-Signed-off-by: Piotr Fusik --- compiler/nnc/README.md | 2 +- compiler/nnc/utils/model_runner/readme.md | 8 +------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/compiler/nnc/README.md b/compiler/nnc/README.md index 538811f2d46..0fd38ec296d 100644 --- a/compiler/nnc/README.md +++ b/compiler/nnc/README.md @@ -4,7 +4,7 @@ Neural Network Compiler ### DESCRIPTION nnc is a neural network compiler that transforms neural networks of various formats into source or machine code. -> At this moment only two NN are supported (MobileNet and InceptionV3) in Tensorflow Lite or Caffe format. +> At this moment, only two NN are supported (MobileNet and InceptionV3) in Tensorflow Lite or Caffe format. ### SYNOPSIS diff --git a/compiler/nnc/utils/model_runner/readme.md b/compiler/nnc/utils/model_runner/readme.md index 51ff65b6fa8..6d5f9f0e004 100644 --- a/compiler/nnc/utils/model_runner/readme.md +++ b/compiler/nnc/utils/model_runner/readme.md @@ -1,14 +1,8 @@ # here I write how I run model on my computer -sections: -a) goal of this script -b) examples of code running in author's local machine -c) parametrs and short comment -____ ## goal of this script Here the author has attempted to implement a program capable of running any of the 4 models (caffe, caffe2, tflite, onnx) in a simple and user-friendly manner. The goal of the program is to get the file containing the output of the computation graph at the program output. -_______ ## examples of code running in author's local machine The purpose of the examples below is to demonstrate which arguments and in which order you should use to run this script correctly. 
@@ -32,7 +26,7 @@ $ python model_runner.py -m onnx_runer/model.onnx -i RANDOM.hdf5 ------ - ## parametrs and short comment + ## parameters and short comment -m mean pre learned model which you run -i mean model's input From aada331490becc11ff0cb7ce91d52a0eea8e56dd Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Wed, 30 Oct 2024 01:15:48 +0100 Subject: [PATCH 36/46] [luci] Minor doc fix (#14275) This adds missing "the" articles. ONE-DCO-1.0-Signed-off-by: Piotr Fusik --- compiler/luci/log/README.md | 2 +- compiler/luci/logex/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler/luci/log/README.md b/compiler/luci/log/README.md index 512bc96d241..24937074198 100644 --- a/compiler/luci/log/README.md +++ b/compiler/luci/log/README.md @@ -1,3 +1,3 @@ # luci-log -_luci-log_ is a logging framework for _luci_ compiler framework. +_luci-log_ is a logging framework for the _luci_ compiler framework. diff --git a/compiler/luci/logex/README.md b/compiler/luci/logex/README.md index 03b6baf35d6..0f62c6a6148 100644 --- a/compiler/luci/logex/README.md +++ b/compiler/luci/logex/README.md @@ -1,3 +1,3 @@ # luci-logex -_luci-logex_ is a extended logging utility for _luci_ compiler framework. +_luci-logex_ is a extended logging utility for the _luci_ compiler framework. From 81f3b25212cd0ef0163c12ef9fb4e1ea243aedb6 Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Wed, 30 Oct 2024 01:16:53 +0100 Subject: [PATCH 37/46] [moco] Fix a doc typo (#14277) This fixes a typo in README. 
ONE-DCO-1.0-Signed-off-by: Piotr Fusik --- compiler/moco/support/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler/moco/support/README.md b/compiler/moco/support/README.md index 081f65d3906..7ccdebe448e 100644 --- a/compiler/moco/support/README.md +++ b/compiler/moco/support/README.md @@ -1,3 +1,3 @@ # support -_support_ privides _moco_ support libraries +_support_ provides _moco_ support libraries From 8fa02392c06e6f37eb7a60eb351f71422139d6dc Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Wed, 30 Oct 2024 01:17:39 +0100 Subject: [PATCH 38/46] [moco-tf] Minor doc fix (#14279) This fixes grammar in README. ONE-DCO-1.0-Signed-off-by: Piotr Fusik --- compiler/moco-tf/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler/moco-tf/README.md b/compiler/moco-tf/README.md index add1159e1eb..3dccd014a1f 100644 --- a/compiler/moco-tf/README.md +++ b/compiler/moco-tf/README.md @@ -4,7 +4,7 @@ _moco-tf_ translates a TensorFlow model into _loco_ ## Purpose -_moco-tf_ is to convert TensorFlow generated model file to in-memory _loco_ IR Graph. +_moco-tf_ converts a TensorFlow generated model file to in-memory _loco_ IR Graph. ## How to use @@ -22,7 +22,7 @@ _moco-tf_ is to convert TensorFlow generated model file to in-memory _loco_ IR G ## Dependency -Please refer [requires.cmake](./requires.cmake) for dependant modules. +Please refer to [requires.cmake](./requires.cmake) for dependant modules. ## Naming rules From ced5e4b19c0dfab084a2b1acd56dc5816932ec1b Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Wed, 30 Oct 2024 01:18:09 +0100 Subject: [PATCH 39/46] [plier-tf] Minor doc fix (#14282) This fixes grammar in README. 
ONE-DCO-1.0-Signed-off-by: Piotr Fusik --- compiler/plier-tf/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler/plier-tf/README.md b/compiler/plier-tf/README.md index b7c1d6116f8..cb463f74764 100644 --- a/compiler/plier-tf/README.md +++ b/compiler/plier-tf/README.md @@ -1,3 +1,3 @@ # plier-tf -_plier-tf_ is a collection of small tools to handle TensorFlow model. +_plier-tf_ is a collection of small tools to handle TensorFlow models. From f8185d4500435013f953f03e648a6fe95ba82ce9 Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Wed, 30 Oct 2024 01:18:34 +0100 Subject: [PATCH 40/46] [onnx2circle] Minor doc fix (#14281) This fixes the "an" article in README. ONE-DCO-1.0-Signed-off-by: Piotr Fusik --- compiler/onnx2circle/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler/onnx2circle/README.md b/compiler/onnx2circle/README.md index 55b73870ed1..98e4e8153cb 100644 --- a/compiler/onnx2circle/README.md +++ b/compiler/onnx2circle/README.md @@ -1,3 +1,3 @@ # onnx2circle -_onnx2circle_ is a ONNX-to-Circle model converter. +_onnx2circle_ is an ONNX-to-Circle model converter. From f5ddd42e238adb4d166f7fb0df7d888c03e5e754 Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Wed, 30 Oct 2024 11:04:04 +0100 Subject: [PATCH 41/46] [locoex-customop] Improve README (#14273) This corrects grammar and formatting in README. ONE-DCO-1.0-Signed-off-by: Piotr Fusik --- compiler/locoex-customop/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/compiler/locoex-customop/README.md b/compiler/locoex-customop/README.md index 3f71140f964..2838e146b14 100644 --- a/compiler/locoex-customop/README.md +++ b/compiler/locoex-customop/README.md @@ -1,9 +1,9 @@ # locoex -_locoex_ is an extention of loco. Classes with `COp` prefix enables *Custom Operation*. +_locoex_ is an extension of loco. Classes with the `COp` prefix enable *Custom Operation*. 
In this version, a *custom operation* means one of the following: -1. an op that is supported by Tensorflow but not supported both by the moco and the onert -1. an op that is not supported by Tensorflow, moco, and the onert +1. an op that is supported by Tensorflow but not by moco and onert +2. an op that is not supported by Tensorflow, moco or onert -`COpCall` node will represent IR entity that calls custom operations and kernels. +`COpCall` node will represent an IR entity that calls custom operations and kernels. From 4f61fcb62ffaae4ea7c73621a7b952ec2addb398 Mon Sep 17 00:00:00 2001 From: seockho-kim Date: Thu, 31 Oct 2024 14:58:57 +0900 Subject: [PATCH 42/46] [docs/howto] Fix broken link in 'how to use nnfw api' (#14289) This fixes a broken link in the how-to-use-nnfw-api.md file. ONE-DCO-1.0-Signed-off-by: Seockho Kim seockho.kim@samsung.com --- docs/howto/how-to-use-nnfw-api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/howto/how-to-use-nnfw-api.md b/docs/howto/how-to-use-nnfw-api.md index 1f7203999bd..1712e25a5fc 100644 --- a/docs/howto/how-to-use-nnfw-api.md +++ b/docs/howto/how-to-use-nnfw-api.md @@ -10,7 +10,7 @@ Please see [model2nnpkg](https://github.com/Samsung/ONE/tree/master/tools/nnpack ## Build app with NNFW API -Here are basic steps to build app with [NNFW C API](https://github.com/Samsung/ONE/blob/master/runtime/onert/api/include/nnfw.h) +Here are basic steps to build app with [NNFW C API](https://github.com/Samsung/ONE/blob/master/runtime/onert/api/nnfw/include/nnfw.h) 1) Initialize nnfw_session ``` c From cf45dcca8ef823719ac38abb9b7759e3f5e04384 Mon Sep 17 00:00:00 2001 From: seockho-kim Date: Fri, 1 Nov 2024 09:15:53 +0900 Subject: [PATCH 43/46] [onert/backend] Fix checking indices tensor shape in GatherLayer (#14284) This fixes checking indices tensor shape in GatherLayer. If dimension count is 4 and dim(0) != 1, it should throw an exception, but it didn't.
ONE-DCO-1.0-Signed-off-by: Seockho Kim seockho.kim@samsung.com --- runtime/onert/backend/cpu/ops/GatherLayer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/onert/backend/cpu/ops/GatherLayer.cc b/runtime/onert/backend/cpu/ops/GatherLayer.cc index d445c721d66..2b23546488c 100644 --- a/runtime/onert/backend/cpu/ops/GatherLayer.cc +++ b/runtime/onert/backend/cpu/ops/GatherLayer.cc @@ -83,7 +83,7 @@ void GatherLayer::runByGGMLQuantInputType() if (getShape(_input).DimensionsCount() != 2) throw std::runtime_error("Gather: block quantized input tensor must be rank 2"); - if (getShape(_indices).DimensionsCount() > 4 && + if (getShape(_indices).DimensionsCount() >= 4 && (getShape(_indices).DimensionsCount() != 4 || getShape(_indices).Dims(0) != 1)) throw std::runtime_error("Gather: invalid indices tensor shape"); From ccf6510ae96d131ed6d8a7fb5bcbdd4ce0e12950 Mon Sep 17 00:00:00 2001 From: BLEE <61487178+BLee-bot@users.noreply.github.com> Date: Fri, 1 Nov 2024 16:56:19 +0900 Subject: [PATCH 44/46] [record-hessian] Introduce HessianComputer (#14265) This commit introduces hessian computer and hessian vector.
ONE-DCO-1.0-Signed-off-by: Banseok Lee --- compiler/record-hessian/CMakeLists.txt | 36 +++ compiler/record-hessian/README.md | 3 + .../include/record-hessian/HessianComputer.h | 62 +++++ .../include/record-hessian/HessianVector.h | 60 +++++ compiler/record-hessian/requires.cmake | 3 + .../record-hessian/src/HessianComputer.cpp | 218 ++++++++++++++++++ .../src/HessianComputer.test.cpp | 108 +++++++++ 7 files changed, 490 insertions(+) create mode 100644 compiler/record-hessian/CMakeLists.txt create mode 100644 compiler/record-hessian/README.md create mode 100644 compiler/record-hessian/include/record-hessian/HessianComputer.h create mode 100644 compiler/record-hessian/include/record-hessian/HessianVector.h create mode 100644 compiler/record-hessian/requires.cmake create mode 100644 compiler/record-hessian/src/HessianComputer.cpp create mode 100644 compiler/record-hessian/src/HessianComputer.test.cpp diff --git a/compiler/record-hessian/CMakeLists.txt b/compiler/record-hessian/CMakeLists.txt new file mode 100644 index 00000000000..75281e6c8cb --- /dev/null +++ b/compiler/record-hessian/CMakeLists.txt @@ -0,0 +1,36 @@ +file(GLOB_RECURSE SOURCES "src/*.cpp") +file(GLOB_RECURSE TESTS "src/*.test.cpp") +list(REMOVE_ITEM SOURCES ${TESTS}) + +add_library(record-hessian STATIC ${SOURCES}) + +target_include_directories(record-hessian PUBLIC include) +target_include_directories(record-hessian PRIVATE src) + +target_link_libraries(record-hessian luci_import) +target_link_libraries(record-hessian luci_env) +target_link_libraries(record-hessian luci_export) +target_link_libraries(record-hessian luci_interpreter) +target_link_libraries(record-hessian luci_log) +target_link_libraries(record-hessian dio_hdf5) + +install(TARGETS record-hessian DESTINATION lib) +install(DIRECTORY include/ DESTINATION include + FILES_MATCHING PATTERN "*.h") + +if(NOT ENABLE_TEST) + return() +endif(NOT ENABLE_TEST) + +nnas_find_package(GTest REQUIRED) + +GTest_AddTest(record_hessian_tests ${TESTS}) 
+target_include_directories(record_hessian_tests PRIVATE include) +target_include_directories(record_hessian_tests PRIVATE src) +target_link_libraries(record_hessian_tests luci_lang) +target_link_libraries(record_hessian_tests luci_pass) +target_link_libraries(record_hessian_tests loco) +target_link_libraries(record_hessian_tests dio_hdf5) +target_link_libraries(record_hessian_tests nncc_coverage) +target_link_libraries(record_hessian_tests luci_interpreter) +target_link_libraries(record_hessian_tests record-hessian) diff --git a/compiler/record-hessian/README.md b/compiler/record-hessian/README.md new file mode 100644 index 00000000000..49e6b2d9365 --- /dev/null +++ b/compiler/record-hessian/README.md @@ -0,0 +1,3 @@ +# record-hessian + +_record-hessian_ calculates hessian metrix of activations for quantization. diff --git a/compiler/record-hessian/include/record-hessian/HessianComputer.h b/compiler/record-hessian/include/record-hessian/HessianComputer.h new file mode 100644 index 00000000000..fc3cdebcb93 --- /dev/null +++ b/compiler/record-hessian/include/record-hessian/HessianComputer.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __RECORD_HESSIAN_HESSIANCOMPUTER_H__ +#define __RECORD_HESSIAN_HESSIANCOMPUTER_H__ + +#include "record-hessian/HessianVector.h" + +#include +#include + +#include +#include +#include + +namespace record_hessian +{ +/** + * @brief Record approximated hessian matrix from + * GPTQ paper(https://arxiv.org/abs/2210.17323). + */ +using HessianMap = std::unordered_map>; +using HessianVectorMap = std::unordered_map; + +class HessianComputer +{ +public: + // Record min/max of node + void recordHessian(const luci::CircleNode *node, const luci_interpreter::Tensor *input_tensor); + + std::unique_ptr getMap(); + +private: + HessianVectorMap _hessian_map; + const luci_interpreter::Tensor *_input_tensor = nullptr; + + void recordHessianForConv2D(const luci::CircleNode *node); + + void recordHessianForFullyConnected(const luci::CircleNode *node); +}; + +void unfold(std::vector &buf, uint32_t input_n, uint32_t input_h, uint32_t input_w, + uint32_t input_c, uint32_t stride_h, uint32_t stride_w, uint32_t dilation_h, + uint32_t dilation_w, uint32_t kernel_oc, uint32_t kernel_h, uint32_t kernel_w, + uint32_t kernel_ic); + +} // namespace record_hessian + +#endif // __RECORD_HESSIAN_HESSIANCOMPUTER_H__ diff --git a/compiler/record-hessian/include/record-hessian/HessianVector.h b/compiler/record-hessian/include/record-hessian/HessianVector.h new file mode 100644 index 00000000000..400b477616f --- /dev/null +++ b/compiler/record-hessian/include/record-hessian/HessianVector.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __RECORD_HESSIAN_HESSIANVECTOR_H__ +#define __RECORD_HESSIAN_HESSIANVECTOR_H__ + +#include + +#include +#include + +namespace record_hessian +{ + +struct HessianVector +{ + std::vector hessian; + size_t count; + + HessianVector() : count(0) {} + + void update(const std::vector &new_hessian) + { + if (count == 0) + { + hessian.resize(new_hessian.size()); + } + else if (hessian.size() != new_hessian.size()) + { + hessian.resize(new_hessian.size()); + } + + size_t numel = new_hessian.size(); + float alpha = 1.f / static_cast(count + 1); + + for (size_t i = 0; i < numel; ++i) + { + hessian[i] = (hessian[i] * count + new_hessian[i]) * alpha; + } + + count++; + }; +}; + +} // namespace record_hessian + +#endif // __RECORD_HESSIAN_HESSIANVECTOR_H__ diff --git a/compiler/record-hessian/requires.cmake b/compiler/record-hessian/requires.cmake new file mode 100644 index 00000000000..bfba787368f --- /dev/null +++ b/compiler/record-hessian/requires.cmake @@ -0,0 +1,3 @@ +require("luci") +require("luci-interpreter") +require("dio-hdf5") diff --git a/compiler/record-hessian/src/HessianComputer.cpp b/compiler/record-hessian/src/HessianComputer.cpp new file mode 100644 index 00000000000..6ae36cf0797 --- /dev/null +++ b/compiler/record-hessian/src/HessianComputer.cpp @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "record-hessian/HessianComputer.h" + +#include + +namespace record_hessian +{ + +/** + * @brief unfold the vector with NHWC shape, inherently acting in an in-place manner. + * @note (N, H, W, C) -> (N, L, K_h * K_w * C). + * See details(https://pytorch.org/docs/stable/generated/torch.nn.Unfold.html). + */ +void unfold(std::vector &buf, uint32_t input_n, uint32_t input_h, uint32_t input_w, + uint32_t input_c, uint32_t stride_h, uint32_t stride_w, uint32_t dilation_h, + uint32_t dilation_w, uint32_t kernel_oc, uint32_t kernel_h, uint32_t kernel_w, + uint32_t kernel_ic) +{ + assert(input_n > 0 && input_h > 0 && input_w > 0 && input_c > 0); + assert(stride_h > 0 && stride_w > 0); + assert(kernel_oc > 0 && kernel_h > 0 && kernel_w > 0 && kernel_ic > 0); + + if (input_c != kernel_ic) + throw std::runtime_error("RecordHessian: Input channels do not match kernel channels."); + int out_height = (input_h - dilation_h * (kernel_h - 1) - 1) / stride_h + 1; + int out_width = (input_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1; + int patch_size = kernel_h * kernel_w * kernel_ic; + std::vector unfolded_buf(input_n * out_height * out_width * patch_size, 0.0f); + + int index = 0; + int in_y, in_x; + for (int n = 0; n < input_n; ++n) + { + for (int y = 0; y < out_height; ++y) + { + for (int x = 0; x < out_width; ++x) + { + for (int in_c = 0; in_c < input_c; ++in_c) + { + for (int ky = 0; ky < kernel_h; ++ky) + { + for (int kx = 0; kx < kernel_w; ++kx) + { + in_y = y * stride_h + ky * dilation_h; + in_x = x * stride_w + kx * dilation_w; + if 
(in_y < input_h && in_x < input_w) + { + unfolded_buf[index] = buf[((n * input_h + in_y) * input_w + in_x) * input_c + in_c]; + } + index++; + } + } + } + } + } + } + + buf.swap(unfolded_buf); +} + +void HessianComputer::recordHessianForFullyConnected(const luci::CircleNode *node) +{ + assert(_input_tensor->shape().num_dims() < 4); + assert(_input_tensor->element_type() == luci_interpreter::DataType::FLOAT32); + + uint32_t size_in_ch; + uint32_t length; + + const auto data = _input_tensor->data(); + const auto num_elements = _input_tensor->shape().num_elements(); + std::vector buf(data, data + num_elements); + + if (_input_tensor->shape().num_dims() == 3) + { + size_in_ch = _input_tensor->shape().dim(2); // input_tensor [batch, length, channel] + } + else if (_input_tensor->shape().num_dims() == 2) + { + size_in_ch = _input_tensor->shape().dim(1); // input_tensor [length, channel] + } + else + { + throw std::runtime_error("RecordHessian: Unsupported node rank"); + } + assert(size_in_ch != 0); + length = num_elements / size_in_ch; + + std::vector hessian(size_in_ch * size_in_ch, 0); + + for (int i = 0; i < size_in_ch; ++i) + { + for (int j = 0; j < size_in_ch; ++j) + { + float sum = 0; + for (int k = 0; k < length; ++k) + { + sum += buf[i + k * size_in_ch] * buf[j + k * size_in_ch]; + } + hessian[i * size_in_ch + j] = 2 * sum; + } + } + + HessianVector &vector = _hessian_map[node]; + vector.update(hessian); +} + +void HessianComputer::recordHessianForConv2D(const luci::CircleNode *node) +{ + assert(_input_tensor->shape().num_dims() == 4); + assert(_input_tensor->element_type() == luci_interpreter::DataType::FLOAT32); + + const auto circle_conv2d = loco::must_cast(node); + const auto node_filter = loco::must_cast((circle_conv2d)->filter()); + assert(circle_conv2d->rank() >= 4); + assert(node_filter->dtype() == loco::DataType::FLOAT32); + assert(node_filter->rank() == 4); + + uint32_t size_filter = node_filter->size(); + uint32_t size_in_ch = + node_filter->size() / 
circle_conv2d->dim(3).value(); + + uint32_t input_n = _input_tensor->shape().dim(0); + uint32_t input_h = _input_tensor->shape().dim(1); + uint32_t input_w = _input_tensor->shape().dim(2); + uint32_t input_c = _input_tensor->shape().dim(3); + + uint32_t stride_h = circle_conv2d->stride()->h(); + uint32_t stride_w = circle_conv2d->stride()->w(); + uint32_t dilation_h = circle_conv2d->dilation()->h(); + uint32_t dilation_w = circle_conv2d->dilation()->w(); + + uint32_t kernel_oc = node_filter->dim(0).value(); + uint32_t kernel_h = node_filter->dim(1).value(); + uint32_t kernel_w = node_filter->dim(2).value(); + uint32_t kernel_ic = node_filter->dim(3).value(); + + const auto data = _input_tensor->data(); + const auto num_elements = _input_tensor->shape().num_elements(); + assert(data != 0); + assert(num_elements != 0); + std::vector buf(data, data + num_elements); + + unfold(buf, input_n, input_h, input_w, input_c, stride_h, stride_w, dilation_h, dilation_w, + kernel_oc, kernel_h, kernel_w, kernel_ic); + assert(size_in_ch != 0); + uint32_t length = buf.size() / size_in_ch; + + std::vector hessian(size_in_ch * size_in_ch, 0); + for (int i = 0; i < size_in_ch; ++i) + { + for (int j = 0; j < size_in_ch; ++j) + { + float sum = 0; + for (int k = 0; k < length; ++k) + { + sum += buf[i + k * size_in_ch] * buf[j + k * size_in_ch]; + } + hessian[i * size_in_ch + j] = 2 * sum; + } + } + + HessianVector &vector = _hessian_map[node]; + vector.update(hessian); +} + +void HessianComputer::recordHessian(const luci::CircleNode *node, + const luci_interpreter::Tensor *input_tensor) +{ + if (node == nullptr || input_tensor == nullptr) + throw std::invalid_argument("RecordHessian: node or input_tensor is null."); + + if (input_tensor->element_type() != luci_interpreter::DataType::FLOAT32) + throw std::runtime_error("RecordHessian: Unsupported dtype: only FLOAT32 is supported."); + + _input_tensor = input_tensor; + + switch (node->opcode()) + { + case 
luci::CircleOpcode::FULLY_CONNECTED: + recordHessianForFullyConnected(node); + break; + case luci::CircleOpcode::CONV_2D: + recordHessianForConv2D(node); + break; + default: + throw std::runtime_error("RecordHessian: " + node->name() + " is unsupported op."); + } +} + +std::unique_ptr HessianComputer::getMap() +{ + auto hessian_map = std::make_unique(); + + for (auto item : _hessian_map) + { + auto &vec = (*hessian_map)[item.first]; + vec = item.second.hessian; + } + + return hessian_map; +} + +} // namespace record_hessian diff --git a/compiler/record-hessian/src/HessianComputer.test.cpp b/compiler/record-hessian/src/HessianComputer.test.cpp new file mode 100644 index 00000000000..d64ab99678d --- /dev/null +++ b/compiler/record-hessian/src/HessianComputer.test.cpp @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "record-hessian/HessianComputer.h" + +#include +#include + +#include + +#include + +using namespace record_hessian; + +TEST(HessianComputerTest, recordHessianValidInput) +{ + luci::CircleFullyConnected node; + + std::vector input_data = {1.0, 2.0, 3.0, 4.0}; + + luci_interpreter::DataType data_type = luci_interpreter::DataType::FLOAT32; + luci_interpreter::Shape shape({1, 4}); + luci_interpreter::AffineQuantization quantization; + + std::string tensor_name = "input_tensor"; + + luci_interpreter::Tensor input_tensor(data_type, shape, quantization, tensor_name); + + size_t data_size = input_data.size() * sizeof(float); + std::vector buffer(data_size); + + input_tensor.set_data_buffer(buffer.data()); + input_tensor.writeData(input_data.data(), data_size); + + HessianComputer computer; + + EXPECT_NO_THROW(computer.recordHessian(&node, &input_tensor)); +} + +TEST(HessianComputerTest, recordHessian_wrong_op_NEG) +{ + luci::CircleAdd node; + + std::vector input_data = {1.0, 2.0, 3.0, 4.0}; + + luci_interpreter::DataType data_type = luci_interpreter::DataType::FLOAT32; + luci_interpreter::Shape shape({1, 2, 2, 1}); + luci_interpreter::AffineQuantization quantization; + + std::string tensor_name = "input_tensor"; + + luci_interpreter::Tensor input_tensor(data_type, shape, quantization, tensor_name); + + size_t data_size = input_data.size() * sizeof(float); + std::vector buffer(data_size); + + input_tensor.set_data_buffer(buffer.data()); + input_tensor.writeData(input_data.data(), data_size); + + HessianComputer computer; + + EXPECT_ANY_THROW(computer.recordHessian(&node, &input_tensor)); +} + +TEST(HessianComputerTest, recordHessianNullTensor_NEG) +{ + luci::CircleAdd node; + HessianComputer computer; + EXPECT_ANY_THROW(computer.recordHessian(&node, nullptr)); +} + +TEST(HessianComputerTest, unfoldValidInput) +{ + std::vector buf = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; + uint32_t input_n = 1, input_h = 2, input_w = 2, input_c = 2; + uint32_t stride_h = 
1, stride_w = 1, dilation_h = 1, dilation_w = 1; + uint32_t kernel_oc = 1, kernel_h = 2, kernel_w = 2, kernel_ic = 2; + + unfold(buf, input_n, input_h, input_w, input_c, stride_h, stride_w, dilation_h, dilation_w, + kernel_oc, kernel_h, kernel_w, kernel_ic); + std::vector expected_output = {1.0, 3.0, 5.0, 7.0, 2.0, 4.0, 6.0, 8.0}; + + EXPECT_EQ(buf, expected_output); +} + +TEST(HessianComputerTest, unfoldInvalidInput_NEG) +{ + std::vector buf = {1.0, 2.0, 3.0, 4.0}; + uint32_t input_n = 1, input_h = 2, input_w = 2, input_c = 1; + uint32_t stride_h = 1, stride_w = 1, dilation_h = 1, dilation_w = 1; + uint32_t kernel_oc = 1, kernel_h = 2, kernel_w = 2, kernel_ic = 2; + + EXPECT_ANY_THROW(unfold(buf, input_n, input_h, input_w, input_c, stride_h, stride_w, dilation_h, + dilation_w, kernel_oc, kernel_h, kernel_w, kernel_ic)); +} From 8c470d82b943f77cf0c7fe8ce173d7720b74f61d Mon Sep 17 00:00:00 2001 From: Sanggyu Lee Date: Mon, 4 Nov 2024 09:16:33 +0900 Subject: [PATCH 45/46] [onert] Fix multimodel connection in executors (#14286) It fixes the code to read the output, not the input, when interpreting `from`. 
ONE-DCO-1.0-Signed-off-by: Sanggyu Lee --- runtime/onert/core/src/exec/MultiModelExecutors.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runtime/onert/core/src/exec/MultiModelExecutors.cc b/runtime/onert/core/src/exec/MultiModelExecutors.cc index 011d500d6c3..2c5a074d623 100644 --- a/runtime/onert/core/src/exec/MultiModelExecutors.cc +++ b/runtime/onert/core/src/exec/MultiModelExecutors.cc @@ -174,8 +174,8 @@ void MultiModelExecutors::createEdgeQuantLayers() const auto &from_io_index = std::get(from_iodesc); const auto from_executor = _executors.at({from_model_index, from_subg_index}).get(); - const auto &from_info = from_executor->inputInfo(from_io_index.value()); - const auto from_layout = from_executor->inputLayout(from_io_index.value()); + const auto &from_info = from_executor->outputInfo(from_io_index.value()); + const auto from_layout = from_executor->outputLayout(from_io_index.value()); _edge_tensors[from_iodesc] = std::make_unique(from_info, from_layout); } From 85bcba8e564dc6f5bd8b12aaa128f1acfdd1cc8f Mon Sep 17 00:00:00 2001 From: BLEE <61487178+BLee-bot@users.noreply.github.com> Date: Mon, 4 Nov 2024 18:15:13 +0900 Subject: [PATCH 46/46] [record-hessian] Introduce HessianObserver. (#14292) This commit introduces the HessianObserver. 
ONE-DCO-1.0-Signed-off-by: Banseok Lee --- .../include/record-hessian/HessianObserver.h | 46 +++++++++++++++++++ .../record-hessian/src/HessianObserver.cpp | 45 ++++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 compiler/record-hessian/include/record-hessian/HessianObserver.h create mode 100644 compiler/record-hessian/src/HessianObserver.cpp diff --git a/compiler/record-hessian/include/record-hessian/HessianObserver.h b/compiler/record-hessian/include/record-hessian/HessianObserver.h new file mode 100644 index 00000000000..283d9e2a377 --- /dev/null +++ b/compiler/record-hessian/include/record-hessian/HessianObserver.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __RECORD_HESSIAN_HESSIANOBSERVER_H__ +#define __RECORD_HESSIAN_HESSIANOBSERVER_H__ + +#include "record-hessian/HessianComputer.h" + +#include +#include + +#include + +namespace record_hessian +{ + +class HessianObserver : public luci_interpreter::ExecutionObserver +{ +public: + HessianObserver() = default; + + void postTensorWrite(const luci::CircleNode *node, + const luci_interpreter::Tensor *tensor) override; + + std::unique_ptr hessianData() { return _hessian_computer.getMap(); } + +private: + HessianComputer _hessian_computer; +}; + +} // namespace record_hessian + +#endif // __RECORD_HESSIAN_HESSIANOBSERVER_H__ diff --git a/compiler/record-hessian/src/HessianObserver.cpp b/compiler/record-hessian/src/HessianObserver.cpp new file mode 100644 index 00000000000..aef981b0d51 --- /dev/null +++ b/compiler/record-hessian/src/HessianObserver.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "record-hessian/HessianObserver.h" + +namespace record_hessian +{ + +void HessianObserver::postTensorWrite(const luci::CircleNode *node, + const luci_interpreter::Tensor *tensor) +{ + assert(node != nullptr); + assert(tensor != nullptr); + + auto node_outputs = loco::succs(node); + for (auto node_output : node_outputs) + { + auto cur_node = dynamic_cast(node_output); + if (cur_node == nullptr) + { + throw std::runtime_error("Record Hessian: node output shouldn't be null."); + } + // TODO : ADD TCONV/DepthCONV cases + if (cur_node->opcode() == luci::CircleOpcode::FULLY_CONNECTED || + cur_node->opcode() == luci::CircleOpcode::CONV_2D) + { + _hessian_computer.recordHessian(cur_node, tensor); + } + } +} + +} // namespace record_hessian