Merge remote-tracking branch 'upstream/master' into mbencer/PropagateSharedMemoryOperands
mbencer committed Nov 4, 2024
2 parents 25b1edb + 85bcba8 commit 61ac3a1
Showing 80 changed files with 4,007 additions and 535 deletions.
4 changes: 2 additions & 2 deletions compiler/angkor/README.md
@@ -2,11 +2,11 @@
 
 ## Purpose
 
-_angkor_ is a `nncc` core library
+_angkor_ is an `nncc` core library
 
 ## How to use
 
-_angkor_ implements abstract data type(ADT) for feature, kernel, tensor.
+_angkor_ implements abstract data type (ADT) for feature, kernel, tensor.
 There are layout, shape information and enumerator and so on.
 
 To use some of these things, just insert `include`!
8 changes: 4 additions & 4 deletions compiler/caffegen/README.md
@@ -5,16 +5,16 @@
 ## How caffegen works
 
 Some of commands in `caffegen` use standard input for reading data and standard output for exporting result.
-In this case, we strongly recommand you to use pipe, not copy & paste the content of file itself.
+In this case, we strongly recommend you to use pipe, not copy & paste the content of file itself.
 
 Otherwise, `caffegen` use arguments to pass some directories.
 
 ## Supported command
 
-Basically, caffgen command is used as `caffegen [COMMAND]` and there are four `COMMAND` types.
+Basically, caffgen command is used as `caffegen [COMMAND]` and there are four `COMMAND` types:
 - init : initialize parameters using prototxt.
-- encode : make a binary file(caffemodel) using initialized data
-- decode : decode a binary file(caffemodel) and reproduce the initialized data
+- encode : make a binary file (caffemodel) using initialized data
+- decode : decode a binary file (caffemodel) and reproduce the initialized data
 - merge : copy the trained weights from a caffemodel into a prototxt file
 
 ## How to use each command
2 changes: 1 addition & 1 deletion compiler/circle2circle/README.md
@@ -1,3 +1,3 @@
 # circle2circle
 
-_circle2circle_ provides Circle optimizations as executable tool
+_circle2circle_ provides Circle optimizations as an executable tool
9 changes: 7 additions & 2 deletions compiler/dalgona/src/PostOperatorHook.h
@@ -187,12 +187,17 @@ class PostOperatorHook final : public luci::CircleNodeVisitor<void>
 POST_OPERATOR_HOOK_PROLOGUE(FullyConnected)
 
 auto fused_act = node->fusedActivationFunction();
-
+py::dict bias;
+// bias is optional
+if (inputs.size() == 3)
+{
+  bias = inputs[2];
+}
 pySafeCall(hook,
            node->name(),       // name
            inputs[0],          // input
            inputs[1],          // weights
-           inputs[2],          // bias
+           bias,               // bias
            output,             // output
            toString(fused_act) // fused activation
 );
2 changes: 2 additions & 0 deletions compiler/dalgona/src/Utils.cpp
@@ -68,6 +68,8 @@ py::array numpyArray(const Tensor *tensor)
       return py::array_t<int64_t, py::array::c_style>(shape, tensor->data<int64_t>());
     case loco::DataType::U8:
       return py::array_t<uint8_t, py::array::c_style>(shape, tensor->data<uint8_t>());
+    case loco::DataType::BOOL:
+      return py::array_t<bool, py::array::c_style>(shape, tensor->data<bool>());
     default:
       throw std::runtime_error("Unsupported data type");
   }
4 changes: 2 additions & 2 deletions compiler/fme-apply/src/FMEqualizer.cpp
@@ -82,8 +82,8 @@ void FMEqualizer::equalize(loco::Graph *g, std::vector<EqualizePattern> &p)
   phase.emplace_back(std::make_unique<fme_apply::FusePreScalePass>());
   phase.emplace_back(std::make_unique<fme_apply::FusePostScalePass>());
 
-  ProgressReporter prog(g, logo::PhaseStrategy::Restart);
-  logo::PhaseRunner<logo::PhaseStrategy::Restart> phase_runner{g};
+  ProgressReporter prog(g, logo::PhaseStrategy::Saturate);
+  logo::PhaseRunner<logo::PhaseStrategy::Saturate> phase_runner{g};
   phase_runner.attach(&prog);
   phase_runner.run(phase);
 
8 changes: 5 additions & 3 deletions compiler/fme-apply/src/InsertScaleShift.cpp
@@ -125,7 +125,8 @@ bool calculate_smooth_quant_scale(luci::CircleNode *node, EqualizePattern *p)
       cur = i;
       for (uint32_t j = 0; j < norm_dim; j++)
       {
-        weight_max.at(i) = std::max(weight_max.at(i), weight->at<loco::DataType::FLOAT32>(cur));
+        weight_max.at(i) =
+          std::max(weight_max.at(i), std::abs(weight->at<loco::DataType::FLOAT32>(cur)));
         cur += weight_I;
       }
     }
@@ -166,7 +167,8 @@ bool calculate_smooth_quant_scale(luci::CircleNode *node, EqualizePattern *p)
       cur = i;
       for (uint32_t j = 0; j < weight_O; j++)
       {
-        weight_max.at(i) = std::max(weight_max.at(i), weight->at<loco::DataType::FLOAT32>(cur));
+        weight_max.at(i) =
+          std::max(weight_max.at(i), std::abs(weight->at<loco::DataType::FLOAT32>(cur)));
         cur += weight_I;
       }
     }
@@ -301,7 +303,7 @@ struct InsertScaleShiftVisitor final : public luci::CircleNodeMutableVisitor<voi
   auto valid = ::calculate_smooth_quant_scale(node, _pattern);
   auto back_node = node;
   // Find front node.
-  const auto support_depth = 2;
+  const auto support_depth = 3;
   auto front_node = find_arg_with_name(node, _pattern->front, support_depth);
   if (not front_node)
   {
46 changes: 29 additions & 17 deletions compiler/fme-apply/src/pass/FusePostScalePass.cpp
@@ -248,7 +248,7 @@ struct FusePostScale final : public luci::CircleNodeMutableVisitor<bool>
     auto param =
       loco::must_cast<luci::CircleConst *>(post_scale->inputs(1)); // FIX_PostScale_UNLESS
     auto filter = loco::must_cast<luci::CircleConst *>(node->weights());
-    auto bias = loco::must_cast<luci::CircleConst *>(node->bias());
+    luci::CircleConst *bias = dynamic_cast<luci::CircleConst *>(node->bias());
 
     uint32_t filter_o = filter->dim(0).value();
     uint32_t filter_i = filter->dim(1).value();
@@ -259,26 +259,34 @@ struct FusePostScale final : public luci::CircleNodeMutableVisitor<bool>
       throw std::runtime_error("Mismatch between scale size and filter output channel size: " +
                                std::to_string(filter_o) + " != " + std::to_string(param_size));
     }
-    const auto bias_size = bias->size<loco::DataType::FLOAT32>();
-    if (bias_size != param_size)
+    if (bias)
     {
-      throw std::runtime_error("Mismatch between scale size and bias size: " +
-                               std::to_string(bias_size) + " != " + std::to_string(param_size));
+      const auto bias_size = bias->size<loco::DataType::FLOAT32>();
+      if (bias_size != param_size)
+      {
+        throw std::runtime_error("Mismatch between scale size and bias size: " +
+                                 std::to_string(bias_size) + " != " + std::to_string(param_size));
+      }
     }
 
     auto cloned_fc = luci::clone_node(node, node->graph());
     assert(cloned_fc != nullptr); // FIX_CALLER_UNLESS
     auto fused_fc = loco::must_cast<luci::CircleFullyConnected *>(cloned_fc);
     auto fused_filter = luci::clone(filter);
-    auto fused_bias = luci::clone(bias);
 
     fused_fc->name(node->name() + "_fused_" + random_str());
     fused_filter->name(filter->name() + "_fused_" + random_str());
-    fused_bias->name(bias->name() + "_fused_" + random_str());
 
     add_origin(fused_fc, luci::get_origin(node));
     add_origin(fused_filter, luci::get_origin(filter));
-    add_origin(fused_bias, luci::get_origin(bias));
 
+    luci::CircleConst *fused_bias = nullptr;
+    if (bias)
+    {
+      fused_bias = luci::clone(bias);
+      fused_bias->name(bias->name() + "_fused_" + random_str());
+      add_origin(fused_bias, luci::get_origin(bias));
+    }
+
     // Multiply param to weights
     for (uint32_t o = 0; o < filter_o; o++)
@@ -294,17 +302,21 @@ struct FusePostScale final : public luci::CircleNodeMutableVisitor<bool>
       }
     }
 
-    // Multiply param to bias
-    for (uint32_t c = 0; c < filter_o; ++c)
-    {
-      float scale = param->at<loco::DataType::FLOAT32>(c);
-      fused_bias->at<loco::DataType::FLOAT32>(c) =
-        fused_bias->at<loco::DataType::FLOAT32>(c) * scale;
-    }
-
     fused_fc->input(node->input());
     fused_fc->weights(fused_filter);
-    fused_fc->bias(fused_bias);
+    fused_fc->bias(node->bias());
+
+    if (bias)
+    {
+      // Multiply param to bias
+      for (uint32_t c = 0; c < filter_o; ++c)
+      {
+        float scale = param->at<loco::DataType::FLOAT32>(c);
+        fused_bias->at<loco::DataType::FLOAT32>(c) =
+          fused_bias->at<loco::DataType::FLOAT32>(c) * scale;
+      }
+      fused_fc->bias(fused_bias);
+    }
 
     loco::replace(post_scale).with(fused_fc);
 
32 changes: 22 additions & 10 deletions compiler/fme-detect/src/EqualizePatternFinder.cpp
@@ -173,7 +173,7 @@ Forwardable forwardable(luci::CircleNode *node)
     case luci::CircleOpcode::LEAKY_RELU:
       return {true, false};
     case luci::CircleOpcode::GELU:
-      return {true, false};
+      return {false, false};
     default:
       return {false, false};
   }
@@ -186,8 +186,11 @@ void match(luci::CircleNode *front, std::vector<EqualizePattern> &res)
     throw std::invalid_argument("front");
 
   auto front_fusability = fusability(front);
-
-  for (auto succ : loco::succs(front))
+  auto succs = loco::succs(front);
+  // TODO Support multiple successors.
+  if (succs.size() != 1)
+    return;
+  for (auto succ : succs)
   {
     // Check succ fusability
     auto back = loco::must_cast<luci::CircleNode *>(succ);
@@ -201,15 +204,24 @@ void match(luci::CircleNode *front, std::vector<EqualizePattern> &res)
       auto f = forwardable(back);
       if (f.scale_forwardable)
       {
-        auto succ_succs = loco::succs(back);
+        auto back_succs = loco::succs(back);
         // Only support single successor for simplicity
-        if (succ_succs.size() != 1)
+        if (back_succs.size() != 1)
           continue;
-        auto next_succ = *succ_succs.begin();
-        auto next_back = loco::must_cast<luci::CircleNode *>(next_succ);
-        back_fusability = fusability(next_back);
-        back_fusability.pre_scale &= f.scale_forwardable;
-        back = next_back;
+        back = loco::must_cast<luci::CircleNode *>(*back_succs.begin());
+        back_fusability = fusability(back);
+        if (not back_fusability.pre_scale)
+        {
+          f = forwardable(back);
+          if (f.scale_forwardable)
+          {
+            back_succs = loco::succs(back);
+            if (back_succs.size() != 1)
+              continue;
+            back = loco::must_cast<luci::CircleNode *>(*back_succs.begin());
+            back_fusability = fusability(back);
+          }
+        }
       }
     }
 
2 changes: 1 addition & 1 deletion compiler/kuma/README.md
@@ -4,4 +4,4 @@ _kuma_ is a collection of offline memory allocators.
 
 ## What does "kuma" mean?
 
-_kuma_ originates from _cooma_ which is an abbreviation of **C**ollection **O**f **O**ffline **M**emory **A**lloators.
+_kuma_ originates from _cooma_ which is an abbreviation of **C**ollection **O**f **O**ffline **M**emory **A**llocators.
4 changes: 2 additions & 2 deletions compiler/loco/doc/LEP_000_Dialect_Service.md
@@ -64,7 +64,7 @@ struct GraphOutputIndexQueryService : public DialectService
 
 This proposal extends ``Dialect`` class with ``service`` method.
 
-Each dialect SHOULD return a valid pointer on ``service<Service>`` method call if it implements that service. Otherwise, it SHOULD return a null pointer otherwise.
+Each dialect SHOULD return a valid pointer on ``service<Service>`` method call if it implements that service. Otherwise, it SHOULD return a null pointer.
 
 **WARNING** It is impossible to use ``get``. ``get`` is currently reserved for singleton accessor.
 
@@ -106,7 +106,7 @@ std::vector<loco::Node *> output_nodes(loco::Graph *g)
 ### How to register a service
-Each dialect should invoke protected ``service`` method during its construction.
+Each dialect should invoke the protected ``service`` method during its construction.
 ```cxx
 AwesomeDialect::AwesomeDialect()
 {
8 changes: 4 additions & 4 deletions compiler/locoex-customop/README.md
@@ -1,9 +1,9 @@
 # locoex
 
-_locoex_ is an extention of loco. Classes with `COp` prefix enables *Custom Operation*.
+_locoex_ is an extension of loco. Classes with the `COp` prefix enable *Custom Operation*.
 In this version, a *custom operation* means one of the following:
 
-1. an op that is supported by Tensorflow but not supported both by the moco and the onert
-1. an op that is not supported by Tensorflow, moco, and the onert
+1. an op that is supported by Tensorflow but not by moco and onert
+2. an op that is not supported by Tensorflow, moco or onert
 
-`COpCall` node will represent IR entity that calls custom operations and kernels.
+`COpCall` node will represent an IR entity that calls custom operations and kernels.
24 changes: 12 additions & 12 deletions compiler/locomotiv/README.md
@@ -2,7 +2,7 @@
 _locomotiv_ is a reference interpreter for _loco_ IR.
 
 # Purpose
-- _locomotiv_ would serve as code level specification and reference implementation for loco IR.
+- _locomotiv_ would serve as code level specification and a reference implementation for loco IR.
 - _locomotiv_ is required for loco-related tools to be tested.
 
 # Sample code to use locomotiv library
@@ -60,31 +60,31 @@ case loco::DataType::FLOAT32:
 4. Test new node execution at `locomotiv/src/Node/TheNode.test.cpp` if possible.
 
 ### Note on internal data layout rule
-For each domain(see `loco::Domain`), `locomotiv` has fixed layout rule on how to store its data in memory.
+For each domain (see `loco::Domain`), `locomotiv` has fixed layout rule on how to store its data in memory.
 - Feature is represented as NHWC layout
-  - That is number of batch(N), height(H), width(W) and channel depth(C)
+  - That is number of batch (N), height (H), width (W) and channel depth (C)
 - Filter is represented as NHWC layout
-  - That is number of filter(N), height(H), width(W) and input channel depth(C)
+  - That is number of filter (N), height (H), width (W) and input channel depth (C)
 - DepthwiseFilter is represented as HWCM layout
-  - That is height(H), width(W), input channel depth(C) and depth multiplier(M)
+  - That is height (H), width (W), input channel depth (C) and depth multiplier (M)
 - Matrix is represented as HW layout
-  - That is height(H), width(W)
+  - That is height (H), width (W)
 
 ### Notes on step 3
 - Mocking Tensorflow lite `reference_op.h` might be a good place to start.
-- `execute()` can be called multiple time. It just recalculates and updates annotated data. So it should `erase_annot_data()` before newly `annot_data()`.
+- `execute()` can be called multiple times. It just recalculates and updates annotated data. So it should `erase_annot_data()` before newly `annot_data()`.
 - Most node execution behaviour would be implemented for each data type.
 - `execute()` should throw runtime error on invalid cases. Some of these cases are explained:
   - Invalid argument node
-    - e.g.) Pull -> MaxPool2D is invalid as MaxPool2D requires feature map as its argument.
+    - e.g. Pull -> MaxPool2D is invalid as MaxPool2D requires feature map as its argument.
   - Lack of argument data
-    - e.g.) Given 'Pull -> Push' graph. On execution of Push, if no NodeData annotated to Pull, it is invalid.
+    - e.g. Given 'Pull -> Push' graph. On execution of Push, if no NodeData annotated to Pull, it is invalid.
   - Mismatch of argument shapes
-    - e.g.) Addition between 2x2 and 3x3 tensor is invalid
-    - e.g.) MaxPool2D expects its ifm to be 4D feature, otherwise invalid.
+    - e.g. Addition between 2x2 and 3x3 tensor is invalid
+    - e.g. MaxPool2D expects its ifm to be 4D feature, otherwise invalid.
   - Mismatch between node's own information and inferred information
     - Some node already have attributes like shape or data type. If inferred information is different with existing node's, it is invalid.
 
 ### Recommendation on step 4 (test)
 - If the node has no arguments, create a node object and `NodeExecution::run()` on it. Check whether it operates correctly.
-- If the node has N(>= 1) arguments, make N pull node inputs, source them to the node to be tested. FeatureEncode or FilterEncode node may be required inbetween depending on the node's argument type. Then annotate N pull nodes with its data, `NodeExecution::run()` on the node to test, and check whether it operates correctly.
+- If the node has N (>= 1) arguments, make N pull node inputs, source them to the node to be tested. FeatureEncode or FilterEncode node may be required inbetween depending on the node's argument type. Then annotate N pull nodes with its data, `NodeExecution::run()` on the node to test, and check whether it operates correctly.
1 change: 1 addition & 0 deletions compiler/luci-interpreter/pal/linux/KernelsToBuild.lst
@@ -27,6 +27,7 @@ REGISTER_KERNEL(Gather)
 REGISTER_KERNEL(Gelu)
 REGISTER_KERNEL(Greater)
 REGISTER_KERNEL(GreaterEqual)
+REGISTER_KERNEL(GRU)
 REGISTER_KERNEL(HardSwish)
 REGISTER_KERNEL(If)
 REGISTER_KERNEL(InstanceNorm)