[onert] Remove frontend layout usage in ACL backend
This commit removes frontend layout checking and permutation in the ACL backend; the backend kernels now assume NHWC tensor ordering instead of consulting the frontend graph layout.

ONE-DCO-1.0-Signed-off-by: Hyeongseok Oh <[email protected]>
hseok-oh committed Jul 29, 2024
1 parent 237bf23 commit 6fed2dd
Showing 18 changed files with 101 additions and 324 deletions.
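For context on the calls touched below: ARM Compute Library indexes tensor dimensions from fastest-varying to slowest, so a frontend axis is mirrored around the tensor rank before being handed to an ACL kernel. With the frontend layout argument gone, the conversion can assume NHWC ordering throughout. A minimal sketch of that convention follows; the helper name and body are illustrative assumptions, not the actual acl_common::ToARMComputeAxis implementation.

#include <cassert>
#include <cstdint>

// Illustrative only: mirror a frontend (NHWC-ordered) axis into ARM Compute's
// reversed dimension order. The real acl_common::ToARMComputeAxis may differ.
inline uint32_t toAclAxisNhwc(uint32_t rank, int32_t axis)
{
  if (axis < 0)
    axis += static_cast<int32_t>(rank); // normalize negative axes, as the kernel generators do
  assert(axis >= 0 && static_cast<uint32_t>(axis) < rank);
  return rank - static_cast<uint32_t>(axis) - 1; // e.g. rank 4, axis 1 (H in NHWC) -> ACL axis 2
}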
6 changes: 2 additions & 4 deletions runtime/onert/backend/acl_cl/ConstantInitializer.cc
@@ -96,12 +96,10 @@ void ConstantInitializer::visit(const ir::operation::Reverse &node)
const auto &axis_obj = _operands.at(axis_index);

const auto ifm_rank = input_obj.shape().rank();
- const auto frontend_layout = this->_current_layout;

if (axis_obj.isConstant())
{
- _init_map[axis_index] = [ifm_rank, frontend_layout](const ir::Operand &operand,
-                                                      backend::ITensor &obj) {
+ _init_map[axis_index] = [ifm_rank](const ir::Operand &operand, backend::ITensor &obj) {
assert(operand.data());

const auto axis_value = *(reinterpret_cast<const int32_t *>(operand.data()->base()));
@@ -111,7 +109,7 @@ void ConstantInitializer::visit(const ir::operation::Reverse &node)
axis_tmp = axis_tmp + ifm_rank;
}

- auto axis = acl_common::ToARMComputeAxis(ifm_rank, axis_tmp, frontend_layout).value();
+ auto axis = acl_common::ToARMComputeAxis(ifm_rank, axis_tmp).value();

obj.access([&](ITensor &tensor) {
int32_t *into = reinterpret_cast<int32_t *>(tensor.buffer());
86 changes: 26 additions & 60 deletions runtime/onert/backend/acl_cl/KernelGenerator.cc
@@ -48,7 +48,7 @@ KernelGenerator::KernelGenerator(
const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder,
const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
: basic::KernelGeneratorBase{graph}, _ctx(graph.operands()), _operations_ctx(graph.operations()),
- _current_layout{graph.layout()}, _tensor_builder(tensor_builder), _tensor_reg(tensor_reg)
+ _tensor_builder(tensor_builder), _tensor_reg(tensor_reg)
{
// DO NOTHING
}
@@ -164,8 +164,8 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};

- const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
- const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
+ const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(ir::Layout::NHWC);
+ const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(ir::Layout::NHWC);
// Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
const auto &ker_shape = _ctx.at(ker_index).shape();
const auto ker_height = ker_shape.dim(1);
@@ -201,8 +201,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};

- const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
- const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
+ const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(ir::Layout::NHWC);
+ const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(ir::Layout::NHWC);
// Kernel format is [1, kernel_height, kernel_width, depth_out].
const auto &ker_shape = _ctx.at(ker_index).shape();
const auto ker_height = ker_shape.dim(1);
@@ -269,8 +269,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
else
{
const auto rank = _ctx.at(ofm_index).shape().rank();
- const auto frontend_layout = _current_layout;
- const auto fixed_axis = acl_common::ToARMComputeAxis(rank, axis, frontend_layout).value();
+ const auto fixed_axis = acl_common::ToARMComputeAxis(rank, axis).value();
fn = acl_common::generateLayer<::arm_compute::CLConcatenateLayer>(
input_tensors, output_tensor->handle(), fixed_axis);
}
@@ -289,7 +288,7 @@ void KernelGenerator::visit(const ir::operation::FullyConnected &node)

auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ICLTensor,
::arm_compute::CLFullyConnectedReshapingLayer>(
- node, _ctx, _tensor_builder, _tensor_reg, _current_layout);
+ node, _ctx, _tensor_builder, _tensor_reg);
_return_fn = std::make_unique<exec::FunctionSequence>(
std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
}
@@ -308,18 +307,17 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
// Convert to ACL axes taking into account negative values and possible duplicates.
const auto &axes = _ctx.at(axes_index);
const auto input_rank = _ctx.at(input_index).shape().rank();
- const auto frontend_layout = _current_layout;

std::unique_ptr<arm_compute::IFunction> fn;
if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
{
- const auto acl_axes = acl_common::asCoordinates(axes, input_rank, frontend_layout);
+ const auto acl_axes = acl_common::asCoordinates(axes, input_rank);
fn = acl_common::generateLayer<arm_compute::CLReduceMean>(input_tensor->handle(), acl_axes,
keep_dims, output_tensor->handle());
}
else
{
- const auto acl_axes = acl_common::asSet(axes, input_rank, frontend_layout);
+ const auto acl_axes = acl_common::asSet(axes, input_rank);

fn = acl_common::generateLayer<arm_compute::CLReduceOperation>(
_tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
@@ -337,11 +335,6 @@ void KernelGenerator::visit(const ir::operation::Reshape &node)
auto output_tensor = _tensor_reg->getAclTensor(output_index);
auto input_tensor = _tensor_reg->getAclTensor(input_index);

- // NOTE This operation must not be changed the layout from frontend to backend
- // So, PermutationOperationPass makes layouts of frontend and backend the same.
- assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
-        _current_layout == ir::Layout::NHWC);
-
auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
output_tensor->handle());

@@ -394,7 +387,6 @@ void KernelGenerator::visit(const ir::operation::Slice &node)

auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
- const auto frontend_layout = _current_layout;

// Set initializers for indices data such as order of inputData
int input_rank = _ctx.at(input_index).shape().rank();
@@ -423,8 +415,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
assert(beginData_base != nullptr);
for (int n = 0; n < input_rank; ++n)
{
- auto axis =
-   ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout).value();
+ auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n).value();

int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
starts[axis] = begin_value;
@@ -459,7 +450,6 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)

auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
- const auto frontend_layout = _current_layout;

// Set initializers for indices data such as order of inputData
int input_rank = _ctx.at(input_index).shape().rank();
@@ -496,8 +486,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
assert(startData_base != nullptr);
for (int n = 0; n < input_rank; ++n)
{
- auto axis =
-   ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout).value();
+ auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n).value();

int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
starts[axis] = start_value;
@@ -511,12 +500,10 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
}

// Set mask bits such as order of inputData
- const auto begin_mask =
-   acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank, frontend_layout);
- const auto end_mask =
-   acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank, frontend_layout);
+ const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank);
+ const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank);
const auto shrink_axis_mask =
-   acl_common::ReorderBits<int32_t>(node.param().shrink_axis_mask, input_rank, frontend_layout);
+   acl_common::ReorderBits<int32_t>(node.param().shrink_axis_mask, input_rank);

::arm_compute::Coordinates starts_set;
::arm_compute::Coordinates ends_set;
@@ -559,7 +546,6 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)

auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
- const auto frontend_layout = _current_layout;

const auto &perms = _ctx.at(perm_idx);
std::vector<int32_t> pv;
@@ -587,7 +573,7 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
}
else
{
- auto backend_pv = acl_common::getARMComputePermutationVector(rank, pv, frontend_layout);
+ auto backend_pv = acl_common::getARMComputePermutationVector(rank, pv);

fn = acl_common::generateLayer<arm_compute::CLPermute>(ifm_tensor->handle(),
ofm_tensor->handle(), backend_pv);
@@ -836,9 +822,8 @@ void KernelGenerator::visit(const ir::operation::OneHot &node)
auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx);

const size_t output_rank = _ctx.at(output_idx).shape().rank();
- const auto frontend_layout = _current_layout;
int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis;
- axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout).value();
+ axis = acl_common::ToARMComputeAxis(output_rank, axis).value();

if (output_tensor->num_dimensions() != output_tensor->info()->num_dimensions())
{
@@ -886,11 +871,9 @@ void KernelGenerator::visit(const ir::operation::Pack &node)
for (const auto &input_index : input_indexes)
inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle());

- const auto frontend_layout = _current_layout;
-
if (axis < 0)
axis += output_rank;
- axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout).value();
+ axis = acl_common::ToARMComputeAxis(output_rank, axis).value();

// Disable applied dim_correction
for (const auto &input_index : input_indexes)
@@ -921,7 +904,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node)
void KernelGenerator::visit(const ir::operation::Pool2D &node)
{
auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::CLPoolingLayer>(
- node, _ctx, _tensor_reg, _current_layout, acl_common::convertPoolType(node.param().op_type));
+ node, _ctx, _tensor_reg, acl_common::convertPoolType(node.param().op_type));

const auto ofm_index{node.getOutputs().at(0)};
auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
@@ -1168,9 +1151,9 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node)
const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};

- const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
- const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
- const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_layout);
+ const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(ir::Layout::NHWC);
+ const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(ir::Layout::NHWC);
+ const auto ker_shape = _ctx.at(ker_index).shape().asFeature(ir::Layout::NHWC);

const auto stride = node.param().stride;

@@ -1257,16 +1240,6 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
auto indices_tensor = _tensor_reg->getAclTensor(indices_index);

- // NOTE The frontend layout and backend layout must be the same for this operation.
- // If not the same, we have to add a stage(?) to perform permutation of output tensor. It
- // is not not efficient even if it works well. If so, it would be better to set the
- // layout of these backend tensors to the same layout.
- // There is also one thing we have to think about. This operation depends on the layout of
- // a model. For example, if a model in NHWC has this operation as output rank == 4, indices
- // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W
- // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case.
- assert(ifm_rank < 4 || _current_layout == ir::Layout::NHWC);
-
// input is n-D, indices k-D, output is (n + k - 1)-D
size_t n = ifm_rank;
assert(n == ifm_tensor->num_dimensions());
@@ -1315,15 +1288,14 @@ void KernelGenerator::visit(const ir::operation::ArgMinMax &node)
auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
- auto frontend_layout = _current_layout;

int axis_value = _ctx.at(axis_index).asScalar<int32_t>();
if (axis_value < 0)
{
axis_value += ifm_rank;
}

- auto acl_axis = acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout).value();
+ auto acl_axis = acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
auto reduce_type = node.param().is_arg_max ? ::arm_compute::ReductionOperation::ARG_IDX_MAX
: ::arm_compute::ReductionOperation::ARG_IDX_MIN;
auto fn = acl_common::generateLayer<arm_compute::CLArgMinMaxLayerEx>(
@@ -1393,11 +1365,10 @@ void KernelGenerator::visit(const ir::operation::Split &node)
for (const auto &ofm_ind : output_indexes)
output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());

- const auto frontend_layout = _current_layout;
auto axis = _ctx.at(axis_index).asScalar<int32_t>();
if (axis < 0)
axis += ifm_rank;
- axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout).value();
+ axis = acl_common::ToARMComputeAxis(ifm_rank, axis).value();

auto fn =
acl_common::generateLayer<arm_compute::CLSplit>(ifm_tensor->handle(), output_tensors, axis);
@@ -1431,16 +1402,14 @@ void KernelGenerator::visit(const ir::operation::SplitV &node)
{
int32_t split_dim = split_dim_op.asScalar<int32_t>();
uint32_t split_dim_revised = (split_dim < 0) ? (split_dim + ifm_rank) : split_dim;
- const auto frontend_layout = _current_layout;

if (ifm_tensor->num_dimensions() != ifm_tensor->info()->num_dimensions())
{
// This means that high dimension's value is 1 and ifm tensor is applied dim_correction
acl_common::disableDimCorrection(ifm_tensor);
}

- split_dim_revised =
-   acl_common::ToARMComputeAxis(ifm_rank, split_dim_revised, frontend_layout).value();
+ split_dim_revised = acl_common::ToARMComputeAxis(ifm_rank, split_dim_revised).value();
fn->configure(ifm_tensor->handle(), size_split_tensor->handle(), split_dim_revised,
output_tensors, node.param().num_splits);

@@ -1473,10 +1442,9 @@ void KernelGenerator::visit(const ir::operation::Unpack &node)
for (const auto &output_index : output_indexes)
outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle());

- const auto frontend_layout = _current_layout;
if (axis < 0)
axis += input_rank;
- axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout).value();
+ axis = acl_common::ToARMComputeAxis(input_rank, axis).value();

// Disable applied dim_correction
if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
@@ -1515,15 +1483,13 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
auto input = _tensor_reg->getAclTensor(input_index)->handle();
auto output = _tensor_reg->getAclTensor(output_index)->handle();

- const auto frontend_layout = _current_layout;
-
::arm_compute::PaddingList padding_list;
padding_list.resize(rank);
for (int32_t n = 0; n < rank; ++n)
{
const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);

- const auto axis = acl_common::ToARMComputeAxis(rank, n, frontend_layout).value();
+ const auto axis = acl_common::ToARMComputeAxis(rank, n).value();
padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
}

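The StridedSlice mask handling above relies on the same axis reversal: ReorderBits remaps each frontend mask bit to the ACL axis it now corresponds to. A rough sketch of that bit reordering under the NHWC-only assumption (illustrative; not the actual acl_common::ReorderBits template):

#include <cstdint>

// Illustrative only: move frontend mask bit n to ACL axis (rank - n - 1),
// mirroring the conversion applied to the starts/ends coordinates above.
inline int32_t reorderMaskBits(int32_t mask, uint32_t rank)
{
  int32_t out = 0;
  for (uint32_t n = 0; n < rank; ++n)
  {
    if (mask & (1 << n))
      out |= 1 << (rank - n - 1);
  }
  return out;
}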
1 change: 0 additions & 1 deletion runtime/onert/backend/acl_cl/KernelGenerator.h
@@ -90,7 +90,6 @@ class KernelGenerator : public basic::KernelGeneratorBase
private:
const ir::Operands &_ctx;
const ir::Operations &_operations_ctx;
- const ir::Layout _current_layout;
std::shared_ptr<TensorBuilder> _tensor_builder;
std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> _tensor_reg;
};
5 changes: 1 addition & 4 deletions runtime/onert/backend/acl_cl/Optimizer.cc
@@ -44,10 +44,7 @@ void Optimizer::optimize()
acl_common::AclSubTensorAnalyzer sa{*_context->graph()};
sa.setUsePadding();
_context->graph()->operations().iterate(
- [&](const ir::OperationIndex &, const ir::IOperation &op) {
-   sa.setLayout(_context->graph()->layout());
-   op.accept(sa);
- });
+ [&](const ir::OperationIndex &, const ir::IOperation &op) { op.accept(sa); });

_tensor_builder->parent_map(sa.releaseParentMap());
}
5 changes: 2 additions & 3 deletions runtime/onert/backend/acl_common/AclBackendContext.h
@@ -60,9 +60,8 @@ class AclBackendContext
if (this->external_operands().contains(ind))
return;

- const auto frontend_layout = this->graph()->layout();
- ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, ir::Layout::NHWC),
-                              obj.typeInfo(), obj.info().memAllocType(), obj.isConstant()};
+ ir::OperandInfo backend_info{obj.shape(), obj.typeInfo(), obj.info().memAllocType(),
+                              obj.isConstant()};
this->tensor_builder->registerTensorInfo(ind, backend_info);
});

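The AclBackendContext change above stops permuting operand shapes from the frontend layout into NHWC before registering tensor info; the shape is now registered as-is. For reference, a sketch of what the dropped NCHW-to-NHWC shape permutation amounted to (illustrative; not the actual ir::permuteShape implementation):

#include <array>
#include <cstdint>

// Illustrative only: reorder a 4-D NCHW shape into NHWC index order,
// e.g. {1, 3, 224, 224} -> {1, 224, 224, 3}.
inline std::array<int32_t, 4> nchwToNhwc(const std::array<int32_t, 4> &nchw)
{
  return {nchw[0], nchw[2], nchw[3], nchw[1]}; // N, H, W, C
}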