[onert] Remove frontend layout usage in ACL backend
This commit removes frontend layout checking and permutation in the ACL backend; the backend kernels now assume NHWC tensor ordering instead of consulting the frontend graph layout.

ONE-DCO-1.0-Signed-off-by: Hyeongseok Oh <[email protected]>
hseok-oh committed Jul 29, 2024
1 parent 237bf23 commit 6fed2dd
Showing 18 changed files with 101 additions and 324 deletions.
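For context on the calls touched below: ARM Compute Library indexes tensor dimensions from fastest-varying to slowest, so a frontend axis is mirrored around the tensor rank before being handed to an ACL kernel. With the frontend layout argument gone, the conversion can assume NHWC ordering throughout. A minimal sketch of that convention follows; the helper name and body are illustrative assumptions, not the actual acl_common::ToARMComputeAxis implementation.

#include <cassert>
#include <cstdint>

// Illustrative only: mirror a frontend (NHWC-ordered) axis into ARM Compute's
// reversed dimension order. The real acl_common::ToARMComputeAxis may differ.
inline uint32_t toAclAxisNhwc(uint32_t rank, int32_t axis)
{
  if (axis < 0)
    axis += static_cast<int32_t>(rank); // normalize negative axes, as the kernel generators do
  assert(axis >= 0 && static_cast<uint32_t>(axis) < rank);
  return rank - static_cast<uint32_t>(axis) - 1; // e.g. rank 4, axis 1 (H in NHWC) -> ACL axis 2
}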
6 changes: 2 additions & 4 deletions runtime/onert/backend/acl_cl/ConstantInitializer.cc
@@ -96,12 +96,10 @@ void ConstantInitializer::visit(const ir::operation::Reverse &node)
const auto &axis_obj = _operands.at(axis_index);

const auto ifm_rank = input_obj.shape().rank();
- const auto frontend_layout = this->_current_layout;

if (axis_obj.isConstant())
{
- _init_map[axis_index] = [ifm_rank, frontend_layout](const ir::Operand &operand,
-                                                      backend::ITensor &obj) {
+ _init_map[axis_index] = [ifm_rank](const ir::Operand &operand, backend::ITensor &obj) {
assert(operand.data());

const auto axis_value = *(reinterpret_cast<const int32_t *>(operand.data()->base()));
@@ -111,7 +109,7 @@ void ConstantInitializer::visit(const ir::operation::Reverse &node)
axis_tmp = axis_tmp + ifm_rank;
}

- auto axis = acl_common::ToARMComputeAxis(ifm_rank, axis_tmp, frontend_layout).value();
+ auto axis = acl_common::ToARMComputeAxis(ifm_rank, axis_tmp).value();

obj.access([&](ITensor &tensor) {
int32_t *into = reinterpret_cast<int32_t *>(tensor.buffer());
86 changes: 26 additions & 60 deletions runtime/onert/backend/acl_cl/KernelGenerator.cc
@@ -48,7 +48,7 @@ KernelGenerator::KernelGenerator(
const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder,
const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
: basic::KernelGeneratorBase{graph}, _ctx(graph.operands()), _operations_ctx(graph.operations()),
- _current_layout{graph.layout()}, _tensor_builder(tensor_builder), _tensor_reg(tensor_reg)
+ _tensor_builder(tensor_builder), _tensor_reg(tensor_reg)
{
// DO NOTHING
}
@@ -164,8 +164,8 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};

- const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
- const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
+ const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(ir::Layout::NHWC);
+ const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(ir::Layout::NHWC);
// Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
const auto &ker_shape = _ctx.at(ker_index).shape();
const auto ker_height = ker_shape.dim(1);
@@ -201,8 +201,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};

- const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
- const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
+ const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(ir::Layout::NHWC);
+ const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(ir::Layout::NHWC);
// Kernel format is [1, kernel_height, kernel_width, depth_out].
const auto &ker_shape = _ctx.at(ker_index).shape();
const auto ker_height = ker_shape.dim(1);
@@ -269,8 +269,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
else
{
const auto rank = _ctx.at(ofm_index).shape().rank();
- const auto frontend_layout = _current_layout;
- const auto fixed_axis = acl_common::ToARMComputeAxis(rank, axis, frontend_layout).value();
+ const auto fixed_axis = acl_common::ToARMComputeAxis(rank, axis).value();
fn = acl_common::generateLayer<::arm_compute::CLConcatenateLayer>(
input_tensors, output_tensor->handle(), fixed_axis);
}
@@ -289,7 +288,7 @@ void KernelGenerator::visit(const ir::operation::FullyConnected &node)

auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ICLTensor,
::arm_compute::CLFullyConnectedReshapingLayer>(
- node, _ctx, _tensor_builder, _tensor_reg, _current_layout);
+ node, _ctx, _tensor_builder, _tensor_reg);
_return_fn = std::make_unique<exec::FunctionSequence>(
std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
}
@@ -308,18 +307,17 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
// Convert to ACL axes taking into account negative values and possible duplicates.
const auto &axes = _ctx.at(axes_index);
const auto input_rank = _ctx.at(input_index).shape().rank();
- const auto frontend_layout = _current_layout;

std::unique_ptr<arm_compute::IFunction> fn;
if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
{
- const auto acl_axes = acl_common::asCoordinates(axes, input_rank, frontend_layout);
+ const auto acl_axes = acl_common::asCoordinates(axes, input_rank);
fn = acl_common::generateLayer<arm_compute::CLReduceMean>(input_tensor->handle(), acl_axes,
keep_dims, output_tensor->handle());
}
else
{
- const auto acl_axes = acl_common::asSet(axes, input_rank, frontend_layout);
+ const auto acl_axes = acl_common::asSet(axes, input_rank);

fn = acl_common::generateLayer<arm_compute::CLReduceOperation>(
_tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
@@ -337,11 +335,6 @@ void KernelGenerator::visit(const ir::operation::Reshape &node)
auto output_tensor = _tensor_reg->getAclTensor(output_index);
auto input_tensor = _tensor_reg->getAclTensor(input_index);

- // NOTE This operation must not be changed the layout from frontend to backend
- // So, PermutationOperationPass makes layouts of frontend and backend the same.
- assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
-        _current_layout == ir::Layout::NHWC);
-
auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
output_tensor->handle());

@@ -394,7 +387,6 @@ void KernelGenerator::visit(const ir::operation::Slice &node)

auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
- const auto frontend_layout = _current_layout;

// Set initializers for indices data such as order of inputData
int input_rank = _ctx.at(input_index).shape().rank();
@@ -423,8 +415,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
assert(beginData_base != nullptr);
for (int n = 0; n < input_rank; ++n)
{
- auto axis =
-   ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout).value();
+ auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n).value();

int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
starts[axis] = begin_value;
@@ -459,7 +450,6 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)

auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
- const auto frontend_layout = _current_layout;

// Set initializers for indices data such as order of inputData
int input_rank = _ctx.at(input_index).shape().rank();
@@ -496,8 +486,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
assert(startData_base != nullptr);
for (int n = 0; n < input_rank; ++n)
{
- auto axis =
-   ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout).value();
+ auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n).value();

int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
starts[axis] = start_value;
@@ -511,12 +500,10 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
}

// Set mask bits such as order of inputData
- const auto begin_mask =
-   acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank, frontend_layout);
- const auto end_mask =
-   acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank, frontend_layout);
+ const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank);
+ const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank);
const auto shrink_axis_mask =
-   acl_common::ReorderBits<int32_t>(node.param().shrink_axis_mask, input_rank, frontend_layout);
+   acl_common::ReorderBits<int32_t>(node.param().shrink_axis_mask, input_rank);

::arm_compute::Coordinates starts_set;
::arm_compute::Coordinates ends_set;
@@ -559,7 +546,6 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)

auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
- const auto frontend_layout = _current_layout;

const auto &perms = _ctx.at(perm_idx);
std::vector<int32_t> pv;
@@ -587,7 +573,7 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
}
else
{
- auto backend_pv = acl_common::getARMComputePermutationVector(rank, pv, frontend_layout);
+ auto backend_pv = acl_common::getARMComputePermutationVector(rank, pv);

fn = acl_common::generateLayer<arm_compute::CLPermute>(ifm_tensor->handle(),
ofm_tensor->handle(), backend_pv);
@@ -836,9 +822,8 @@ void KernelGenerator::visit(const ir::operation::OneHot &node)
auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx);

const size_t output_rank = _ctx.at(output_idx).shape().rank();
- const auto frontend_layout = _current_layout;
int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis;
- axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout).value();
+ axis = acl_common::ToARMComputeAxis(output_rank, axis).value();

if (output_tensor->num_dimensions() != output_tensor->info()->num_dimensions())
{
@@ -886,11 +871,9 @@ void KernelGenerator::visit(const ir::operation::Pack &node)
for (const auto &input_index : input_indexes)
inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle());

- const auto frontend_layout = _current_layout;
-
if (axis < 0)
axis += output_rank;
- axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout).value();
+ axis = acl_common::ToARMComputeAxis(output_rank, axis).value();

// Disable applied dim_correction
for (const auto &input_index : input_indexes)
@@ -921,7 +904,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node)
void KernelGenerator::visit(const ir::operation::Pool2D &node)
{
auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::CLPoolingLayer>(
- node, _ctx, _tensor_reg, _current_layout, acl_common::convertPoolType(node.param().op_type));
+ node, _ctx, _tensor_reg, acl_common::convertPoolType(node.param().op_type));

const auto ofm_index{node.getOutputs().at(0)};
auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
@@ -1168,9 +1151,9 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node)
const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};

- const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
- const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
- const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_layout);
+ const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(ir::Layout::NHWC);
+ const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(ir::Layout::NHWC);
+ const auto ker_shape = _ctx.at(ker_index).shape().asFeature(ir::Layout::NHWC);

const auto stride = node.param().stride;

@@ -1257,16 +1240,6 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
auto indices_tensor = _tensor_reg->getAclTensor(indices_index);

- // NOTE The frontend layout and backend layout must be the same for this operation.
- // If not the same, we have to add a stage(?) to perform permutation of output tensor. It
- // is not not efficient even if it works well. If so, it would be better to set the
- // layout of these backend tensors to the same layout.
- // There is also one thing we have to think about. This operation depends on the layout of
- // a model. For example, if a model in NHWC has this operation as output rank == 4, indices
- // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W
- // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case.
- assert(ifm_rank < 4 || _current_layout == ir::Layout::NHWC);
-
// input is n-D, indices k-D, output is (n + k - 1)-D
size_t n = ifm_rank;
assert(n == ifm_tensor->num_dimensions());
@@ -1315,15 +1288,14 @@ void KernelGenerator::visit(const ir::operation::ArgMinMax &node)
auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
- auto frontend_layout = _current_layout;

int axis_value = _ctx.at(axis_index).asScalar<int32_t>();
if (axis_value < 0)
{
axis_value += ifm_rank;
}

- auto acl_axis = acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout).value();
+ auto acl_axis = acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
auto reduce_type = node.param().is_arg_max ? ::arm_compute::ReductionOperation::ARG_IDX_MAX
: ::arm_compute::ReductionOperation::ARG_IDX_MIN;
auto fn = acl_common::generateLayer<arm_compute::CLArgMinMaxLayerEx>(
@@ -1393,11 +1365,10 @@ void KernelGenerator::visit(const ir::operation::Split &node)
for (const auto &ofm_ind : output_indexes)
output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());

- const auto frontend_layout = _current_layout;
auto axis = _ctx.at(axis_index).asScalar<int32_t>();
if (axis < 0)
axis += ifm_rank;
- axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout).value();
+ axis = acl_common::ToARMComputeAxis(ifm_rank, axis).value();

auto fn =
acl_common::generateLayer<arm_compute::CLSplit>(ifm_tensor->handle(), output_tensors, axis);
@@ -1431,16 +1402,14 @@ void KernelGenerator::visit(const ir::operation::SplitV &node)
{
int32_t split_dim = split_dim_op.asScalar<int32_t>();
uint32_t split_dim_revised = (split_dim < 0) ? (split_dim + ifm_rank) : split_dim;
- const auto frontend_layout = _current_layout;

if (ifm_tensor->num_dimensions() != ifm_tensor->info()->num_dimensions())
{
// This means that high dimension's value is 1 and ifm tensor is applied dim_correction
acl_common::disableDimCorrection(ifm_tensor);
}

- split_dim_revised =
-   acl_common::ToARMComputeAxis(ifm_rank, split_dim_revised, frontend_layout).value();
+ split_dim_revised = acl_common::ToARMComputeAxis(ifm_rank, split_dim_revised).value();
fn->configure(ifm_tensor->handle(), size_split_tensor->handle(), split_dim_revised,
output_tensors, node.param().num_splits);

@@ -1473,10 +1442,9 @@ void KernelGenerator::visit(const ir::operation::Unpack &node)
for (const auto &output_index : output_indexes)
outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle());

- const auto frontend_layout = _current_layout;
if (axis < 0)
axis += input_rank;
- axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout).value();
+ axis = acl_common::ToARMComputeAxis(input_rank, axis).value();

// Disable applied dim_correction
if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
@@ -1515,15 +1483,13 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
auto input = _tensor_reg->getAclTensor(input_index)->handle();
auto output = _tensor_reg->getAclTensor(output_index)->handle();

- const auto frontend_layout = _current_layout;
-
::arm_compute::PaddingList padding_list;
padding_list.resize(rank);
for (int32_t n = 0; n < rank; ++n)
{
const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);

- const auto axis = acl_common::ToARMComputeAxis(rank, n, frontend_layout).value();
+ const auto axis = acl_common::ToARMComputeAxis(rank, n).value();
padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
}

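The StridedSlice mask handling above relies on the same axis reversal: ReorderBits remaps each frontend mask bit to the ACL axis it now corresponds to. A rough sketch of that bit reordering under the NHWC-only assumption (illustrative; not the actual acl_common::ReorderBits template):

#include <cstdint>

// Illustrative only: move frontend mask bit n to ACL axis (rank - n - 1),
// mirroring the conversion applied to the starts/ends coordinates above.
inline int32_t reorderMaskBits(int32_t mask, uint32_t rank)
{
  int32_t out = 0;
  for (uint32_t n = 0; n < rank; ++n)
  {
    if (mask & (1 << n))
      out |= 1 << (rank - n - 1);
  }
  return out;
}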
1 change: 0 additions & 1 deletion runtime/onert/backend/acl_cl/KernelGenerator.h
@@ -90,7 +90,6 @@ class KernelGenerator : public basic::KernelGeneratorBase
private:
const ir::Operands &_ctx;
const ir::Operations &_operations_ctx;
- const ir::Layout _current_layout;
std::shared_ptr<TensorBuilder> _tensor_builder;
std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> _tensor_reg;
};
5 changes: 1 addition & 4 deletions runtime/onert/backend/acl_cl/Optimizer.cc
@@ -44,10 +44,7 @@ void Optimizer::optimize()
acl_common::AclSubTensorAnalyzer sa{*_context->graph()};
sa.setUsePadding();
_context->graph()->operations().iterate(
- [&](const ir::OperationIndex &, const ir::IOperation &op) {
-   sa.setLayout(_context->graph()->layout());
-   op.accept(sa);
- });
+ [&](const ir::OperationIndex &, const ir::IOperation &op) { op.accept(sa); });

_tensor_builder->parent_map(sa.releaseParentMap());
}
5 changes: 2 additions & 3 deletions runtime/onert/backend/acl_common/AclBackendContext.h
@@ -60,9 +60,8 @@ class AclBackendContext
if (this->external_operands().contains(ind))
return;

- const auto frontend_layout = this->graph()->layout();
- ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, ir::Layout::NHWC),
-                              obj.typeInfo(), obj.info().memAllocType(), obj.isConstant()};
+ ir::OperandInfo backend_info{obj.shape(), obj.typeInfo(), obj.info().memAllocType(),
+                              obj.isConstant()};
this->tensor_builder->registerTensorInfo(ind, backend_info);
});

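The AclBackendContext change above stops permuting operand shapes from the frontend layout into NHWC before registering tensor info; the shape is now registered as-is. For reference, a sketch of what the dropped NCHW-to-NHWC shape permutation amounted to (illustrative; not the actual ir::permuteShape implementation):

#include <array>
#include <cstdint>

// Illustrative only: reorder a 4-D NCHW shape into NHWC index order,
// e.g. {1, 3, 224, 224} -> {1, 224, 224, 3}.
inline std::array<int32_t, 4> nchwToNhwc(const std::array<int32_t, 4> &nchw)
{
  return {nchw[0], nchw[2], nchw[3], nchw[1]}; // N, H, W, C
}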