From 609feeb4ffef3b404737788f1ac1d077beb1d9b0 Mon Sep 17 00:00:00 2001
From: Hyeongseok Oh <hseok82.oh@samsung.com>
Date: Tue, 3 Dec 2024 19:05:02 +0900
Subject: [PATCH] [onert] Remove ArgMinMax CL extension

This commit removes ARMCompute CL extension for ArgMinMax.
ArgMinMax extension is introduced to support int64 output type, but it is supported by ARMCompute now.

ONE-DCO-1.0-Signed-off-by: Hyeongseok Oh <hseok82.oh@samsung.com>
---
 .../CL/kernels/CLArgMinMaxLayerKernelEx.h     | 115 ----
 .../arm_compute/runtime/CL/CLFunctionsEx.h    |   1 -
 .../runtime/CL/functions/CLArgMinMaxLayerEx.h | 109 ----
 .../src/core/CL/CLKernelLibrary.cpp           |   4 -
 .../src/core/CL/cl_kernels/arg_min_max_ex.cl  | 564 ------------------
 .../CL/kernels/CLArgMinMaxLayerKernelEx.cpp   | 332 -----------
 .../CL/functions/CLArgMinMaxLayerEx.cpp       | 224 -------
 .../onert/backend/acl_cl/KernelGenerator.cc   |   2 +-
 8 files changed, 1 insertion(+), 1350 deletions(-)
 delete mode 100644 compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h
 delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h
 delete mode 100644 compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl
 delete mode 100644 compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp
 delete mode 100644 compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp

diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h
deleted file mode 100644
index 46d4ae8589b..00000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H
-#define ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H
-
-#include "src/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the reduction operation kernel
- *
- * @note The default data type for an uninitialized output tensor is
- *       signed 32-bit integer (S32). It is the user's responsibility to check
- *       that the results do not overflow because the indices are computed
- *       in unsigned 32-bit (U32).
- */
-class CLArgMinMaxLayerKernelEx : public ICLKernel
-{
-public:
-  /** Default constructor */
-  CLArgMinMaxLayerKernelEx();
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  CLArgMinMaxLayerKernelEx(const CLArgMinMaxLayerKernelEx &) = delete;
-  /** Prevent instances of this class from being copied (As this class contains pointers) */
-  CLArgMinMaxLayerKernelEx &operator=(const CLArgMinMaxLayerKernelEx &) = delete;
-  /** Allow instances of this class to be moved */
-  CLArgMinMaxLayerKernelEx(CLArgMinMaxLayerKernelEx &&) = default;
-  /** Allow instances of this class to be moved */
-  CLArgMinMaxLayerKernelEx &operator=(CLArgMinMaxLayerKernelEx &&) = default;
-  /** Default destructor */
-  ~CLArgMinMaxLayerKernelEx() = default;
-
-  /** Set the input and output tensors.
-   *
-   * @param[in]  input       Source tensor. Data types supported: S32/F16/F32.
-   * @param[in]  prev_output Destination tensor of the previous iterations of @ref
-   * CLArgMinMaxLayerKernelEx. Data types supported: U32/S32
-   *                         Has to be nullptr for the first iteration
-   * @param[out] output      Destination tensor. Data types supported: U32/S32
-   *                         Output will have the same number of dimensions as input.
-   * @param[in]  axis        Axis along which to reduce. Supported reduction axis : 0,1,2,3
-   * @param[in]  op          Reduction operation to perform. Only ArgMin and ArgMax are supported.
-   */
-  void configure(const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output,
-                 unsigned int axis, ReductionOperation op);
-
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * CLArgMinMaxLayerKernelEx.
-   *
-   * @param[in] input       Source tensor info. Data types supported: S32/F16/F32.
-   * @param[in] prev_output Destination tensor info of the previous iterations. Data types
-   * supported: U32/S32
-   *                        Has to be nullptr for the first iteration
-   * @param[in] output      Destination tensor info. Data types supported: U32/S32
-   *                        Output will have the same number of dimensions as input.
-   * @param[in] axis        Axis along which to reduce. Supported reduction axis : 0,1,2,3
-   * @param[in] op          Reduction operation to perform.  Only ArgMin and ArgMax are supported.
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, const ITensorInfo *prev_output,
-                         const ITensorInfo *output, unsigned int axis, ReductionOperation op);
-
-  // Inherited methods overridden:
-  void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-  const ICLTensor *_input;
-  const ICLTensor *_prev_output;
-  ICLTensor *_output;
-  unsigned int _reduction_axis;
-  ReductionOperation _op;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
index 664b8b3b1a3..d12d9180e28 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
@@ -16,7 +16,6 @@
 #ifndef __ARM_COMPUTE_CLFUNCTIONSEX_H__
 #define __ARM_COMPUTE_CLFUNCTIONSEX_H__
 
-#include <arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h>
 #include <arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h>
 #include <arm_compute/runtime/CL/functions/CLCastBool.h>
 #include <arm_compute/runtime/CL/functions/CLEmbeddingLookup.h>
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h
deleted file mode 100644
index 05bcc40755c..00000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__
-#define __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__
-
-#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h"
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-namespace arm_compute
-{
-class ITensorInfo;
-class ICLTensor;
-
-/** Function to calculate the index of the minimum or maximum values in a
- *  tensor based on an axis.
- *
- * @note The default data type for an uninitialized output tensor is
- *       signed 32-bit integer (S32). It is the user's responsibility to check
- *       that the results do not overflow because the indices are computed
- *       in unsigned 32-bit (U32).
- */
-class CLArgMinMaxLayerEx : public IFunction
-{
-public:
-  /** Default Constructor.
-   *
-   * @param[in] memory_manager (Optional) Memory manager.
-   */
-  CLArgMinMaxLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-  /** Set the input and output tensors.
-   *
-   * @param[in]  input  Input source tensor. Data types supported: QASYMM8/F16/F32.
-   * @param[in]  axis   Axis to find max/min index.
-   * @param[out] output Output source tensor. Data types supported: U32/S32.
-   * @param[in]  op     Reduction operation to perform. Operations supported: ARG_IDX_MAX,
-   * ARG_IDX_MIN
-   */
-  void configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op);
-  /** Static function to check if given info will lead to a valid configuration of @ref
-   * CLArgMinMaxLayerEx
-   *
-   * @param[in] input  Input source tensor info. Data types supported: QASYMM8/F16/F32.
-   * @param[in] axis   Axis to find max/min index.
-   * @param[in] output Output source tensor info. Data types supported: U32/S32.
-   * @param[in] op     Reduction operation to perform. Operations supported: ARG_IDX_MAX,
-   * ARG_IDX_MIN
-   *
-   * @return a status
-   */
-  static Status validate(const ITensorInfo *input, int axis, const ITensorInfo *output,
-                         const ReductionOperation &op);
-
-  // Inherited methods overridden:
-  void run() override;
-
-private:
-  MemoryGroup _memory_group;
-  std::vector<CLTensor> _results_vector;
-  CLTensor _not_reshaped_output;
-  std::vector<CLArgMinMaxLayerKernelEx> _reduction_kernels_vector;
-  CLReshapeLayer _reshape_kernel;
-  unsigned int _num_of_stages;
-  unsigned int _reduction_axis;
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index e15dc2685cb..dce115b2af1 100644
--- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -92,10 +92,6 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
   {
     "activation_float_helpers.h",
 #include "./cl_kernels/activation_float_helpers.hembed"
-  },
-  {
-    "arg_min_max_ex.cl",
-#include "./cl_kernels/arg_min_max_ex.clembed"
   },
   {
     "binary_logical_op.cl",
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl
deleted file mode 100644
index 135cacf596f..00000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl
+++ /dev/null
@@ -1,564 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(FLOAT_DATA_TYPE)
-#define ISGREATER(x, y) isgreater(x, y)
-#define ISLESS(x, y) isless(x, y)
-#else // !FLOAT_DATA_TYPE
-#if defined(WIDTH)
-#define ISGREATER(x, y) (x > y) ? 1 : 0
-#define ISLESS(x, y) (x < y) ? 1 : 0
-#else // !defined(WIDTH)
-#define ISGREATER(x, y) \
-  select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x > y)
-#define ISLESS(x, y) \
-  select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x < y)
-#endif // defined(WIDTH)
-#endif // defined(FLOAT_DATA_TYPE)
-
-#if defined(ARG_MAX)
-#define CONDITION_TO_USE(x, y) ISGREATER(x, y)
-#elif defined(ARG_MIN)
-#define CONDITION_TO_USE(x, y) ISLESS(x, y)
-#else // !(defined(ARG_MAX) || defined(ARG_MIN))
-#error "Unsupported reduction operation!"
-#endif // defined(ARG_MAX)
-
-#if defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT)
-#if defined(WIDTH)
-#if defined(ARG_MIN)
-#if defined(PREV_OUTPUT)
-/** Find index minimum value of a vector
- *
- * @param[in] input Pointer to the first value.
- *
- * @return index of the vector.
- */
-inline DATA_TYPE_OUTPUT arg_idx_min_prev_out(__global const DATA_TYPE *input,
-                                             __global const DATA_TYPE_OUTPUT *prev_res,
-                                             const int x_idx)
-{
-  int end_elem = (x_idx + 1) * 16;
-  if (end_elem > WIDTH)
-  {
-    end_elem = WIDTH - x_idx * 16;
-  }
-  DATA_TYPE_OUTPUT res = prev_res[0];
-  for (int x_v = 1; x_v < end_elem; ++x_v)
-  {
-    res = select(res, prev_res[x_v], *(input + prev_res[x_v]) < *(input + res));
-  }
-  return res;
-}
-#else // !defined(PREV_OUTPUT)
-/** Find index minimum value of a vector
- *
- * @param[in] input Pointer to the first value.
- *
- * @return index of the vector.
- */
-inline DATA_TYPE_OUTPUT arg_idx_min(__global const DATA_TYPE *input, const int x_idx)
-{
-#if WIDTH < 16
-  DATA_TYPE_OUTPUT res = 0;
-  for (DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
-  {
-    res = select(res, x_v, *(input + x_v) < *(input + res));
-  }
-  return res;
-#else  // WIDTH >= 16
-  int x_elem = x_idx * 16;
-  const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
-  x_elem -= x_goback;
-
-  VEC_DATA_TYPE(DATA_TYPE, 16)
-  in = vload16(0, input - x_goback);
-  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
-  res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
-
-  VEC_DATA_TYPE(DATA_TYPE_SELECT, 8)
-  idx_sel = (in.s01234567 <= in.s89abcdef);
-  in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel);
-  res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
-
-  idx_sel.s0123 =
-    (in.s0123 < in.s4567) ||
-    (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4)));
-  in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123);
-  res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
-
-  idx_sel.s01 =
-    (in.s01 < in.s23) ||
-    (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2)));
-  in.s01 = select(in.s23, in.s01, idx_sel.s01);
-  res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
-
-  idx_sel.s0 = (in.s0 < in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT));
-  res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
-
-  return res.s0 + x_elem;
-#endif // WIDTH < 16
-}
-#endif // defined(PREV_OUTPUT)
-#endif // defined(ARG_MIN)
-#if defined(ARG_MAX)
-#if defined(PREV_OUTPUT)
-/** Find index maximum value of a vector
- *
- * @param[in] input Pointer to the first value.
- *
- * @return index of the vector.
- */
-inline DATA_TYPE_OUTPUT arg_idx_max_prev_out(__global const DATA_TYPE *input,
-                                             __global const DATA_TYPE_OUTPUT *prev_res,
-                                             const int x_idx)
-{
-  int end_elem = (x_idx + 1) * 16;
-  if (end_elem > WIDTH)
-  {
-    end_elem = WIDTH - x_idx * 16;
-  }
-  DATA_TYPE_OUTPUT res = prev_res[0];
-  unsigned int res_int = res;
-  DATA_TYPE_OUTPUT condition_check2;
-  for (int x_v = 1; x_v < end_elem; ++x_v)
-  {
-    int i1 = prev_res[x_v];
-    condition_check2 = *(input + i1) > *(input + res_int);
-    res = select(res, prev_res[x_v], condition_check2);
-  }
-  return res;
-}
-#else // !defined(PREV_OUTPUT)
-/** Find index maximum value of a vector
- *
- * @param[in] input Pointer to the first value.
- *
- * @return index of the vector.
- */
-inline DATA_TYPE_OUTPUT arg_idx_max(__global const DATA_TYPE *input, const int x_idx)
-{
-#if WIDTH < 16
-  DATA_TYPE_OUTPUT res = 0;
-  unsigned int i1;
-  unsigned int i2;
-  DATA_TYPE_OUTPUT condition_check;
-  for (DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
-  {
-    i1 = x_v;
-    i2 = res;
-    condition_check = *(input + i1) > *(input + i2);
-    res = select(res, x_v, condition_check);
-  }
-  return res;
-#else  // WIDTH >= 16
-  int x_elem = x_idx * 16;
-  const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
-  x_elem -= x_goback;
-
-  VEC_DATA_TYPE(DATA_TYPE, 16)
-  in = vload16(0, input - x_goback);
-  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
-  res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
-
-  VEC_DATA_TYPE(DATA_TYPE_SELECT, 8)
-  idx_sel = (in.s01234567 >= in.s89abcdef);
-  in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel);
-  res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
-
-  idx_sel.s0123 =
-    (in.s0123 > in.s4567) ||
-    (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4)));
-  in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123);
-  res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
-
-  idx_sel.s01 =
-    (in.s01 > in.s23) ||
-    (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2)));
-  in.s01 = select(in.s23, in.s01, idx_sel.s01);
-  res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
-
-  idx_sel.s0 = (in.s0 > in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT));
-  res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
-
-  return res.s0 + x_elem;
-#endif // WIDTH < 16
-}
-#endif // defined(PREV_OUTPUT)
-#endif // defined(ARG_MAX)
-
-/** This kernel performs parallel reduction given an operation on x-axis.
- *
- * @note In case the results of previous stages are passed the flag PREV_OUTPUT has to be passed
- * using -DPREV_OUTPUT
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g.
- * -DDATA_TYPE_OUTPUT=uint
- * @note The arg_max flag must be passed at compile time using -DARG_MAX if we want to compute the
- * ArgMax
- * @note The arg_min flag must be passed at compile time using -DARG_MIN if we want to compute the
- * ArgMin
- *
- * @param[in] src_ptr                                   Pointer to the source tensor. Supported data
- * types: S32/F16/F32
- * @param[in] src_stride_x                              Stride of the source tensor in X dimension
- * (in bytes)
- * @param[in] src_step_x                                src_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] src_stride_y                              Stride of the source tensor in Y dimension
- * (in bytes)
- * @param[in] src_step_y                                src_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes         The offset of the first element in the
- * source tensor
- * @param[in] prev_res_ptr                              (Optional) Pointer to previous results
- * tensor. Supported data types: U32/S32
- * @param[in] prev_res_stride_x                         (Optional) Stride of the output tensor in X
- * dimension (in bytes)
- * @param[in] prev_res_step_x                           (Optional) prev_res_stride_x * number of
- * elements along X processed per workitem(in bytes)
- * @param[in] prev_res_stride_y                         (Optional) Stride of the output tensor in Y
- * dimension (in bytes)
- * @param[in] prev_res_step_y                           (Optional) prev_res_stride_y * number of
- * elements along Y processed per workitem(in bytes)
- * @param[in] prev_res_offset_first_element_in_bytes    (Optional) The offset of the first element
- * in the previous results tensor
- * @param[in] partial_res_ptr                           The local buffer to hold partial result
- * values. Supported data types: U32/S32
- * @param[in] partial_res_stride_x                      Stride of the output tensor in X dimension
- * (in bytes)
- * @param[in] partial_res_step_x                        partial_res_stride_x * number of elements
- * along X processed per workitem(in bytes)
- * @param[in] partial_res_stride_y                      Stride of the output tensor in Y dimension
- * (in bytes)
- * @param[in] partial_res_step_y                        partial_res_stride_y * number of elements
- * along Y processed per workitem(in bytes)
- * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the
- * source tensor
- * @param[in] local_results                             Local buffer for storing the partial result
- */
-__kernel void arg_min_max_ex_x(IMAGE_DECLARATION(src),
-#if defined(PREV_OUTPUT)
-                               IMAGE_DECLARATION(prev_res),
-#endif // defined(PREV_OUTPUT)
-                               IMAGE_DECLARATION(partial_res),
-                               __local DATA_TYPE_OUTPUT *local_results)
-{
-#if defined(PREV_OUTPUT)
-  Image src = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src);
-  Image prev_res = CONVERT_TO_IMAGE_STRUCT(prev_res);
-#else  // !defined(PREV_OUTPUT)
-  Image src = CONVERT_TO_IMAGE_STRUCT(src);
-#endif // defined(PREV_OUTPUT)
-  Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res);
-
-  unsigned int lsize = get_local_size(0);
-  unsigned int lid = get_local_id(0);
-
-  const uint x_idx = get_global_id(0);
-  const uint y_idx = get_global_id(1);
-  const __global DATA_TYPE *src_in_row =
-    (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + y_idx * src_step_y);
-
-  for (unsigned int y = 0; y < get_local_size(1); ++y)
-  {
-#if defined(ARG_MAX)
-#if defined(PREV_OUTPUT)
-    local_results[lid] =
-      arg_idx_max_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx);
-#else  // !defined(PREV_OUTPUT)
-    local_results[lid] = arg_idx_max((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
-#endif // defined(PREV_OUTPUT)
-#else  // defined(ARG_MIN)
-#if defined(PREV_OUTPUT)
-    local_results[lid] =
-      arg_idx_min_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx);
-#else  // !defined(PREV_OUTPUT)
-    local_results[lid] = arg_idx_min((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
-#endif // defined(PREV_OUTPUT)
-#endif // defined(ARG_MAX) || defined(ARG_MIN)
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Looking for the next highest power of 2 (maximum value of lsize is 8)
-    unsigned int middle = lsize - 1;
-    middle |= middle >> 1;
-    middle |= middle >> 2;
-    middle += 1;
-    // Perform parallel reduction
-    DATA_TYPE_OUTPUT condition_check3;
-    for (unsigned int i = middle; i > 0; i >>= 1)
-    {
-      if (lid < i && lid + i < lsize)
-      {
-        DATA_TYPE tmp0 = *(src_in_row + local_results[lid]);
-        DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]);
-#if defined(ARG_MAX)
-        condition_check3 =
-          ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1);
-        local_results[lid] = select(local_results[lid], local_results[lid + i], condition_check3);
-#else  // defined(ARG_MIN)
-        local_results[lid] = select(
-          local_results[lid], local_results[lid + i],
-          ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1));
-#endif // defined(ARG_MAX) || defined(ARG_MIN)
-      }
-      barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    if (lid == 0)
-    {
-      ((__global DATA_TYPE_OUTPUT *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
-    }
-  }
-}
-#endif // defined(WIDTH)
-
-#if defined(HEIGHT)
-/** This kernel performs reduction on y-axis.
- *
- * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g.
- * -DDATA_TYPE=float
- * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g.
- * -DDATA_TYPE_OUTPUT=uint
- * @note The data type of the select results must be passed at compile time using
- * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
- * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
- *
- * @param[in] src_ptr                              Pointer to the source tensor. Supported data
- * types: S32/F16/F32
- * @param[in] src_stride_x                         Stride of the source tensor in X dimension (in
- * bytes)
- * @param[in] src_step_x                           src_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] src_stride_y                         Stride of the source tensor in Y dimension (in
- * bytes)
- * @param[in] src_step_y                           src_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes    The offset of the first element in the source
- * tensor
- * @param[in] output_ptr                           The local buffer to hold sumed values. Supported
- * data types: U32/S32
- * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in
- * bytes)
- * @param[in] output_step_x                        output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in
- * bytes)
- * @param[in] output_step_y                        output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source
- * tensor
- */
-__kernel void arg_min_max_ex_y(IMAGE_DECLARATION(src), IMAGE_DECLARATION(output))
-{
-  Image src = CONVERT_TO_IMAGE_STRUCT(src);
-  Image output = CONVERT_TO_IMAGE_STRUCT(output);
-
-  VEC_DATA_TYPE(DATA_TYPE, 16)
-  res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE, 16));
-
-  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
-  indx = 0;
-  for (unsigned int y = 1; y < HEIGHT; ++y)
-  {
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    in =
-      CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE, 16));
-
-    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
-    cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
-    indx = select(indx, y, cond_conv);
-    res = select(res, in, CONDITION_TO_USE(in, res));
-  }
-
-  // Store result
-  vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
-}
-#endif // defined(HEIGHT)
-
-#if defined(DEPTH)
-/** This kernel performs reduction on z-axis.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The data type of the select results must be passed at compile time using
- * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
- * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
- *
- * @param[in] input_ptr                            Pointer to the source tensor. Supported data
- * types: S32/F16/F32
- * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in
- * bytes)
- * @param[in] input_step_x                         input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in
- * bytes)
- * @param[in] input_step_y                         input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z                         input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source
- * tensor
- * @param[in] output_ptr                           The local buffer to hold sumed values. Supported
- * data types: U32/S32
- * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in
- * bytes)
- * @param[in] output_step_x                        output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in
- * bytes)
- * @param[in] output_step_y                        output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z                      Stride of the output tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z                        output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source
- * tensor
- */
-__kernel void arg_min_max_ex_z(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
-{
-  Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-  VEC_DATA_TYPE(DATA_TYPE, 16)
-  res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)),
-                VEC_DATA_TYPE(DATA_TYPE, 16));
-
-  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
-  indx = 0;
-  for (DATA_TYPE_OUTPUT z = 1; z < DEPTH; ++z)
-  {
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)),
-                 VEC_DATA_TYPE(DATA_TYPE, 16));
-
-    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
-    cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
-    indx = select(indx, z, cond_conv);
-    res = select(res, in, CONDITION_TO_USE(in, res));
-  }
-
-  // Store result
-  vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
-}
-#endif /* defined(DEPTH) */
-
-#if defined(BATCH) && defined(DEPTH)
-/** This kernel performs reduction on w-axis.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The data type of the select results must be passed at compile time using
- * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
- * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
- * @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128
- *
- * @param[in] input_ptr                            Pointer to the source tensor. Supported data
- * types: S32/F16/F32
- * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in
- * bytes)
- * @param[in] input_step_x                         input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in
- * bytes)
- * @param[in] input_step_y                         input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z                         input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_stride_w                       Stride of the source tensor in W dimension (in
- * bytes)
- * @param[in] input_step_w                         input_stride_w * number of elements along W
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source
- * tensor
- * @param[in] output_ptr                           The local buffer to hold sumed values. Supported
- * data types: U32/S32
- * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in
- * bytes)
- * @param[in] output_step_x                        output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in
- * bytes)
- * @param[in] output_step_y                        output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z                      Stride of the output tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z                        output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_stride_w                      Stride of the output tensor in W dimension (in
- * bytes)
- * @param[in] output_step_w                        output_stride_w * number of elements along W
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source
- * tensor
- */
-__kernel void arg_min_max_ex_w(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
-{
-  Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);
-  Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
-
-  VEC_DATA_TYPE(DATA_TYPE, 16)
-  res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)),
-                VEC_DATA_TYPE(DATA_TYPE, 16));
-
-  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
-  indx = 0;
-  for (DATA_TYPE_OUTPUT w = 1; w < BATCH; ++w)
-  {
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)),
-                 VEC_DATA_TYPE(DATA_TYPE, 16));
-
-    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
-    cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
-    indx = select(indx, w, cond_conv);
-    res = select(res, in, CONDITION_TO_USE(in, res));
-  }
-
-  // Store result
-  vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
-}
-#endif /* defined(BATCH) && defined(DEPTH) */
-#endif /* defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT) */
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp
deleted file mode 100644
index 9874097390f..00000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp
+++ /dev/null
@@ -1,332 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace
-{
-constexpr unsigned int vector_size = 16;
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *prev_output,
-                          const ITensorInfo *output, unsigned int axis, ReductionOperation op)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8,
-                                                       DataType::QASYMM8_SIGNED, DataType::S32,
-                                                       DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX &&
-                                    op != ReductionOperation::ARG_IDX_MIN,
-                                  "Only ARG_IDX_MAX and ARG_IDX_MIN are supported");
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
-                                  "Reduction axis greater than max number of dimensions");
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
-
-  if (output->total_size() != 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32,
-                                                         DataType::S64);
-  }
-  if (prev_output != nullptr && prev_output->total_size() != 0)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(prev_output, 1, DataType::U32,
-                                                         DataType::S32, DataType::S64);
-    if (output->total_size() != 0)
-    {
-      ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(prev_output, output);
-    }
-  }
-
-  return Status{};
-}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input,
-                                                         ITensorInfo *prev_output,
-                                                         ITensorInfo *output, unsigned int axis,
-                                                         ReductionOperation op)
-{
-  ARM_COMPUTE_UNUSED(op);
-  // Output tensor auto initialization if not yet initialized
-  TensorShape output_shape{input->tensor_shape()};
-  output_shape.set(axis, 1);
-  DataType output_data_type = (prev_output != nullptr) ? (prev_output->data_type()) : DataType::S32;
-  auto_init_if_empty(*output, input->clone()
-                                ->set_tensor_shape(output_shape)
-                                .set_data_type(output_data_type)
-                                .reset_padding()
-                                .set_is_resizable(true));
-
-  Window win =
-    calculate_max_window((prev_output != nullptr) ? (*prev_output) : (*input), Steps(vector_size));
-  bool window_changed = false;
-
-  switch (axis)
-  {
-    case 0:
-    {
-      ITensorInfo *input_tensor_access = prev_output != nullptr ? prev_output : input;
-      AccessWindowStatic input_access(input_tensor_access, 0, 0,
-                                      static_cast<int>(input_tensor_access->dimension(0)), 1);
-      AccessWindowHorizontal output_access(output, 0, 1);
-      window_changed = update_window_and_padding(win, input_access, output_access);
-      output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-    }
-    break;
-    case 1:
-    case 2:
-    case 3:
-    {
-      AccessWindowHorizontal input_access(input, 0, vector_size);
-      AccessWindowHorizontal output_access(output, 0, vector_size);
-      window_changed = update_window_and_padding(win, input_access, output_access);
-      output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-    }
-    break;
-    default:
-      ARM_COMPUTE_ERROR("Not supported");
-  }
-
-  Status err = (window_changed)
-                 ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
-                 : Status{};
-  return std::make_tuple(err, win);
-}
-} // namespace
-
-CLArgMinMaxLayerKernelEx::CLArgMinMaxLayerKernelEx()
-  : _input(nullptr), _prev_output(nullptr), _output(nullptr), _reduction_axis(0),
-    _op(ReductionOperation::ARG_IDX_MAX)
-{
-}
-
-void CLArgMinMaxLayerKernelEx::configure(const ICLTensor *input, const ICLTensor *prev_output,
-                                         ICLTensor *output, unsigned int axis,
-                                         ReductionOperation op)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(
-    validate_arguments(input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr,
-                       output->info(), axis, op));
-  auto win_config = validate_and_configure_window(
-    input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, output->info(), axis,
-    op);
-  ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
-  _input = input;
-  _prev_output = prev_output;
-  _output = output;
-  _reduction_axis = axis;
-  _op = op;
-
-  // Set build options
-  CLBuildOptions build_opts;
-
-  build_opts.add_option_if(_prev_output != nullptr, "-DPREV_OUTPUT");
-  build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE");
-  build_opts.add_option_if_else(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX", "-DARG_MIN");
-  build_opts.add_option("-DDATA_TYPE_OUTPUT=" +
-                        get_cl_type_from_data_type(output->info()->data_type()));
-  build_opts.add_option("-DDATA_TYPE_SELECT=" +
-                        get_cl_signed_type_from_element_size(input->info()->element_size()));
-
-  // Create kernel
-  cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange();
-  std::string kernel_axis_name;
-  switch (axis)
-  {
-    case 0:
-    {
-      const ICLTensor *input_for_width = prev_output != nullptr ? _prev_output : _input;
-      build_opts.add_option("-DWIDTH=" +
-                            support::cpp11::to_string(input_for_width->info()->dimension(0)));
-
-      kernel_axis_name = "x";
-      lws_hint = create_lws_hint_parallel_implementations(input_for_width->info()->dimension(0),
-                                                          vector_size);
-    }
-    break;
-    case 1:
-      build_opts.add_option("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
-      kernel_axis_name = "y";
-      break;
-    case 2:
-      build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
-      kernel_axis_name = "z";
-      break;
-    case 3:
-      build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
-      build_opts.add_option("-DBATCH=" + support::cpp11::to_string(input->info()->dimension(3)));
-      kernel_axis_name = "w";
-      break;
-    default:
-      ARM_COMPUTE_ERROR("Not supported");
-  }
-  _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
-    "arg_min_max_ex_" + kernel_axis_name, build_opts.options()));
-
-  // Configure kernel window
-  ICLKernel::configure_internal(std::get<1>(win_config), lws_hint);
-}
-
-Status CLArgMinMaxLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *prev_output,
-                                          const ITensorInfo *output, unsigned int axis,
-                                          ReductionOperation op)
-{
-  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, prev_output, output, axis, op));
-  ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
-    input->clone().get(), (prev_output != nullptr) ? prev_output->clone().get() : nullptr,
-    output->clone().get(), axis, op)));
-  return Status{};
-}
-
-void CLArgMinMaxLayerKernelEx::run(const Window &window, cl::CommandQueue &queue)
-{
-  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-  switch (_reduction_axis)
-  {
-    case 0:
-    {
-      // Set out window
-      Window out_window(window);
-      out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
-
-      // Get first input and output slices
-      Window in_slice = window.first_slice_window_2D();
-      Window out_slice = out_window.first_slice_window_2D();
-
-      // Reshape window
-      const unsigned int num_tensors = _prev_output != nullptr ? 3 : 2;
-
-      // Set local sums buffer
-      unsigned int local_res_size = lws_hint()[0] * _output->info()->element_size();
-      _kernel.setArg(num_arguments_per_2D_tensor() * num_tensors, local_res_size, nullptr);
-      do
-      {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, in_slice);
-        if (_prev_output != nullptr)
-        {
-          add_2D_tensor_argument(idx, _prev_output, in_slice);
-        }
-        add_2D_tensor_argument(idx, _output, out_slice);
-        enqueue(queue, *this, in_slice, lws_hint());
-      } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
-    }
-    break;
-    case 1:
-    {
-      // Get first input and output slices
-      Window window_in{window};
-      window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1),
-                                                    _input->info()->dimension(1)));
-      Window in_slice = window_in.first_slice_window_2D();
-      Window out_slice = window.first_slice_window_2D();
-
-      do
-      {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, in_slice);
-        add_2D_tensor_argument(idx, _output, out_slice);
-        enqueue(queue, *this, in_slice, lws_hint());
-      } while (window_in.slide_window_slice_2D(in_slice) &&
-               window.slide_window_slice_2D(out_slice));
-    }
-    break;
-    case 2:
-    {
-      // Get first input and output slices
-      Window window_in{window};
-      window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2),
-                                                    _input->info()->dimension(2)));
-      Window in_slice = window_in.first_slice_window_3D();
-      Window out_slice = window.first_slice_window_3D();
-
-      do
-      {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, in_slice);
-        add_3D_tensor_argument(idx, _output, out_slice);
-        enqueue(queue, *this, in_slice, lws_hint());
-      } while (window_in.slide_window_slice_3D(in_slice) &&
-               window.slide_window_slice_3D(out_slice));
-    }
-    break;
-    case 3:
-    {
-      // Get first input and output slices
-      Window window_in{window};
-      window_in.set(3, Window::Dimension(0, 1, 1));
-      Window in_slice = window_in.first_slice_window_4D();
-      Window out_slice = window.first_slice_window_4D();
-
-      do
-      {
-        unsigned int idx = 0;
-        add_4D_tensor_argument(idx, _input, in_slice);
-        add_4D_tensor_argument(idx, _output, out_slice);
-        enqueue(queue, *this, in_slice, lws_hint());
-      } while (window_in.slide_window_slice_4D(in_slice) &&
-               window.slide_window_slice_4D(out_slice));
-    }
-    break;
-    default:
-      ARM_COMPUTE_ERROR("Not supported");
-  }
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp
deleted file mode 100644
index 6b9b0d4b46d..00000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/runtime/Utils.h"
-
-namespace arm_compute
-{
-CLArgMinMaxLayerEx::CLArgMinMaxLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
-  : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(),
-    _reduction_kernels_vector(), _reshape_kernel(), _num_of_stages(), _reduction_axis()
-{
-}
-
-Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const ITensorInfo *output,
-                                    const ReductionOperation &op)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX &&
-                                    op != ReductionOperation::ARG_IDX_MIN,
-                                  "Invalid reduction operation");
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions),
-                                  "Reduction axis greater than max number of dimensions");
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
-  const unsigned int num_of_stages =
-    utils::calculate_number_of_stages_only_x_axis(input->dimension(0), axis);
-
-  DataType output_data_type = DataType::S32;
-  TensorInfo not_reshaped_output;
-  const auto input_num_channles = input->num_channels();
-  const auto input_qinfo = input->quantization_info();
-
-  if (output->total_size() != 0)
-  {
-    output_data_type = output->data_type();
-    const TensorInfo expected_output_shape =
-      output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(
-        input->tensor_shape(), axis, false));
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
-  }
-
-  auto shape_before_reshape = input->tensor_shape();
-  shape_before_reshape.set(axis, 1);
-  auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type,
-                                  int num_channels, QuantizationInfo qinfo) {
-    ti.set_data_type(data_type)
-      .set_tensor_shape(shape)
-      .set_num_channels(num_channels)
-      .set_quantization_info(qinfo);
-  };
-
-  initialize_tensorinfo(not_reshaped_output, shape_before_reshape, output_data_type,
-                        input_num_channles, input_qinfo);
-
-  if (num_of_stages == 1)
-  {
-    ARM_COMPUTE_RETURN_ON_ERROR(
-      CLArgMinMaxLayerKernelEx::validate(input, nullptr, &not_reshaped_output, axis, op));
-  }
-  else
-  {
-    // Create temporary tensor infos
-    std::vector<TensorInfo> sums_vector(num_of_stages - 1);
-
-    // Create intermediate tensor info
-    TensorShape shape{input->tensor_shape()};
-
-    for (unsigned int i = 0; i < num_of_stages - 1; i++)
-    {
-      shape.set(0, ceil(shape.x() / 128.f));
-      sums_vector[i].set_data_type(input->data_type());
-      sums_vector[i].set_tensor_shape(shape);
-      sums_vector[i].set_num_channels(input->num_channels());
-    }
-
-    // Validate ReductionOperation only on first kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(
-      CLArgMinMaxLayerKernelEx::validate(input, nullptr, &sums_vector[0], axis, op));
-
-    // Validate ReductionOperation on intermediate stages
-    for (unsigned int i = 1; i < num_of_stages - 1; ++i)
-    {
-      ARM_COMPUTE_RETURN_ON_ERROR(
-        CLArgMinMaxLayerKernelEx::validate(input, &sums_vector[i - 1], &sums_vector[i], axis, op));
-    }
-
-    // Validate ReductionOperation on the last stage
-    const unsigned int last_stage = num_of_stages - 1;
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate(
-      input, &sums_vector[last_stage - 1], &not_reshaped_output, axis, op));
-  }
-  ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&not_reshaped_output, output));
-  return Status{};
-}
-
-void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor *output,
-                                   const ReductionOperation &op)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  _num_of_stages = utils::calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
-  _reduction_axis = axis;
-
-  const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(
-    input->info()->tensor_shape(), axis, false);
-  DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN)
-                                ? DataType::S32
-                                : output->info()->data_type();
-  auto_init_if_empty(*output->info(), input->info()
-                                        ->clone()
-                                        ->set_tensor_shape(output_shape)
-                                        .set_data_type(output_data_type)
-                                        .reset_padding()
-                                        .set_is_resizable(true));
-
-  // Configure reduction operation kernels
-  _reduction_kernels_vector.resize(_num_of_stages);
-
-  _memory_group.manage(&_not_reshaped_output);
-  // Create temporary tensors
-  if (_num_of_stages == 1)
-  {
-    // Force an early initialization for int64 output type
-    TensorShape output_shape{input->info()->tensor_shape()};
-    output_shape.set(axis, 1);
-    auto_init_if_empty(*_not_reshaped_output.info(), input->info()
-                                                       ->clone()
-                                                       ->set_tensor_shape(output_shape)
-                                                       .set_data_type(output_data_type)
-                                                       .reset_padding()
-                                                       .set_is_resizable(true));
-    _not_reshaped_output.info()->set_tensor_shape(output_shape);
-    _reduction_kernels_vector[0].configure(input, nullptr, &_not_reshaped_output, axis, op);
-  }
-  else
-  {
-    _results_vector.resize(_num_of_stages - 1);
-    TensorShape shape{input->info()->tensor_shape()};
-    for (unsigned int i = 0; i < _num_of_stages - 1; i++)
-    {
-      shape.set(0, ceil(shape.x() / 128.f));
-      _results_vector[i].allocator()->init(
-        input->info()->clone()->set_tensor_shape(shape).set_data_type(output_data_type));
-    }
-
-    // Apply ReductionOperation only on first kernel
-    _memory_group.manage(&_results_vector[0]);
-    _reduction_kernels_vector[0].configure(input, nullptr, &_results_vector[0], axis, op);
-
-    // Apply ReductionOperation on intermediate stages
-    for (unsigned int i = 1; i < _num_of_stages - 1; ++i)
-    {
-      _memory_group.manage(&_results_vector[i]);
-      _reduction_kernels_vector[i].configure(input, &_results_vector[i - 1], &_results_vector[i],
-                                             axis, op);
-      _results_vector[i - 1].allocator()->allocate();
-    }
-
-    // Apply ReductionOperation on the last stage
-    const unsigned int last_stage = _num_of_stages - 1;
-    _reduction_kernels_vector[last_stage].configure(input, &_results_vector[last_stage - 1],
-                                                    &_not_reshaped_output, axis, op);
-    _results_vector[last_stage - 1].allocator()->allocate();
-  }
-  _reshape_kernel.configure(CLKernelLibrary::get().get_compile_context(), &_not_reshaped_output,
-                            output);
-  _not_reshaped_output.allocator()->allocate();
-}
-
-void CLArgMinMaxLayerEx::run()
-{
-  MemoryGroupResourceScope scope_mg(_memory_group);
-
-  for (unsigned int i = 0; i < _num_of_stages; ++i)
-  {
-    CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
-  }
-  _reshape_kernel.run();
-}
-} // namespace arm_compute
diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.cc b/runtime/onert/backend/acl_cl/KernelGenerator.cc
index b63326c24cb..69378a0ee00 100644
--- a/runtime/onert/backend/acl_cl/KernelGenerator.cc
+++ b/runtime/onert/backend/acl_cl/KernelGenerator.cc
@@ -1274,7 +1274,7 @@ void KernelGenerator::visit(const ir::operation::ArgMinMax &node)
   auto acl_axis = acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
   auto reduce_type = node.param().is_arg_max ? ::arm_compute::ReductionOperation::ARG_IDX_MAX
                                              : ::arm_compute::ReductionOperation::ARG_IDX_MIN;
-  auto fn = acl_common::generateLayer<arm_compute::CLArgMinMaxLayerEx>(
+  auto fn = acl_common::generateLayer<arm_compute::CLArgMinMaxLayer>(
     ifm_tensor->handle(), acl_axis, ofm_tensor->handle(), reduce_type);
 
   _return_fn = asAclFunction(std::move(fn));