From 609feeb4ffef3b404737788f1ac1d077beb1d9b0 Mon Sep 17 00:00:00 2001 From: Hyeongseok Oh Date: Tue, 3 Dec 2024 19:05:02 +0900 Subject: [PATCH] [onert] Remove ArgMinMax CL extension This commit removes ARMCompute CL extension for ArgMinMax. ArgMinMax extension is introduced to support int64 output type, but it is supported by ARMCompute now. ONE-DCO-1.0-Signed-off-by: Hyeongseok Oh --- .../CL/kernels/CLArgMinMaxLayerKernelEx.h | 115 ---- .../arm_compute/runtime/CL/CLFunctionsEx.h | 1 - .../runtime/CL/functions/CLArgMinMaxLayerEx.h | 109 ---- .../src/core/CL/CLKernelLibrary.cpp | 4 - .../src/core/CL/cl_kernels/arg_min_max_ex.cl | 564 ------------------ .../CL/kernels/CLArgMinMaxLayerKernelEx.cpp | 332 ----------- .../CL/functions/CLArgMinMaxLayerEx.cpp | 224 ------- .../onert/backend/acl_cl/KernelGenerator.cc | 2 +- 8 files changed, 1 insertion(+), 1350 deletions(-) delete mode 100644 compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h delete mode 100644 compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h delete mode 100644 compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl delete mode 100644 compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp delete mode 100644 compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h deleted file mode 100644 index 46d4ae8589b..00000000000 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H -#define ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H - -#include "src/core/CL/ICLKernel.h" -#include "arm_compute/core/Types.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the reduction operation kernel - * - * @note The default data type for an uninitialized output tensor is - * signed 32-bit integer (S32). It is the user's responsibility to check - * that the results do not overflow because the indices are computed - * in unsigned 32-bit (U32). - */ -class CLArgMinMaxLayerKernelEx : public ICLKernel -{ -public: - /** Default constructor */ - CLArgMinMaxLayerKernelEx(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLArgMinMaxLayerKernelEx(const CLArgMinMaxLayerKernelEx &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLArgMinMaxLayerKernelEx &operator=(const CLArgMinMaxLayerKernelEx &) = delete; - /** Allow instances of this class to be moved */ - CLArgMinMaxLayerKernelEx(CLArgMinMaxLayerKernelEx &&) = default; - /** Allow instances of this class to be moved */ - CLArgMinMaxLayerKernelEx &operator=(CLArgMinMaxLayerKernelEx &&) = default; - /** Default destructor */ - ~CLArgMinMaxLayerKernelEx() = default; - - /** Set the input and output tensors. - * - * @param[in] input Source tensor. Data types supported: S32/F16/F32. - * @param[in] prev_output Destination tensor of the previous iterations of @ref - * CLArgMinMaxLayerKernelEx. Data types supported: U32/S32 - * Has to be nullptr for the first iteration - * @param[out] output Destination tensor. Data types supported: U32/S32 - * Output will have the same number of dimensions as input. - * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3 - * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported. - */ - void configure(const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, - unsigned int axis, ReductionOperation op); - - /** Static function to check if given info will lead to a valid configuration of @ref - * CLArgMinMaxLayerKernelEx. - * - * @param[in] input Source tensor info. Data types supported: S32/F16/F32. - * @param[in] prev_output Destination tensor info of the previous iterations. Data types - * supported: U32/S32 - * Has to be nullptr for the first iteration - * @param[in] output Destination tensor info. Data types supported: U32/S32 - * Output will have the same number of dimensions as input. - * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3 - * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *prev_output, - const ITensorInfo *output, unsigned int axis, ReductionOperation op); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - const ICLTensor *_prev_output; - ICLTensor *_output; - unsigned int _reduction_axis; - ReductionOperation _op; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h index 664b8b3b1a3..d12d9180e28 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h @@ -16,7 +16,6 @@ #ifndef __ARM_COMPUTE_CLFUNCTIONSEX_H__ #define __ARM_COMPUTE_CLFUNCTIONSEX_H__ -#include #include #include #include diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h deleted file mode 100644 index 05bcc40755c..00000000000 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__ -#define __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__ - -#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h" - -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/IMemoryManager.h" -#include "arm_compute/runtime/MemoryGroup.h" - -namespace arm_compute -{ -class ITensorInfo; -class ICLTensor; - -/** Function to calculate the index of the minimum or maximum values in a - * tensor based on an axis. - * - * @note The default data type for an uninitialized output tensor is - * signed 32-bit integer (S32). It is the user's responsibility to check - * that the results do not overflow because the indices are computed - * in unsigned 32-bit (U32). - */ -class CLArgMinMaxLayerEx : public IFunction -{ -public: - /** Default Constructor. - * - * @param[in] memory_manager (Optional) Memory manager. - */ - CLArgMinMaxLayerEx(std::shared_ptr memory_manager = nullptr); - /** Set the input and output tensors. - * - * @param[in] input Input source tensor. Data types supported: QASYMM8/F16/F32. - * @param[in] axis Axis to find max/min index. - * @param[out] output Output source tensor. Data types supported: U32/S32. - * @param[in] op Reduction operation to perform. Operations supported: ARG_IDX_MAX, - * ARG_IDX_MIN - */ - void configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLArgMinMaxLayerEx - * - * @param[in] input Input source tensor info. Data types supported: QASYMM8/F16/F32. - * @param[in] axis Axis to find max/min index. - * @param[in] output Output source tensor info. Data types supported: U32/S32. - * @param[in] op Reduction operation to perform. Operations supported: ARG_IDX_MAX, - * ARG_IDX_MIN - * - * @return a status - */ - static Status validate(const ITensorInfo *input, int axis, const ITensorInfo *output, - const ReductionOperation &op); - - // Inherited methods overridden: - void run() override; - -private: - MemoryGroup _memory_group; - std::vector _results_vector; - CLTensor _not_reshaped_output; - std::vector _reduction_kernels_vector; - CLReshapeLayer _reshape_kernel; - unsigned int _num_of_stages; - unsigned int _reduction_axis; -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp index e15dc2685cb..dce115b2af1 100644 --- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -92,10 +92,6 @@ const std::map CLKernelLibraryEx::_program_source_map { "activation_float_helpers.h", #include "./cl_kernels/activation_float_helpers.hembed" - }, - { - "arg_min_max_ex.cl", -#include "./cl_kernels/arg_min_max_ex.clembed" }, { "binary_logical_op.cl", diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl deleted file mode 100644 index 135cacf596f..00000000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl +++ /dev/null @@ -1,564 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(FLOAT_DATA_TYPE) -#define ISGREATER(x, y) isgreater(x, y) -#define ISLESS(x, y) isless(x, y) -#else // !FLOAT_DATA_TYPE -#if defined(WIDTH) -#define ISGREATER(x, y) (x > y) ? 1 : 0 -#define ISLESS(x, y) (x < y) ? 1 : 0 -#else // !defined(WIDTH) -#define ISGREATER(x, y) \ - select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x > y) -#define ISLESS(x, y) \ - select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x < y) -#endif // defined(WIDTH) -#endif // defined(FLOAT_DATA_TYPE) - -#if defined(ARG_MAX) -#define CONDITION_TO_USE(x, y) ISGREATER(x, y) -#elif defined(ARG_MIN) -#define CONDITION_TO_USE(x, y) ISLESS(x, y) -#else // !(defined(ARG_MAX) || defined(ARG_MIN)) -#error "Unsupported reduction operation!" -#endif // defined(ARG_MAX) - -#if defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT) -#if defined(WIDTH) -#if defined(ARG_MIN) -#if defined(PREV_OUTPUT) -/** Find index minimum value of a vector - * - * @param[in] input Pointer to the first value. - * - * @return index of the vector. - */ -inline DATA_TYPE_OUTPUT arg_idx_min_prev_out(__global const DATA_TYPE *input, - __global const DATA_TYPE_OUTPUT *prev_res, - const int x_idx) -{ - int end_elem = (x_idx + 1) * 16; - if (end_elem > WIDTH) - { - end_elem = WIDTH - x_idx * 16; - } - DATA_TYPE_OUTPUT res = prev_res[0]; - for (int x_v = 1; x_v < end_elem; ++x_v) - { - res = select(res, prev_res[x_v], *(input + prev_res[x_v]) < *(input + res)); - } - return res; -} -#else // !defined(PREV_OUTPUT) -/** Find index minimum value of a vector - * - * @param[in] input Pointer to the first value. - * - * @return index of the vector. - */ -inline DATA_TYPE_OUTPUT arg_idx_min(__global const DATA_TYPE *input, const int x_idx) -{ -#if WIDTH < 16 - DATA_TYPE_OUTPUT res = 0; - for (DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v) - { - res = select(res, x_v, *(input + x_v) < *(input + res)); - } - return res; -#else // WIDTH >= 16 - int x_elem = x_idx * 16; - const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH); - x_elem -= x_goback; - - VEC_DATA_TYPE(DATA_TYPE, 16) - in = vload16(0, input - x_goback); - VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) - res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - - VEC_DATA_TYPE(DATA_TYPE_SELECT, 8) - idx_sel = (in.s01234567 <= in.s89abcdef); - in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel); - res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8)); - - idx_sel.s0123 = - (in.s0123 < in.s4567) || - (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); - in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123); - res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4)); - - idx_sel.s01 = - (in.s01 < in.s23) || - (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); - in.s01 = select(in.s23, in.s01, idx_sel.s01); - res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2)); - - idx_sel.s0 = (in.s0 < in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT)); - res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int)); - - return res.s0 + x_elem; -#endif // WIDTH < 16 -} -#endif // defined(PREV_OUTPUT) -#endif // defined(ARG_MIN) -#if defined(ARG_MAX) -#if defined(PREV_OUTPUT) -/** Find index maximum value of a vector - * - * @param[in] input Pointer to the first value. - * - * @return index of the vector. - */ -inline DATA_TYPE_OUTPUT arg_idx_max_prev_out(__global const DATA_TYPE *input, - __global const DATA_TYPE_OUTPUT *prev_res, - const int x_idx) -{ - int end_elem = (x_idx + 1) * 16; - if (end_elem > WIDTH) - { - end_elem = WIDTH - x_idx * 16; - } - DATA_TYPE_OUTPUT res = prev_res[0]; - unsigned int res_int = res; - DATA_TYPE_OUTPUT condition_check2; - for (int x_v = 1; x_v < end_elem; ++x_v) - { - int i1 = prev_res[x_v]; - condition_check2 = *(input + i1) > *(input + res_int); - res = select(res, prev_res[x_v], condition_check2); - } - return res; -} -#else // !defined(PREV_OUTPUT) -/** Find index maximum value of a vector - * - * @param[in] input Pointer to the first value. - * - * @return index of the vector. - */ -inline DATA_TYPE_OUTPUT arg_idx_max(__global const DATA_TYPE *input, const int x_idx) -{ -#if WIDTH < 16 - DATA_TYPE_OUTPUT res = 0; - unsigned int i1; - unsigned int i2; - DATA_TYPE_OUTPUT condition_check; - for (DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v) - { - i1 = x_v; - i2 = res; - condition_check = *(input + i1) > *(input + i2); - res = select(res, x_v, condition_check); - } - return res; -#else // WIDTH >= 16 - int x_elem = x_idx * 16; - const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH); - x_elem -= x_goback; - - VEC_DATA_TYPE(DATA_TYPE, 16) - in = vload16(0, input - x_goback); - VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) - res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - - VEC_DATA_TYPE(DATA_TYPE_SELECT, 8) - idx_sel = (in.s01234567 >= in.s89abcdef); - in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel); - res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8)); - - idx_sel.s0123 = - (in.s0123 > in.s4567) || - (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); - in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123); - res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4)); - - idx_sel.s01 = - (in.s01 > in.s23) || - (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); - in.s01 = select(in.s23, in.s01, idx_sel.s01); - res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2)); - - idx_sel.s0 = (in.s0 > in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT)); - res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int)); - - return res.s0 + x_elem; -#endif // WIDTH < 16 -} -#endif // defined(PREV_OUTPUT) -#endif // defined(ARG_MAX) - -/** This kernel performs parallel reduction given an operation on x-axis. - * - * @note In case the results of previous stages are passed the flag PREV_OUTPUT has to be passed - * using -DPREV_OUTPUT - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. - * -DDATA_TYPE_OUTPUT=uint - * @note The arg_max flag must be passed at compile time using -DARG_MAX if we want to compute the - * ArgMax - * @note The arg_min flag must be passed at compile time using -DARG_MIN if we want to compute the - * ArgMin - * - * @param[in] src_ptr Pointer to the source tensor. Supported data - * types: S32/F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension - * (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension - * (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the - * source tensor - * @param[in] prev_res_ptr (Optional) Pointer to previous results - * tensor. Supported data types: U32/S32 - * @param[in] prev_res_stride_x (Optional) Stride of the output tensor in X - * dimension (in bytes) - * @param[in] prev_res_step_x (Optional) prev_res_stride_x * number of - * elements along X processed per workitem(in bytes) - * @param[in] prev_res_stride_y (Optional) Stride of the output tensor in Y - * dimension (in bytes) - * @param[in] prev_res_step_y (Optional) prev_res_stride_y * number of - * elements along Y processed per workitem(in bytes) - * @param[in] prev_res_offset_first_element_in_bytes (Optional) The offset of the first element - * in the previous results tensor - * @param[in] partial_res_ptr The local buffer to hold partial result - * values. Supported data types: U32/S32 - * @param[in] partial_res_stride_x Stride of the output tensor in X dimension - * (in bytes) - * @param[in] partial_res_step_x partial_res_stride_x * number of elements - * along X processed per workitem(in bytes) - * @param[in] partial_res_stride_y Stride of the output tensor in Y dimension - * (in bytes) - * @param[in] partial_res_step_y partial_res_stride_y * number of elements - * along Y processed per workitem(in bytes) - * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the - * source tensor - * @param[in] local_results Local buffer for storing the partial result - */ -__kernel void arg_min_max_ex_x(IMAGE_DECLARATION(src), -#if defined(PREV_OUTPUT) - IMAGE_DECLARATION(prev_res), -#endif // defined(PREV_OUTPUT) - IMAGE_DECLARATION(partial_res), - __local DATA_TYPE_OUTPUT *local_results) -{ -#if defined(PREV_OUTPUT) - Image src = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src); - Image prev_res = CONVERT_TO_IMAGE_STRUCT(prev_res); -#else // !defined(PREV_OUTPUT) - Image src = CONVERT_TO_IMAGE_STRUCT(src); -#endif // defined(PREV_OUTPUT) - Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res); - - unsigned int lsize = get_local_size(0); - unsigned int lid = get_local_id(0); - - const uint x_idx = get_global_id(0); - const uint y_idx = get_global_id(1); - const __global DATA_TYPE *src_in_row = - (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + y_idx * src_step_y); - - for (unsigned int y = 0; y < get_local_size(1); ++y) - { -#if defined(ARG_MAX) -#if defined(PREV_OUTPUT) - local_results[lid] = - arg_idx_max_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); -#else // !defined(PREV_OUTPUT) - local_results[lid] = arg_idx_max((__global DATA_TYPE *)offset(&src, 0, y), x_idx); -#endif // defined(PREV_OUTPUT) -#else // defined(ARG_MIN) -#if defined(PREV_OUTPUT) - local_results[lid] = - arg_idx_min_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); -#else // !defined(PREV_OUTPUT) - local_results[lid] = arg_idx_min((__global DATA_TYPE *)offset(&src, 0, y), x_idx); -#endif // defined(PREV_OUTPUT) -#endif // defined(ARG_MAX) || defined(ARG_MIN) - - barrier(CLK_LOCAL_MEM_FENCE); - - // Looking for the next highest power of 2 (maximum value of lsize is 8) - unsigned int middle = lsize - 1; - middle |= middle >> 1; - middle |= middle >> 2; - middle += 1; - // Perform parallel reduction - DATA_TYPE_OUTPUT condition_check3; - for (unsigned int i = middle; i > 0; i >>= 1) - { - if (lid < i && lid + i < lsize) - { - DATA_TYPE tmp0 = *(src_in_row + local_results[lid]); - DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]); -#if defined(ARG_MAX) - condition_check3 = - ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1); - local_results[lid] = select(local_results[lid], local_results[lid + i], condition_check3); -#else // defined(ARG_MIN) - local_results[lid] = select( - local_results[lid], local_results[lid + i], - ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1)); -#endif // defined(ARG_MAX) || defined(ARG_MIN) - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (lid == 0) - { - ((__global DATA_TYPE_OUTPUT *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0]; - } - } -} -#endif // defined(WIDTH) - -#if defined(HEIGHT) -/** This kernel performs reduction on y-axis. - * - * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. - * -DDATA_TYPE=float - * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. - * -DDATA_TYPE_OUTPUT=uint - * @note The data type of the select results must be passed at compile time using - * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int - * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128 - * - * @param[in] src_ptr Pointer to the source tensor. Supported data - * types: S32/F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in - * bytes) - * @param[in] src_step_x src_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in - * bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source - * tensor - * @param[in] output_ptr The local buffer to hold sumed values. Supported - * data types: U32/S32 - * @param[in] output_stride_x Stride of the output tensor in X dimension (in - * bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the output tensor in Y dimension (in - * bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source - * tensor - */ -__kernel void arg_min_max_ex_y(IMAGE_DECLARATION(src), IMAGE_DECLARATION(output)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image output = CONVERT_TO_IMAGE_STRUCT(output); - - VEC_DATA_TYPE(DATA_TYPE, 16) - res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE, 16)); - - VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) - indx = 0; - for (unsigned int y = 1; y < HEIGHT; ++y) - { - VEC_DATA_TYPE(DATA_TYPE, 16) - in = - CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE, 16)); - - VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) - cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)); - indx = select(indx, y, cond_conv); - res = select(res, in, CONDITION_TO_USE(in, res)); - } - - // Store result - vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr); -} -#endif // defined(HEIGHT) - -#if defined(DEPTH) -/** This kernel performs reduction on z-axis. - * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The data type of the select results must be passed at compile time using - * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int - * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data - * types: S32/F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in - * bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in - * bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source - * tensor - * @param[in] output_ptr The local buffer to hold sumed values. Supported - * data types: U32/S32 - * @param[in] output_stride_x Stride of the output tensor in X dimension (in - * bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the output tensor in Y dimension (in - * bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the output tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source - * tensor - */ -__kernel void arg_min_max_ex_z(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_DATA_TYPE(DATA_TYPE, 16) - res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)), - VEC_DATA_TYPE(DATA_TYPE, 16)); - - VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) - indx = 0; - for (DATA_TYPE_OUTPUT z = 1; z < DEPTH; ++z) - { - VEC_DATA_TYPE(DATA_TYPE, 16) - in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), - VEC_DATA_TYPE(DATA_TYPE, 16)); - - VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) - cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)); - indx = select(indx, z, cond_conv); - res = select(res, in, CONDITION_TO_USE(in, res)); - } - - // Store result - vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr); -} -#endif /* defined(DEPTH) */ - -#if defined(BATCH) && defined(DEPTH) -/** This kernel performs reduction on w-axis. - * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The data type of the select results must be passed at compile time using - * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int - * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128 - * @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data - * types: S32/F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in - * bytes) - * @param[in] input_step_x input_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in - * bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in - * bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] input_stride_w Stride of the source tensor in W dimension (in - * bytes) - * @param[in] input_step_w input_stride_w * number of elements along W - * processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source - * tensor - * @param[in] output_ptr The local buffer to hold sumed values. Supported - * data types: U32/S32 - * @param[in] output_stride_x Stride of the output tensor in X dimension (in - * bytes) - * @param[in] output_step_x output_stride_x * number of elements along X - * processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the output tensor in Y dimension (in - * bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y - * processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the output tensor in Z dimension (in - * bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z - * processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the output tensor in W dimension (in - * bytes) - * @param[in] output_step_w output_stride_w * number of elements along W - * processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source - * tensor - */ -__kernel void arg_min_max_ex_w(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) -{ - Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH); - Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH); - - VEC_DATA_TYPE(DATA_TYPE, 16) - res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)), - VEC_DATA_TYPE(DATA_TYPE, 16)); - - VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) - indx = 0; - for (DATA_TYPE_OUTPUT w = 1; w < BATCH; ++w) - { - VEC_DATA_TYPE(DATA_TYPE, 16) - in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), - VEC_DATA_TYPE(DATA_TYPE, 16)); - - VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) - cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)); - indx = select(indx, w, cond_conv); - res = select(res, in, CONDITION_TO_USE(in, res)); - } - - // Store result - vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr); -} -#endif /* defined(BATCH) && defined(DEPTH) */ -#endif /* defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT) */ diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp deleted file mode 100644 index 9874097390f..00000000000 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp +++ /dev/null @@ -1,332 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -constexpr unsigned int vector_size = 16; - -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *prev_output, - const ITensorInfo *output, unsigned int axis, ReductionOperation op) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, - DataType::QASYMM8_SIGNED, DataType::S32, - DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && - op != ReductionOperation::ARG_IDX_MIN, - "Only ARG_IDX_MAX and ARG_IDX_MIN are supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, - "Reduction axis greater than max number of dimensions"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); - - if (output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, - DataType::S64); - } - if (prev_output != nullptr && prev_output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(prev_output, 1, DataType::U32, - DataType::S32, DataType::S64); - if (output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(prev_output, output); - } - } - - return Status{}; -} - -std::tuple validate_and_configure_window(ITensorInfo *input, - ITensorInfo *prev_output, - ITensorInfo *output, unsigned int axis, - ReductionOperation op) -{ - ARM_COMPUTE_UNUSED(op); - // Output tensor auto initialization if not yet initialized - TensorShape output_shape{input->tensor_shape()}; - output_shape.set(axis, 1); - DataType output_data_type = (prev_output != nullptr) ? (prev_output->data_type()) : DataType::S32; - auto_init_if_empty(*output, input->clone() - ->set_tensor_shape(output_shape) - .set_data_type(output_data_type) - .reset_padding() - .set_is_resizable(true)); - - Window win = - calculate_max_window((prev_output != nullptr) ? (*prev_output) : (*input), Steps(vector_size)); - bool window_changed = false; - - switch (axis) - { - case 0: - { - ITensorInfo *input_tensor_access = prev_output != nullptr ? prev_output : input; - AccessWindowStatic input_access(input_tensor_access, 0, 0, - static_cast(input_tensor_access->dimension(0)), 1); - AccessWindowHorizontal output_access(output, 0, 1); - window_changed = update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - } - break; - case 1: - case 2: - case 3: - { - AccessWindowHorizontal input_access(input, 0, vector_size); - AccessWindowHorizontal output_access(output, 0, vector_size); - window_changed = update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - } - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - } - - Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; - return std::make_tuple(err, win); -} -} // namespace - -CLArgMinMaxLayerKernelEx::CLArgMinMaxLayerKernelEx() - : _input(nullptr), _prev_output(nullptr), _output(nullptr), _reduction_axis(0), - _op(ReductionOperation::ARG_IDX_MAX) -{ -} - -void CLArgMinMaxLayerKernelEx::configure(const ICLTensor *input, const ICLTensor *prev_output, - ICLTensor *output, unsigned int axis, - ReductionOperation op) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, - output->info(), axis, op)); - auto win_config = validate_and_configure_window( - input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, output->info(), axis, - op); - ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); - - _input = input; - _prev_output = prev_output; - _output = output; - _reduction_axis = axis; - _op = op; - - // Set build options - CLBuildOptions build_opts; - - build_opts.add_option_if(_prev_output != nullptr, "-DPREV_OUTPUT"); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE"); - build_opts.add_option_if_else(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX", "-DARG_MIN"); - build_opts.add_option("-DDATA_TYPE_OUTPUT=" + - get_cl_type_from_data_type(output->info()->data_type())); - build_opts.add_option("-DDATA_TYPE_SELECT=" + - get_cl_signed_type_from_element_size(input->info()->element_size())); - - // Create kernel - cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange(); - std::string kernel_axis_name; - switch (axis) - { - case 0: - { - const ICLTensor *input_for_width = prev_output != nullptr ? _prev_output : _input; - build_opts.add_option("-DWIDTH=" + - support::cpp11::to_string(input_for_width->info()->dimension(0))); - - kernel_axis_name = "x"; - lws_hint = create_lws_hint_parallel_implementations(input_for_width->info()->dimension(0), - vector_size); - } - break; - case 1: - build_opts.add_option("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1))); - kernel_axis_name = "y"; - break; - case 2: - build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2))); - kernel_axis_name = "z"; - break; - case 3: - build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2))); - build_opts.add_option("-DBATCH=" + support::cpp11::to_string(input->info()->dimension(3))); - kernel_axis_name = "w"; - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - } - _kernel = static_cast(CLKernelLibraryEx::get().create_kernel( - "arg_min_max_ex_" + kernel_axis_name, build_opts.options())); - - // Configure kernel window - ICLKernel::configure_internal(std::get<1>(win_config), lws_hint); -} - -Status CLArgMinMaxLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *prev_output, - const ITensorInfo *output, unsigned int axis, - ReductionOperation op) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, prev_output, output, axis, op)); - ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( - input->clone().get(), (prev_output != nullptr) ? prev_output->clone().get() : nullptr, - output->clone().get(), axis, op))); - return Status{}; -} - -void CLArgMinMaxLayerKernelEx::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - switch (_reduction_axis) - { - case 0: - { - // Set out window - Window out_window(window); - out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); - - // Get first input and output slices - Window in_slice = window.first_slice_window_2D(); - Window out_slice = out_window.first_slice_window_2D(); - - // Reshape window - const unsigned int num_tensors = _prev_output != nullptr ? 3 : 2; - - // Set local sums buffer - unsigned int local_res_size = lws_hint()[0] * _output->info()->element_size(); - _kernel.setArg(num_arguments_per_2D_tensor() * num_tensors, local_res_size, nullptr); - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, in_slice); - if (_prev_output != nullptr) - { - add_2D_tensor_argument(idx, _prev_output, in_slice); - } - add_2D_tensor_argument(idx, _output, out_slice); - enqueue(queue, *this, in_slice, lws_hint()); - } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); - } - break; - case 1: - { - // Get first input and output slices - Window window_in{window}; - window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), - _input->info()->dimension(1))); - Window in_slice = window_in.first_slice_window_2D(); - Window out_slice = window.first_slice_window_2D(); - - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, in_slice); - add_2D_tensor_argument(idx, _output, out_slice); - enqueue(queue, *this, in_slice, lws_hint()); - } while (window_in.slide_window_slice_2D(in_slice) && - window.slide_window_slice_2D(out_slice)); - } - break; - case 2: - { - // Get first input and output slices - Window window_in{window}; - window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), - _input->info()->dimension(2))); - Window in_slice = window_in.first_slice_window_3D(); - Window out_slice = window.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, in_slice); - add_3D_tensor_argument(idx, _output, out_slice); - enqueue(queue, *this, in_slice, lws_hint()); - } while (window_in.slide_window_slice_3D(in_slice) && - window.slide_window_slice_3D(out_slice)); - } - break; - case 3: - { - // Get first input and output slices - Window window_in{window}; - window_in.set(3, Window::Dimension(0, 1, 1)); - Window in_slice = window_in.first_slice_window_4D(); - Window out_slice = window.first_slice_window_4D(); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, in_slice); - add_4D_tensor_argument(idx, _output, out_slice); - enqueue(queue, *this, in_slice, lws_hint()); - } while (window_in.slide_window_slice_4D(in_slice) && - window.slide_window_slice_4D(out_slice)); - } - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - } -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp deleted file mode 100644 index 6b9b0d4b46d..00000000000 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/runtime/Utils.h" - -namespace arm_compute -{ -CLArgMinMaxLayerEx::CLArgMinMaxLayerEx(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(), - _reduction_kernels_vector(), _reshape_kernel(), _num_of_stages(), _reduction_axis() -{ -} - -Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, - const ReductionOperation &op) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && - op != ReductionOperation::ARG_IDX_MIN, - "Invalid reduction operation"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast(TensorShape::num_max_dimensions), - "Reduction axis greater than max number of dimensions"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); - const unsigned int num_of_stages = - utils::calculate_number_of_stages_only_x_axis(input->dimension(0), axis); - - DataType output_data_type = DataType::S32; - TensorInfo not_reshaped_output; - const auto input_num_channles = input->num_channels(); - const auto input_qinfo = input->quantization_info(); - - if (output->total_size() != 0) - { - output_data_type = output->data_type(); - const TensorInfo expected_output_shape = - output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape( - input->tensor_shape(), axis, false)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); - } - - auto shape_before_reshape = input->tensor_shape(); - shape_before_reshape.set(axis, 1); - auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, - int num_channels, QuantizationInfo qinfo) { - ti.set_data_type(data_type) - .set_tensor_shape(shape) - .set_num_channels(num_channels) - .set_quantization_info(qinfo); - }; - - initialize_tensorinfo(not_reshaped_output, shape_before_reshape, output_data_type, - input_num_channles, input_qinfo); - - if (num_of_stages == 1) - { - ARM_COMPUTE_RETURN_ON_ERROR( - CLArgMinMaxLayerKernelEx::validate(input, nullptr, ¬_reshaped_output, axis, op)); - } - else - { - // Create temporary tensor infos - std::vector sums_vector(num_of_stages - 1); - - // Create intermediate tensor info - TensorShape shape{input->tensor_shape()}; - - for (unsigned int i = 0; i < num_of_stages - 1; i++) - { - shape.set(0, ceil(shape.x() / 128.f)); - sums_vector[i].set_data_type(input->data_type()); - sums_vector[i].set_tensor_shape(shape); - sums_vector[i].set_num_channels(input->num_channels()); - } - - // Validate ReductionOperation only on first kernel - ARM_COMPUTE_RETURN_ON_ERROR( - CLArgMinMaxLayerKernelEx::validate(input, nullptr, &sums_vector[0], axis, op)); - - // Validate ReductionOperation on intermediate stages - for (unsigned int i = 1; i < num_of_stages - 1; ++i) - { - ARM_COMPUTE_RETURN_ON_ERROR( - CLArgMinMaxLayerKernelEx::validate(input, &sums_vector[i - 1], &sums_vector[i], axis, op)); - } - - // Validate ReductionOperation on the last stage - const unsigned int last_stage = num_of_stages - 1; - ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate( - input, &sums_vector[last_stage - 1], ¬_reshaped_output, axis, op)); - } - ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(¬_reshaped_output, output)); - return Status{}; -} - -void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor *output, - const ReductionOperation &op) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - _num_of_stages = utils::calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis); - _reduction_axis = axis; - - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape( - input->info()->tensor_shape(), axis, false); - DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN) - ? DataType::S32 - : output->info()->data_type(); - auto_init_if_empty(*output->info(), input->info() - ->clone() - ->set_tensor_shape(output_shape) - .set_data_type(output_data_type) - .reset_padding() - .set_is_resizable(true)); - - // Configure reduction operation kernels - _reduction_kernels_vector.resize(_num_of_stages); - - _memory_group.manage(&_not_reshaped_output); - // Create temporary tensors - if (_num_of_stages == 1) - { - // Force an early initialization for int64 output type - TensorShape output_shape{input->info()->tensor_shape()}; - output_shape.set(axis, 1); - auto_init_if_empty(*_not_reshaped_output.info(), input->info() - ->clone() - ->set_tensor_shape(output_shape) - .set_data_type(output_data_type) - .reset_padding() - .set_is_resizable(true)); - _not_reshaped_output.info()->set_tensor_shape(output_shape); - _reduction_kernels_vector[0].configure(input, nullptr, &_not_reshaped_output, axis, op); - } - else - { - _results_vector.resize(_num_of_stages - 1); - TensorShape shape{input->info()->tensor_shape()}; - for (unsigned int i = 0; i < _num_of_stages - 1; i++) - { - shape.set(0, ceil(shape.x() / 128.f)); - _results_vector[i].allocator()->init( - input->info()->clone()->set_tensor_shape(shape).set_data_type(output_data_type)); - } - - // Apply ReductionOperation only on first kernel - _memory_group.manage(&_results_vector[0]); - _reduction_kernels_vector[0].configure(input, nullptr, &_results_vector[0], axis, op); - - // Apply ReductionOperation on intermediate stages - for (unsigned int i = 1; i < _num_of_stages - 1; ++i) - { - _memory_group.manage(&_results_vector[i]); - _reduction_kernels_vector[i].configure(input, &_results_vector[i - 1], &_results_vector[i], - axis, op); - _results_vector[i - 1].allocator()->allocate(); - } - - // Apply ReductionOperation on the last stage - const unsigned int last_stage = _num_of_stages - 1; - _reduction_kernels_vector[last_stage].configure(input, &_results_vector[last_stage - 1], - &_not_reshaped_output, axis, op); - _results_vector[last_stage - 1].allocator()->allocate(); - } - _reshape_kernel.configure(CLKernelLibrary::get().get_compile_context(), &_not_reshaped_output, - output); - _not_reshaped_output.allocator()->allocate(); -} - -void CLArgMinMaxLayerEx::run() -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - for (unsigned int i = 0; i < _num_of_stages; ++i) - { - CLScheduler::get().enqueue(_reduction_kernels_vector[i], false); - } - _reshape_kernel.run(); -} -} // namespace arm_compute diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.cc b/runtime/onert/backend/acl_cl/KernelGenerator.cc index b63326c24cb..69378a0ee00 100644 --- a/runtime/onert/backend/acl_cl/KernelGenerator.cc +++ b/runtime/onert/backend/acl_cl/KernelGenerator.cc @@ -1274,7 +1274,7 @@ void KernelGenerator::visit(const ir::operation::ArgMinMax &node) auto acl_axis = acl_common::ToARMComputeAxis(ifm_rank, axis_value).value(); auto reduce_type = node.param().is_arg_max ? ::arm_compute::ReductionOperation::ARG_IDX_MAX : ::arm_compute::ReductionOperation::ARG_IDX_MIN; - auto fn = acl_common::generateLayer( + auto fn = acl_common::generateLayer( ifm_tensor->handle(), acl_axis, ofm_tensor->handle(), reduce_type); _return_fn = asAclFunction(std::move(fn));