From 28dc17314d2d670186b1bee62f02be65826f6bc0 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Fri, 29 Nov 2024 14:12:21 +0000 Subject: [PATCH] Revert "Add new launch property to support work_group_scratch_memory" This reverts commit 7222f79b160681434250bac74f3aae0e4abc46f0. --- include/ur_api.h | 15 +-- include/ur_ddi.h | 1 - include/ur_print.hpp | 16 --- scripts/core/exp-launch-properties.yml | 13 +- source/adapters/cuda/enqueue.cpp | 111 +++++++----------- source/adapters/cuda/kernel.hpp | 45 ++----- source/adapters/level_zero/queue.cpp | 5 +- .../level_zero/ur_interface_loader.hpp | 4 +- source/adapters/level_zero/v2/queue_api.cpp | 6 +- source/adapters/level_zero/v2/queue_api.hpp | 6 +- .../v2/queue_immediate_in_order.cpp | 4 +- .../v2/queue_immediate_in_order.hpp | 4 +- source/adapters/mock/ur_mockddi.cpp | 23 +--- source/loader/layers/tracing/ur_trcddi.cpp | 25 ++-- source/loader/layers/validation/ur_valddi.cpp | 13 +- source/loader/ur_ldrddi.cpp | 35 +----- source/loader/ur_libapi.cpp | 12 +- source/ur_api.cpp | 4 - .../launch_properties.cpp | 4 +- 19 files changed, 94 insertions(+), 252 deletions(-) diff --git a/include/ur_api.h b/include/ur_api.h index 1de876cb7f..eb8b07221c 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -9560,7 +9560,6 @@ typedef enum ur_exp_launch_property_id_t { UR_EXP_LAUNCH_PROPERTY_ID_IGNORE = 0, ///< The property has no effect UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE = 1, ///< Whether to launch a cooperative kernel UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION = 2, ///< work-group cluster dimensions - UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY = 3, ///< Implicit work group memory allocation /// @cond UR_EXP_LAUNCH_PROPERTY_ID_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -9574,12 +9573,10 @@ typedef enum ur_exp_launch_property_id_t { /// _Analogues_ /// - **CUlaunchAttributeValue** typedef union ur_exp_launch_property_value_t { - uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each - ///< value must be a divisor of the corresponding global work-size - ///< dimension (in units of work-group). - int cooperative; ///< [in] non-zero value indicates a cooperative kernel - size_t workgroup_mem_size; ///< [in] non-zero value indicates the amount of work group memory to - ///< allocate in bytes + uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each + ///< value must be a divisor of the corresponding global work-size + ///< dimension (in units of work-group). + int cooperative; ///< [in] non-zero value indicates a cooperative kernel } ur_exp_launch_property_value_t; @@ -9620,7 +9617,6 @@ typedef struct ur_exp_launch_property_t { /// + NULL == hQueue /// + NULL == hKernel /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == pGlobalWorkOffset` /// + `NULL == pGlobalWorkSize` /// + `NULL == launchPropList` /// + NULL == pGlobalWorkSize @@ -9649,8 +9645,6 @@ urEnqueueKernelLaunchCustomExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and ///< work-group work-items - const size_t *pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the - ///< offset used to calculate the global ID of a work-item const size_t *pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the ///< number of global work-items in workDim that will execute the kernel ///< function @@ -11560,7 +11554,6 @@ typedef struct ur_enqueue_kernel_launch_custom_exp_params_t { ur_queue_handle_t *phQueue; ur_kernel_handle_t *phKernel; uint32_t *pworkDim; - const size_t **ppGlobalWorkOffset; const size_t **ppGlobalWorkSize; const size_t **ppLocalWorkSize; uint32_t *pnumPropsInLaunchPropList; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index cdf90eda6d..40a6c5c269 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -1467,7 +1467,6 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchCustomExp_t)( uint32_t, const size_t *, const size_t *, - const size_t *, uint32_t, const ur_exp_launch_property_t *, uint32_t, diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 190d3f9cd5..8888a74f91 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -10397,9 +10397,6 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_exp_launch_property_id case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: os << "UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION"; break; - case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: - os << "UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY"; - break; default: os << "unknown enumerator"; break; @@ -10436,13 +10433,6 @@ inline ur_result_t printUnion( os << (params.cooperative); - break; - case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: - - os << ".workgroup_mem_size = "; - - os << (params.workgroup_mem_size); - break; default: os << ""; @@ -15110,12 +15100,6 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct os << *(params->pworkDim); - os << ", "; - os << ".pGlobalWorkOffset = "; - - ur::details::printPtr(os, - *(params->ppGlobalWorkOffset)); - os << ", "; os << ".pGlobalWorkSize = "; diff --git a/scripts/core/exp-launch-properties.yml b/scripts/core/exp-launch-properties.yml index ca28421815..9e66e9ea06 100644 --- a/scripts/core/exp-launch-properties.yml +++ b/scripts/core/exp-launch-properties.yml @@ -29,8 +29,6 @@ etors: desc: "Whether to launch a cooperative kernel" - name: CLUSTER_DIMENSION desc: "work-group cluster dimensions" - - name: WORK_GROUP_MEMORY - desc: "Implicit work group memory allocation" --- #-------------------------------------------------------------------------- type: union desc: "Specifies a launch property value" @@ -47,10 +45,6 @@ members: name: cooperative desc: "[in] non-zero value indicates a cooperative kernel" tag: $X_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE - - type: size_t - name: workgroup_mem_size - desc: "[in] non-zero value indicates the amount of work group memory to allocate in bytes" - tag: $X_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY --- #-------------------------------------------------------------------------- type: struct desc: "Kernel launch property" @@ -88,9 +82,6 @@ params: - type: uint32_t name: workDim desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items" - - type: "const size_t*" - name: pGlobalWorkOffset - desc: "[in] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item" - type: const size_t* name: pGlobalWorkSize desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function" @@ -106,10 +97,10 @@ params: - type: uint32_t name: numEventsInWaitList desc: "[in] size of the event wait list" - - type: const $x_event_handle_t* + - type: const ur_event_handle_t* name: phEventWaitList desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. " - - type: $x_event_handle_t* + - type: ur_event_handle_t* name: phEvent desc: "[out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array." returns: diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index a3fc58dfb9..fc3d0220e8 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -422,13 +422,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( phEventWaitList, phEvent); } -static ur_result_t -enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, - uint32_t workDim, const size_t *pGlobalWorkOffset, - const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent, size_t WorkGroupMemory) { +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { // Preconditions UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(), UR_RESULT_ERROR_INVALID_KERNEL); @@ -446,9 +444,6 @@ enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; size_t BlocksPerGrid[3] = {1u, 1u, 1u}; - // Set work group memory so we can compute the whole memory requirement - if (WorkGroupMemory) - hKernel->setWorkGroupMemory(WorkGroupMemory); uint32_t LocalSize = hKernel->getLocalSize(); CUfunction CuFunc = hKernel->get(); @@ -511,17 +506,6 @@ enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( - ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numEventsInWaitList, phEventWaitList, phEvent, - /*WorkGroupMemory=*/0); -} - UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, @@ -532,9 +516,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( coop_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE; coop_prop.value.cooperative = 1; return urEnqueueKernelLaunchCustomExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, 1, &coop_prop, numEventsInWaitList, phEventWaitList, - phEvent); + hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, 1, + &coop_prop, numEventsInWaitList, phEventWaitList, phEvent); } return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, @@ -543,29 +526,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, + const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numPropsInLaunchPropList, const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - size_t WorkGroupMemory = [&]() -> size_t { - const ur_exp_launch_property_t *WorkGroupMemoryProp = std::find_if( - launchPropList, launchPropList + numPropsInLaunchPropList, - [](const ur_exp_launch_property_t &Prop) { - return Prop.id == UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY; - }); - if (WorkGroupMemoryProp != launchPropList + numPropsInLaunchPropList) - return WorkGroupMemoryProp->value.workgroup_mem_size; - return 0; - }(); - - if (numPropsInLaunchPropList == 0 || - (WorkGroupMemory && numPropsInLaunchPropList == 1)) { - return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numEventsInWaitList, phEventWaitList, phEvent, - WorkGroupMemory); + if (numPropsInLaunchPropList == 0) { + urEnqueueKernelLaunch(hQueue, hKernel, workDim, nullptr, pGlobalWorkSize, + pLocalWorkSize, numEventsInWaitList, phEventWaitList, + phEvent); } #if CUDA_VERSION >= 11080 // Preconditions @@ -578,8 +548,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - std::vector launch_attribute; - launch_attribute.reserve(numPropsInLaunchPropList); + std::vector launch_attribute(numPropsInLaunchPropList); // Early exit for zero size kernel if (*pGlobalWorkSize == 0) { @@ -592,35 +561,40 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; size_t BlocksPerGrid[3] = {1u, 1u, 1u}; - // Set work group memory so we can compute the whole memory requirement - if (WorkGroupMemory) - hKernel->setWorkGroupMemory(WorkGroupMemory); uint32_t LocalSize = hKernel->getLocalSize(); CUfunction CuFunc = hKernel->get(); for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) { switch (launchPropList[i].id) { case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: { - auto &attr = launch_attribute.emplace_back(); - attr.id = CU_LAUNCH_ATTRIBUTE_IGNORE; + launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_IGNORE; break; } case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: { - auto &attr = launch_attribute.emplace_back(); - attr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; + + launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; // Note that cuda orders from right to left wrt SYCL dimensional order. if (workDim == 3) { - attr.value.clusterDim.x = launchPropList[i].value.clusterDim[2]; - attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1]; - attr.value.clusterDim.z = launchPropList[i].value.clusterDim[0]; + launch_attribute[i].value.clusterDim.x = + launchPropList[i].value.clusterDim[2]; + launch_attribute[i].value.clusterDim.y = + launchPropList[i].value.clusterDim[1]; + launch_attribute[i].value.clusterDim.z = + launchPropList[i].value.clusterDim[0]; } else if (workDim == 2) { - attr.value.clusterDim.x = launchPropList[i].value.clusterDim[1]; - attr.value.clusterDim.y = launchPropList[i].value.clusterDim[0]; - attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2]; + launch_attribute[i].value.clusterDim.x = + launchPropList[i].value.clusterDim[1]; + launch_attribute[i].value.clusterDim.y = + launchPropList[i].value.clusterDim[0]; + launch_attribute[i].value.clusterDim.z = + launchPropList[i].value.clusterDim[2]; } else { - attr.value.clusterDim.x = launchPropList[i].value.clusterDim[0]; - attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1]; - attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2]; + launch_attribute[i].value.clusterDim.x = + launchPropList[i].value.clusterDim[0]; + launch_attribute[i].value.clusterDim.y = + launchPropList[i].value.clusterDim[1]; + launch_attribute[i].value.clusterDim.z = + launchPropList[i].value.clusterDim[2]; } UR_CHECK_ERROR(cuFuncSetAttribute( @@ -629,12 +603,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( break; } case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: { - auto &attr = launch_attribute.emplace_back(); - attr.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE; - attr.value.cooperative = launchPropList[i].value.cooperative; - break; - } - case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: { + launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE; + launch_attribute[i].value.cooperative = + launchPropList[i].value.cooperative; break; } default: { @@ -647,8 +618,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( // using the standard UR_CHECK_ERROR if (ur_result_t Ret = setKernelParams(hQueue->getContext(), hQueue->Device, workDim, - pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid); + nullptr, pGlobalWorkSize, pLocalWorkSize, hKernel, + CuFunc, ThreadsPerBlock, BlocksPerGrid); Ret != UR_RESULT_SUCCESS) return Ret; @@ -696,7 +667,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( launch_config.sharedMemBytes = LocalSize; launch_config.hStream = CuStream; launch_config.attrs = &launch_attribute[0]; - launch_config.numAttrs = launch_attribute.size(); + launch_config.numAttrs = numPropsInLaunchPropList; UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc, const_cast(ArgIndices.data()), diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp index c3b9fae09a..7ad20a4f0e 100644 --- a/source/adapters/cuda/kernel.hpp +++ b/source/adapters/cuda/kernel.hpp @@ -65,8 +65,6 @@ struct ur_kernel_handle_t_ { args_size_t ParamSizes; args_index_t Indices; args_size_t OffsetPerIndex; - size_t WorkGroupMemory = 0; - // A struct to keep track of memargs so that we can do dependency analysis // at urEnqueueKernelLaunch struct mem_obj_arg { @@ -107,28 +105,22 @@ struct ur_kernel_handle_t_ { OffsetPerIndex[Index] = LocalSize; } - // maximum required alignment is the size of the largest vector type - static constexpr size_t MaxAlignment = sizeof(double) * 16; + void addLocalArg(size_t Index, size_t Size) { + size_t LocalOffset = this->getLocalSize(); + + // maximum required alignment is the size of the largest vector type + const size_t MaxAlignment = sizeof(double) * 16; - static size_t alignMemoryAllocation(size_t Size, size_t Offset) { // for arguments smaller than the maximum alignment simply align to the // size of the argument const size_t Alignment = std::min(MaxAlignment, Size); // align the argument - size_t AlignedLocalOffset = Offset; - size_t Pad = Offset % Alignment; + size_t AlignedLocalOffset = LocalOffset; + size_t Pad = LocalOffset % Alignment; if (Pad != 0) { AlignedLocalOffset += Alignment - Pad; } - return AlignedLocalOffset; - } - - void addLocalArg(size_t Index, size_t Size) { - size_t LocalOffset = this->getLocalSize(); - - // align the argument - size_t AlignedLocalOffset = alignMemoryAllocation(Size, LocalOffset); addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), Size + (AlignedLocalOffset - LocalOffset)); @@ -148,24 +140,6 @@ struct ur_kernel_handle_t_ { MemObjArgs.push_back(arguments::mem_obj_arg{hMem, Index, Flags}); } - void setWorkGroupMemory(size_t MemSize) { - assert(WorkGroupMemory == 0 && - "Work Group Memory size can only be set once"); - // Ensure first offset is MaxAlignment aligned - WorkGroupMemory = alignMemoryAllocation(MaxAlignment, MemSize); - - // Adjust local accessor setting - // the dynamic memory will start at offset 0 (allows us to keep accessing - // local memory as a GV) and accessors will use the rest of the range - for (size_t i = 0; i < OffsetPerIndex.size(); i++) { - // if offset is 0, it is not a local accessor argument. - if (!OffsetPerIndex[i]) - continue; - assert(ParamSizes[i] == sizeof(size_t) && "Offset should be a size_t"); - *reinterpret_cast(Indices[i]) += WorkGroupMemory; - } - } - void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) { assert(Size == sizeof(std::uint32_t) * 3); std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); @@ -173,15 +147,13 @@ struct ur_kernel_handle_t_ { void clearLocalSize() { std::fill(std::begin(OffsetPerIndex), std::end(OffsetPerIndex), 0); - WorkGroupMemory = 0; } const args_index_t &getIndices() const noexcept { return Indices; } uint32_t getLocalSize() const { return std::accumulate(std::begin(OffsetPerIndex), - std::end(OffsetPerIndex), 0) + - WorkGroupMemory; + std::end(OffsetPerIndex), 0); } } Args; @@ -266,7 +238,6 @@ struct ur_kernel_handle_t_ { return Args.getIndices(); } - void setWorkGroupMemory(size_t MemSize) { Args.setWorkGroupMemory(MemSize); } uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); } void clearLocalSize() { Args.clearLocalSize(); } diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index 95c8d026a7..c4598f3472 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -902,15 +902,14 @@ ur_result_t urQueueFlush( ur_result_t urEnqueueKernelLaunchCustomExp( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, + const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numPropsInLaunchPropList, const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { std::ignore = hQueue; std::ignore = hKernel; std::ignore = workDim; - std::ignore = pGlobalWorkOffset; std::ignore = pGlobalWorkSize; std::ignore = pLocalWorkSize; std::ignore = numPropsInLaunchPropList; diff --git a/source/adapters/level_zero/ur_interface_loader.hpp b/source/adapters/level_zero/ur_interface_loader.hpp index 0832303b50..1215d6449e 100644 --- a/source/adapters/level_zero/ur_interface_loader.hpp +++ b/source/adapters/level_zero/ur_interface_loader.hpp @@ -694,8 +694,8 @@ ur_result_t urEnqueueTimestampRecordingExp( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); ur_result_t urEnqueueKernelLaunchCustomExp( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, + const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numPropsInLaunchPropList, const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); diff --git a/source/adapters/level_zero/v2/queue_api.cpp b/source/adapters/level_zero/v2/queue_api.cpp index e4659b5f2c..b7b45625a2 100644 --- a/source/adapters/level_zero/v2/queue_api.cpp +++ b/source/adapters/level_zero/v2/queue_api.cpp @@ -391,13 +391,13 @@ ur_result_t urEnqueueTimestampRecordingExp( } ur_result_t urEnqueueKernelLaunchCustomExp( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, + const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numPropsInLaunchPropList, const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) try { return hQueue->enqueueKernelLaunchCustomExp( - hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, numPropsInLaunchPropList, launchPropList, numEventsInWaitList, phEventWaitList, phEvent); } catch (...) { diff --git a/source/adapters/level_zero/v2/queue_api.hpp b/source/adapters/level_zero/v2/queue_api.hpp index c59f084fc4..7cb039ccdd 100644 --- a/source/adapters/level_zero/v2/queue_api.hpp +++ b/source/adapters/level_zero/v2/queue_api.hpp @@ -144,9 +144,9 @@ struct ur_queue_handle_t_ { const ur_event_handle_t *, ur_event_handle_t *) = 0; virtual ur_result_t enqueueKernelLaunchCustomExp( - ur_kernel_handle_t, uint32_t, const size_t *, const size_t *, - const size_t *, uint32_t, const ur_exp_launch_property_t *, uint32_t, - const ur_event_handle_t *, ur_event_handle_t *) = 0; + ur_kernel_handle_t, uint32_t, const size_t *, const size_t *, uint32_t, + const ur_exp_launch_property_t *, uint32_t, const ur_event_handle_t *, + ur_event_handle_t *) = 0; virtual ur_result_t enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *, uint32_t, const ur_event_handle_t *, diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 05e48c8740..519b0ffc1e 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -1069,15 +1069,13 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp( } ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunchCustomExp( - ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { std::ignore = hKernel; std::ignore = workDim; - std::ignore = pGlobalWorkOffset; std::ignore = pGlobalWorkSize; std::ignore = pLocalWorkSize; std::ignore = numPropsInLaunchPropList; diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index bdd3009d63..33e060ded3 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -263,8 +263,8 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { ur_event_handle_t *phEvent) override; ur_result_t enqueueKernelLaunchCustomExp( ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, + const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numPropsInLaunchPropList, const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override; diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index c8ce408756..42c342444d 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -10126,9 +10126,6 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and ///< work-group work-items - const size_t * - pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the - ///< offset used to calculate the global ID of a work-item const size_t * pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the ///< number of global work-items in workDim that will execute the kernel @@ -10156,17 +10153,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( ur_result_t result = UR_RESULT_SUCCESS; ur_enqueue_kernel_launch_custom_exp_params_t params = { - &hQueue, - &hKernel, - &workDim, - &pGlobalWorkOffset, - &pGlobalWorkSize, - &pLocalWorkSize, - &numPropsInLaunchPropList, - &launchPropList, - &numEventsInWaitList, - &phEventWaitList, - &phEvent}; + &hQueue, &hKernel, + &workDim, &pGlobalWorkSize, + &pLocalWorkSize, &numPropsInLaunchPropList, + &launchPropList, &numEventsInWaitList, + &phEventWaitList, &phEvent}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( @@ -10185,10 +10176,6 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( result = replaceCallback(¶ms); } else { - // optional output handle - if (phEvent) { - *phEvent = mock::createDummyHandle(); - } result = UR_RESULT_SUCCESS; } diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index afd1411ae8..64489c39ac 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -8698,9 +8698,6 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and ///< work-group work-items - const size_t * - pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the - ///< offset used to calculate the global ID of a work-item const size_t * pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the ///< number of global work-items in workDim that will execute the kernel @@ -8733,17 +8730,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( } ur_enqueue_kernel_launch_custom_exp_params_t params = { - &hQueue, - &hKernel, - &workDim, - &pGlobalWorkOffset, - &pGlobalWorkSize, - &pLocalWorkSize, - &numPropsInLaunchPropList, - &launchPropList, - &numEventsInWaitList, - &phEventWaitList, - &phEvent}; + &hQueue, &hKernel, + &workDim, &pGlobalWorkSize, + &pLocalWorkSize, &numPropsInLaunchPropList, + &launchPropList, &numEventsInWaitList, + &phEventWaitList, &phEvent}; uint64_t instance = getContext()->notify_begin(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP, "urEnqueueKernelLaunchCustomExp", ¶ms); @@ -8752,9 +8743,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( logger.info(" ---> urEnqueueKernelLaunchCustomExp\n"); ur_result_t result = pfnKernelLaunchCustomExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); + hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, + numPropsInLaunchPropList, launchPropList, numEventsInWaitList, + phEventWaitList, phEvent); getContext()->notify_end(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP, "urEnqueueKernelLaunchCustomExp", ¶ms, &result, diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index c2dcc7be6f..b3969de10f 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -9726,9 +9726,6 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and ///< work-group work-items - const size_t * - pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the - ///< offset used to calculate the global ID of a work-item const size_t * pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the ///< number of global work-items in workDim that will execute the kernel @@ -9769,10 +9766,6 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } - if (NULL == pGlobalWorkOffset) { - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - } - if (NULL == pGlobalWorkSize) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } @@ -9801,9 +9794,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( } ur_result_t result = pfnKernelLaunchCustomExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); + hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, + numPropsInLaunchPropList, launchPropList, numEventsInWaitList, + phEventWaitList, phEvent); return result; } diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index 602b8f1a82..86a6ad95a0 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -8866,9 +8866,6 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and ///< work-group work-items - const size_t * - pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the - ///< offset used to calculate the global ID of a work-item const size_t * pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the ///< number of global work-items in workDim that will execute the kernel @@ -8911,35 +8908,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( // convert loader handle to platform handle hKernel = reinterpret_cast(hKernel)->handle; - // convert loader handles to platform handles - auto phEventWaitListLocal = - std::vector(numEventsInWaitList); - for (size_t i = 0; i < numEventsInWaitList; ++i) { - phEventWaitListLocal[i] = - reinterpret_cast(phEventWaitList[i])->handle; - } - // forward to device-platform - result = pfnKernelLaunchCustomExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitListLocal.data(), phEvent); - - // In the event of ERROR_ADAPTER_SPECIFIC we should still attempt to wrap any output handles below. - if (UR_RESULT_SUCCESS != result && - UR_RESULT_ERROR_ADAPTER_SPECIFIC != result) { - return result; - } - try { - // convert platform handle to loader handle - if (nullptr != phEvent) { - *phEvent = reinterpret_cast( - context->factories.ur_event_factory.getInstance(*phEvent, - dditable)); - } - } catch (std::bad_alloc &) { - result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } + result = pfnKernelLaunchCustomExp(hQueue, hKernel, workDim, pGlobalWorkSize, + pLocalWorkSize, numPropsInLaunchPropList, + launchPropList, numEventsInWaitList, + phEventWaitList, phEvent); return result; } diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index b2e26a8b8b..3340363737 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -8992,7 +8992,6 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( /// + NULL == hQueue /// + NULL == hKernel /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == pGlobalWorkOffset` /// + `NULL == pGlobalWorkSize` /// + `NULL == launchPropList` /// + NULL == pGlobalWorkSize @@ -9021,9 +9020,6 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and ///< work-group work-items - const size_t * - pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the - ///< offset used to calculate the global ID of a work-item const size_t * pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the ///< number of global work-items in workDim that will execute the kernel @@ -9054,10 +9050,10 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( return UR_RESULT_ERROR_UNINITIALIZED; } - return pfnKernelLaunchCustomExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); + return pfnKernelLaunchCustomExp(hQueue, hKernel, workDim, pGlobalWorkSize, + pLocalWorkSize, numPropsInLaunchPropList, + launchPropList, numEventsInWaitList, + phEventWaitList, phEvent); } catch (...) { return exceptionToResult(std::current_exception()); } diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 0b2e6a0f74..853d61472e 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -7625,7 +7625,6 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( /// + NULL == hQueue /// + NULL == hKernel /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == pGlobalWorkOffset` /// + `NULL == pGlobalWorkSize` /// + `NULL == launchPropList` /// + NULL == pGlobalWorkSize @@ -7654,9 +7653,6 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and ///< work-group work-items - const size_t * - pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the - ///< offset used to calculate the global ID of a work-item const size_t * pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the ///< number of global work-items in workDim that will execute the kernel diff --git a/test/conformance/exp_launch_properties/launch_properties.cpp b/test/conformance/exp_launch_properties/launch_properties.cpp index 23ba56ff4b..a54a44ecaf 100644 --- a/test/conformance/exp_launch_properties/launch_properties.cpp +++ b/test/conformance/exp_launch_properties/launch_properties.cpp @@ -95,8 +95,8 @@ TEST_P(urEnqueueKernelLaunchCustomTest, Success) { AddPodArg(val); ASSERT_SUCCESS(urEnqueueKernelLaunchCustomExp( - queue, kernel, n_dimensions, &global_offset, &global_size, nullptr, 1, - &props[0], 0, nullptr, nullptr)); + queue, kernel, n_dimensions, &global_size, nullptr, 1, &props[0], 0, + nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); ValidateBuffer(buffer, sizeof(val) * global_size, val); }