From 4452bc18304439fb188f474d04175c35608cf25d Mon Sep 17 00:00:00 2001 From: Georgi Mirazchiyski Date: Tue, 5 Nov 2024 11:58:29 +0000 Subject: [PATCH 01/18] [SYCL] Add missing supported AMDGPU architectures to SYCL (#15723) The three added `gfx7` architectures were previously omitted from the list of supported ones for SYCL targeting AMDGPU. There is no rational for them to be excluded rather than a potential mistake when all the rest were added, so this PR adds them in. All of these exactly match the LLVM AMDGPU support docs here: https://llvm.org/docs/AMDGPUUsage.html#processors. This also makes reusing the current `OffloadArch` enum in clang Driver easier with no concern of whether we have to filter these out for SYCL. --- clang/lib/Driver/ToolChains/SYCL.cpp | 6 ++++++ .../sycl-device-traits-macros-amdgcn.cpp | 18 ++++++++++++++++ clang/test/Driver/sycl-oneapi-gpu-amdgpu.cpp | 9 ++++++++ .../llvm/SYCLLowerIR/DeviceConfigFile.td | 3 +++ sycl/doc/UsersManual.md | 13 +++++++----- sycl/doc/design/DeviceIf.md | 6 ++++++ ...cl_ext_oneapi_device_architecture.asciidoc | 3 +++ .../ext/oneapi/experimental/architectures.def | 3 +++ .../experimental/device_architecture.hpp | 21 +++++++++++++++++++ sycl/source/detail/device_info.hpp | 3 +++ 10 files changed, 80 insertions(+), 5 deletions(-) diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp index b333bb7b476d6..6435618ae7f6a 100644 --- a/clang/lib/Driver/ToolChains/SYCL.cpp +++ b/clang/lib/Driver/ToolChains/SYCL.cpp @@ -1328,6 +1328,9 @@ StringRef SYCL::gen::resolveGenDevice(StringRef DeviceName) { .Case("amd_gpu_gfx700", "gfx700") .Case("amd_gpu_gfx701", "gfx701") .Case("amd_gpu_gfx702", "gfx702") + .Case("amd_gpu_gfx703", "gfx703") + .Case("amd_gpu_gfx704", "gfx704") + .Case("amd_gpu_gfx705", "gfx705") .Case("amd_gpu_gfx801", "gfx801") .Case("amd_gpu_gfx802", "gfx802") .Case("amd_gpu_gfx803", "gfx803") @@ -1415,6 +1418,9 @@ SmallString<64> SYCL::gen::getGenDeviceMacro(StringRef DeviceName) { .Case("gfx700", "AMD_GPU_GFX700") .Case("gfx701", "AMD_GPU_GFX701") .Case("gfx702", "AMD_GPU_GFX702") + .Case("gfx703", "AMD_GPU_GFX703") + .Case("gfx704", "AMD_GPU_GFX704") + .Case("gfx705", "AMD_GPU_GFX705") .Case("gfx801", "AMD_GPU_GFX801") .Case("gfx802", "AMD_GPU_GFX802") .Case("gfx803", "AMD_GPU_GFX803") diff --git a/clang/test/Driver/sycl-device-traits-macros-amdgcn.cpp b/clang/test/Driver/sycl-device-traits-macros-amdgcn.cpp index 8dacdd21f9b20..f7b8ae550f6e9 100644 --- a/clang/test/Driver/sycl-device-traits-macros-amdgcn.cpp +++ b/clang/test/Driver/sycl-device-traits-macros-amdgcn.cpp @@ -9,6 +9,15 @@ // RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx702 \ // RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-DEVICE-TRIPLE +// RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx703 \ +// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-DEVICE-TRIPLE +// RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx704 \ +// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-DEVICE-TRIPLE +// RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx705 \ +// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-DEVICE-TRIPLE // RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx801 \ // RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-DEVICE-TRIPLE @@ -120,6 +129,15 @@ // RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx702 \ // RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-OFFLOAD-ARCH +// RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx703 \ +// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-OFFLOAD-ARCH +// RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx704 \ +// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-OFFLOAD-ARCH +// RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx705 \ +// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-OFFLOAD-ARCH // RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx801 \ // RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-OFFLOAD-ARCH diff --git a/clang/test/Driver/sycl-oneapi-gpu-amdgpu.cpp b/clang/test/Driver/sycl-oneapi-gpu-amdgpu.cpp index bdc108c6a0f55..e6c8530c53130 100644 --- a/clang/test/Driver/sycl-oneapi-gpu-amdgpu.cpp +++ b/clang/test/Driver/sycl-oneapi-gpu-amdgpu.cpp @@ -9,6 +9,15 @@ // RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx702 \ // RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefixes=DEVICE_AMD,MACRO_AMD -DDEV_STR=gfx702 -DMAC_STR=GFX702 +// RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx703 \ +// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefixes=DEVICE_AMD,MACRO_AMD -DDEV_STR=gfx703 -DMAC_STR=GFX703 +// RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx704 \ +// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefixes=DEVICE_AMD,MACRO_AMD -DDEV_STR=gfx704 -DMAC_STR=GFX704 +// RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx705 \ +// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefixes=DEVICE_AMD,MACRO_AMD -DDEV_STR=gfx705 -DMAC_STR=GFX705 // RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx801 \ // RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefixes=DEVICE_AMD,MACRO_AMD -DDEV_STR=gfx801 -DMAC_STR=GFX801 diff --git a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td index 686c1de58a9fa..5fc1cf79a1caa 100644 --- a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td +++ b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td @@ -320,6 +320,9 @@ def : HipTargetInfo<"amd_gpu_gfx906", !listconcat(HipMinAspects, AllUSMAspects), def : HipTargetInfo<"amd_gpu_gfx700", HipMinAspects, HipSubgroupSizesGCN2>; def : HipTargetInfo<"amd_gpu_gfx701", HipMinAspects, HipSubgroupSizesGCN2>; def : HipTargetInfo<"amd_gpu_gfx702", HipMinAspects, HipSubgroupSizesGCN2>; +def : HipTargetInfo<"amd_gpu_gfx703", HipMinAspects, HipSubgroupSizesGCN2>; +def : HipTargetInfo<"amd_gpu_gfx704", HipMinAspects, HipSubgroupSizesGCN2>; +def : HipTargetInfo<"amd_gpu_gfx705", HipMinAspects, HipSubgroupSizesGCN2>; def : HipTargetInfo<"amd_gpu_gfx801", HipMinAspects, HipSubgroupSizesGCN3>; def : HipTargetInfo<"amd_gpu_gfx802", HipMinAspects, HipSubgroupSizesGCN3>; def : HipTargetInfo<"amd_gpu_gfx803", HipMinAspects, HipSubgroupSizesGCN3>; diff --git a/sycl/doc/UsersManual.md b/sycl/doc/UsersManual.md index 3f184edc12def..efdf03249616c 100644 --- a/sycl/doc/UsersManual.md +++ b/sycl/doc/UsersManual.md @@ -95,6 +95,9 @@ and not recommended to use in production environment. * amd_gpu_gfx700 - AMD GCN GFX7 (Sea Islands (CI)) architecture * amd_gpu_gfx701 - AMD GCN GFX7 (Sea Islands (CI)) architecture * amd_gpu_gfx702 - AMD GCN GFX7 (Sea Islands (CI)) architecture + * amd_gpu_gfx703 - AMD GCN GFX7 (Sea Islands (CI)) architecture + * amd_gpu_gfx704 - AMD GCN GFX7 (Sea Islands (CI)) architecture + * amd_gpu_gfx705 - AMD GCN GFX7 (Sea Islands (CI)) architecture * amd_gpu_gfx801 - AMD GCN GFX8 (Volcanic Islands (VI)) architecture * amd_gpu_gfx802 - AMD GCN GFX8 (Volcanic Islands (VI)) architecture * amd_gpu_gfx803 - AMD GCN GFX8 (Volcanic Islands (VI)) architecture @@ -104,13 +107,13 @@ and not recommended to use in production environment. * amd_gpu_gfx902 - AMD GCN GFX9 (Vega) architecture * amd_gpu_gfx904 - AMD GCN GFX9 (Vega) architecture * amd_gpu_gfx906 - AMD GCN GFX9 (Vega) architecture - * amd_gpu_gfx908 - AMD GCN GFX9 (Vega) architecture + * amd_gpu_gfx908 - AMD GCN GFX9 (CDNA1) architecture * amd_gpu_gfx909 - AMD GCN GFX9 (Vega) architecture - * amd_gpu_gfx90a - AMD GCN GFX9 (Vega) architecture + * amd_gpu_gfx90a - AMD GCN GFX9 (CDNA2) architecture * amd_gpu_gfx90c - AMD GCN GFX9 (Vega) architecture - * amd_gpu_gfx940 - AMD GCN GFX9 (Vega) architecture - * amd_gpu_gfx941 - AMD GCN GFX9 (Vega) architecture - * amd_gpu_gfx942 - AMD GCN GFX9 (Vega) architecture + * amd_gpu_gfx940 - AMD GCN GFX9 (CDNA3) architecture + * amd_gpu_gfx941 - AMD GCN GFX9 (CDNA3) architecture + * amd_gpu_gfx942 - AMD GCN GFX9 (CDNA3) architecture * amd_gpu_gfx1010 - AMD GCN GFX10.1 (RDNA 1) architecture * amd_gpu_gfx1011 - AMD GCN GFX10.1 (RDNA 1) architecture * amd_gpu_gfx1012 - AMD GCN GFX10.1 (RDNA 1) architecture diff --git a/sycl/doc/design/DeviceIf.md b/sycl/doc/design/DeviceIf.md index 5d26bdab639db..606ad0d3be728 100644 --- a/sycl/doc/design/DeviceIf.md +++ b/sycl/doc/design/DeviceIf.md @@ -88,6 +88,9 @@ recognizes: * `amd_gpu_gfx700` * `amd_gpu_gfx701` * `amd_gpu_gfx702` +* `amd_gpu_gfx703` +* `amd_gpu_gfx704` +* `amd_gpu_gfx705` * `amd_gpu_gfx801` * `amd_gpu_gfx802` * `amd_gpu_gfx803` @@ -187,6 +190,9 @@ one of the following corresponding C++ macro names: * `__SYCL_TARGET_AMD_GPU_GFX700__` * `__SYCL_TARGET_AMD_GPU_GFX701__` * `__SYCL_TARGET_AMD_GPU_GFX702__` +* `__SYCL_TARGET_AMD_GPU_GFX703__` +* `__SYCL_TARGET_AMD_GPU_GFX704__` +* `__SYCL_TARGET_AMD_GPU_GFX705__` * `__SYCL_TARGET_AMD_GPU_GFX801__` * `__SYCL_TARGET_AMD_GPU_GFX802__` * `__SYCL_TARGET_AMD_GPU_GFX803__` diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc index f2ebcc5944462..e866613120366 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc @@ -551,6 +551,9 @@ a| amd_gpu_gfx700 amd_gpu_gfx701 amd_gpu_gfx702 +amd_gpu_gfx703 +amd_gpu_gfx704 +amd_gpu_gfx705 ---- |- |AMD GCN 2.0 architecture. diff --git a/sycl/include/sycl/ext/oneapi/experimental/architectures.def b/sycl/include/sycl/ext/oneapi/experimental/architectures.def index b8148f673814a..08ce75a370119 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/architectures.def +++ b/sycl/include/sycl/ext/oneapi/experimental/architectures.def @@ -114,6 +114,9 @@ __SYCL_ARCHITECTURE(nvidia_gpu_sm_90a, 0x01000000000090a0) __SYCL_ARCHITECTURE(amd_gpu_gfx700, 0x0200000000070000) __SYCL_ARCHITECTURE(amd_gpu_gfx701, 0x0200000000070100) __SYCL_ARCHITECTURE(amd_gpu_gfx702, 0x0200000000070200) +__SYCL_ARCHITECTURE(amd_gpu_gfx703, 0x0200000000070300) +__SYCL_ARCHITECTURE(amd_gpu_gfx704, 0x0200000000070400) +__SYCL_ARCHITECTURE(amd_gpu_gfx705, 0x0200000000070500) __SYCL_ARCHITECTURE(amd_gpu_gfx801, 0x0200000000080100) __SYCL_ARCHITECTURE(amd_gpu_gfx802, 0x0200000000080200) __SYCL_ARCHITECTURE(amd_gpu_gfx803, 0x0200000000080300) diff --git a/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp b/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp index 9638ac91ac793..ab6c011413f11 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp @@ -203,6 +203,15 @@ static constexpr ext::oneapi::experimental::architecture #ifndef __SYCL_TARGET_AMD_GPU_GFX702__ #define __SYCL_TARGET_AMD_GPU_GFX702__ 0 #endif +#ifndef __SYCL_TARGET_AMD_GPU_GFX703__ +#define __SYCL_TARGET_AMD_GPU_GFX703__ 0 +#endif +#ifndef __SYCL_TARGET_AMD_GPU_GFX704__ +#define __SYCL_TARGET_AMD_GPU_GFX704__ 0 +#endif +#ifndef __SYCL_TARGET_AMD_GPU_GFX705__ +#define __SYCL_TARGET_AMD_GPU_GFX705__ 0 +#endif #ifndef __SYCL_TARGET_AMD_GPU_GFX801__ #define __SYCL_TARGET_AMD_GPU_GFX801__ 0 #endif @@ -360,6 +369,9 @@ static constexpr bool is_allowable_aot_mode = (__SYCL_TARGET_AMD_GPU_GFX700__ == 1) || (__SYCL_TARGET_AMD_GPU_GFX701__ == 1) || (__SYCL_TARGET_AMD_GPU_GFX702__ == 1) || + (__SYCL_TARGET_AMD_GPU_GFX703__ == 1) || + (__SYCL_TARGET_AMD_GPU_GFX704__ == 1) || + (__SYCL_TARGET_AMD_GPU_GFX705__ == 1) || (__SYCL_TARGET_AMD_GPU_GFX801__ == 1) || (__SYCL_TARGET_AMD_GPU_GFX802__ == 1) || (__SYCL_TARGET_AMD_GPU_GFX803__ == 1) || @@ -538,6 +550,15 @@ get_current_architecture_aot() { #if __SYCL_TARGET_AMD_GPU_GFX702__ return ext::oneapi::experimental::architecture::amd_gpu_gfx702; #endif +#if __SYCL_TARGET_AMD_GPU_GFX703__ + return ext::oneapi::experimental::architecture::amd_gpu_gfx703; +#endif +#if __SYCL_TARGET_AMD_GPU_GFX704__ + return ext::oneapi::experimental::architecture::amd_gpu_gfx704; +#endif +#if __SYCL_TARGET_AMD_GPU_GFX705__ + return ext::oneapi::experimental::architecture::amd_gpu_gfx705; +#endif #if __SYCL_TARGET_AMD_GPU_GFX801__ return ext::oneapi::experimental::architecture::amd_gpu_gfx801; #endif diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index 6eedec5b4f404..2551e6c03b635 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -626,6 +626,9 @@ constexpr std::pair NvidiaAmdGPUArchitectures[] = {"9.0", oneapi_exp_arch::nvidia_gpu_sm_90}, {"gfx701", oneapi_exp_arch::amd_gpu_gfx701}, {"gfx702", oneapi_exp_arch::amd_gpu_gfx702}, + {"gfx703", oneapi_exp_arch::amd_gpu_gfx703}, + {"gfx704", oneapi_exp_arch::amd_gpu_gfx704}, + {"gfx705", oneapi_exp_arch::amd_gpu_gfx705}, {"gfx801", oneapi_exp_arch::amd_gpu_gfx801}, {"gfx802", oneapi_exp_arch::amd_gpu_gfx802}, {"gfx803", oneapi_exp_arch::amd_gpu_gfx803}, From f6b8fba4ece64917f8b4cce3944a90c44b8aa8a9 Mon Sep 17 00:00:00 2001 From: Buildbot for SYCL Date: Tue, 5 Nov 2024 23:02:59 +0800 Subject: [PATCH 02/18] [GHA] Uplift Linux GPU RT version to 24.39.31294.12 (#15982) Scheduled drivers uplift Co-authored-by: GitHub Actions --- devops/dependencies.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/devops/dependencies.json b/devops/dependencies.json index 25283242aeeda..944a3ffb7e8d2 100644 --- a/devops/dependencies.json +++ b/devops/dependencies.json @@ -19,9 +19,9 @@ "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" }, "level_zero": { - "github_tag": "v1.18.3", - "version": "v1.18.3", - "url": "https://github.com/oneapi-src/level-zero/releases/tag/v1.18.3", + "github_tag": "v1.18.5", + "version": "v1.18.5", + "url": "https://github.com/oneapi-src/level-zero/releases/tag/v1.18.5", "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" }, "tbb": { From 730cd3a5275fff509c28e04397b98d7a98b91062 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Tue, 5 Nov 2024 15:04:05 +0000 Subject: [PATCH 03/18] [libclc] Move min/max/clamp into the CLC builtins library (#114386) (#15948) [libclc] Move min/max/clamp into the CLC builtins library (#114386) These functions are "shared" between integer and floating-point types, hence the directory name. They are used in several CLC internal functions such as __clc_ldexp. Note that clspv and spirv targets don't want to define these functions, so pre-processor macros replace calls to __clc_min with regular min, for example. This means they can use as much of the generic CLC source files as possible, but where CLC functions would usually call out to an external __clc_min symbol, they call out to an external min symbol. Then they opt out of defining __clc_min itself in their CLC builtins library. Preprocessor definitions for these targets have also been changed somewhat: what used to be CLC_SPIRV (the 32-bit target) is now CLC_SPIRV32, and CLC_SPIRV now represents either CLC_SPIRV32 or CLC_SPIRV64. Same goes for CLC_CLSPV. There are no differences (measured with llvm-diff) in any of the final builtins libraries for nvptx, amdgpu, or clspv. Neither are there differences in the SPIR-V targets' LLVM IR before it's actually lowered to SPIR-V. ---- This is a cherry-pick of 3 upstream commits: fba9f05ff7b36f9cbb5835d79f659290dadecaad, 86974e15f517e8a4ef3bb91125e75cf43d69da6d, and d12a8da1de1ce2c7d8fbf84306a2b6de5c85d707. It starts to move our downstream `libspirv` implementations towards reusing upstream's shared CLC implementations. See the removal of libspirv's `__clc_(clamp|min|max)`. --- libclc/CMakeLists.txt | 13 +- libclc/clc/include/clc/clcfunc.h | 4 +- .../include/clc/integer/gentype.inc | 4 +- .../include/clc/math/gentype.inc | 0 libclc/clc/include/clc/shared/clc_clamp.h | 15 ++ libclc/clc/include/clc/shared/clc_clamp.inc | 9 + libclc/clc/include/clc/shared/clc_max.h | 12 ++ libclc/clc/include/clc/shared/clc_max.inc | 7 + libclc/clc/include/clc/shared/clc_min.h | 12 ++ libclc/clc/include/clc/shared/clc_min.inc | 7 + libclc/clc/lib/generic/SOURCES | 3 + libclc/clc/lib/generic/shared/clc_clamp.cl | 7 + .../lib/generic}/shared/clc_clamp.inc | 8 - libclc/clc/lib/generic/shared/clc_max.cl | 7 + .../lib/generic}/shared/clc_max.inc | 8 - libclc/clc/lib/generic/shared/clc_min.cl | 7 + .../lib/generic}/shared/clc_min.inc | 8 - libclc/generic/lib/common/smoothstep.cl | 2 +- libclc/generic/lib/common/step.cl | 2 +- libclc/generic/lib/math/clc_ldexp.cl | 170 +++++++++--------- libclc/generic/lib/math/math.h | 2 +- libclc/generic/lib/shared/clamp.cl | 3 +- libclc/generic/lib/shared/max.cl | 3 +- libclc/generic/lib/shared/max.inc | 3 +- libclc/generic/lib/shared/min.cl | 3 +- libclc/generic/lib/shared/min.inc | 3 +- libclc/generic/libspirv/SOURCES | 3 - .../generic/libspirv/core/shared/clc_clamp.cl | 15 -- .../generic/libspirv/core/shared/clc_max.cl | 15 -- .../generic/libspirv/core/shared/clc_min.cl | 15 -- libclc/generic/libspirv/math/clc_hypot.cl | 114 ++++++------ 31 files changed, 256 insertions(+), 228 deletions(-) rename libclc/{generic => clc}/include/clc/integer/gentype.inc (99%) rename libclc/{generic => clc}/include/clc/math/gentype.inc (100%) create mode 100644 libclc/clc/include/clc/shared/clc_clamp.h create mode 100644 libclc/clc/include/clc/shared/clc_clamp.inc create mode 100644 libclc/clc/include/clc/shared/clc_max.h create mode 100644 libclc/clc/include/clc/shared/clc_max.inc create mode 100644 libclc/clc/include/clc/shared/clc_min.h create mode 100644 libclc/clc/include/clc/shared/clc_min.inc create mode 100644 libclc/clc/lib/generic/shared/clc_clamp.cl rename libclc/{generic/libspirv/core => clc/lib/generic}/shared/clc_clamp.inc (62%) create mode 100644 libclc/clc/lib/generic/shared/clc_max.cl rename libclc/{generic/libspirv/core => clc/lib/generic}/shared/clc_max.inc (50%) create mode 100644 libclc/clc/lib/generic/shared/clc_min.cl rename libclc/{generic/libspirv/core => clc/lib/generic}/shared/clc_min.inc (50%) delete mode 100644 libclc/generic/libspirv/core/shared/clc_clamp.cl delete mode 100644 libclc/generic/libspirv/core/shared/clc_max.cl delete mode 100644 libclc/generic/libspirv/core/shared/clc_min.cl diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index ab1a18f49557e..72687d6f18636 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -424,9 +424,19 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) set ( has_distinct_generic_addrspace TRUE ) if ( ARCH STREQUAL spirv OR ARCH STREQUAL spirv64 ) set( opt_flags -O3 ) + list( APPEND build_flags -DCLC_SPIRV ) set( spvflags --spirv-max-version=1.1 ) + set( MACRO_ARCH SPIRV32 ) + if( ARCH STREQUAL spirv64 ) + set( MACRO_ARCH SPIRV64 ) + endif() elseif( ARCH STREQUAL clspv OR ARCH STREQUAL clspv64 ) set( opt_flags -O3 ) + list( APPEND build_flags -DCLC_CLSPV ) + set( MACRO_ARCH CLSPV32 ) + if( ARCH STREQUAL clspv64 ) + set( MACRO_ARCH CLSPV64 ) + endif() elseif( ARCH STREQUAL nvptx OR ARCH STREQUAL nvptx64 ) set( opt_flags -O3 "--nvvm-reflect-enable=false" ) set( has_distinct_generic_addrspace FALSE ) @@ -437,6 +447,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) set( has_distinct_generic_addrspace FALSE ) else() set( opt_flags -O3 ) + set( MACRO_ARCH ${ARCH} ) endif() # Enable SPIR-V builtin function declarations, so they don't @@ -483,7 +494,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) list(APPEND build_flags -D__unix__) endif() - string( TOUPPER "CLC_${ARCH}" CLC_TARGET_DEFINE ) + string( TOUPPER "CLC_${MACRO_ARCH}" CLC_TARGET_DEFINE ) list( APPEND build_flags -D__CLC_INTERNAL diff --git a/libclc/clc/include/clc/clcfunc.h b/libclc/clc/include/clc/clcfunc.h index e04a405a94be7..6c95a07b1c184 100644 --- a/libclc/clc/include/clc/clcfunc.h +++ b/libclc/clc/include/clc/clcfunc.h @@ -10,9 +10,9 @@ // avoid inlines for SPIR-V related targets since we'll optimise later in the // chain -#if defined(CLC_SPIRV) || defined(CLC_SPIRV64) +#if defined(CLC_SPIRV) #define _CLC_DEF -#elif defined(CLC_CLSPV) || defined(CLC_CLSPV64) +#elif defined(CLC_CLSPV) #define _CLC_DEF __attribute__((noinline)) __attribute__((clspv_libclc_builtin)) #else #define _CLC_DEF __attribute__((always_inline)) diff --git a/libclc/generic/include/clc/integer/gentype.inc b/libclc/clc/include/clc/integer/gentype.inc similarity index 99% rename from libclc/generic/include/clc/integer/gentype.inc rename to libclc/clc/include/clc/integer/gentype.inc index 032bdc0cadbaf..e8ca005d4ccb9 100644 --- a/libclc/generic/include/clc/integer/gentype.inc +++ b/libclc/clc/include/clc/integer/gentype.inc @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -//These 2 defines only change when switching between data sizes or base types to -//keep this file manageable. +// These 2 defines only change when switching between data sizes or base types +// to keep this file manageable. #define __CLC_GENSIZE 8 #define __CLC_SCALAR_GENTYPE char diff --git a/libclc/generic/include/clc/math/gentype.inc b/libclc/clc/include/clc/math/gentype.inc similarity index 100% rename from libclc/generic/include/clc/math/gentype.inc rename to libclc/clc/include/clc/math/gentype.inc diff --git a/libclc/clc/include/clc/shared/clc_clamp.h b/libclc/clc/include/clc/shared/clc_clamp.h new file mode 100644 index 0000000000000..5c044c9a1a510 --- /dev/null +++ b/libclc/clc/include/clc/shared/clc_clamp.h @@ -0,0 +1,15 @@ +#if defined(CLC_CLSPV) || defined(CLC_SPIRV) +// clspv and spir-v targets provide their own OpenCL-compatible clamp +#define __clc_clamp clamp +#else + +#include +#include + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif diff --git a/libclc/clc/include/clc/shared/clc_clamp.inc b/libclc/clc/include/clc/shared/clc_clamp.inc new file mode 100644 index 0000000000000..cf6b0b2789bc5 --- /dev/null +++ b/libclc/clc/include/clc/shared/clc_clamp.inc @@ -0,0 +1,9 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_clamp(__CLC_GENTYPE x, + __CLC_GENTYPE y, + __CLC_GENTYPE z); + +#ifndef __CLC_SCALAR +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_clamp(__CLC_GENTYPE x, + __CLC_SCALAR_GENTYPE y, + __CLC_SCALAR_GENTYPE z); +#endif diff --git a/libclc/clc/include/clc/shared/clc_max.h b/libclc/clc/include/clc/shared/clc_max.h new file mode 100644 index 0000000000000..2825640f6c291 --- /dev/null +++ b/libclc/clc/include/clc/shared/clc_max.h @@ -0,0 +1,12 @@ +#if defined(CLC_CLSPV) || defined(CLC_SPIRV) +// clspv and spir-v targets provide their own OpenCL-compatible max +#define __clc_max max +#else + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif diff --git a/libclc/clc/include/clc/shared/clc_max.inc b/libclc/clc/include/clc/shared/clc_max.inc new file mode 100644 index 0000000000000..bddb3fa3d920c --- /dev/null +++ b/libclc/clc/include/clc/shared/clc_max.inc @@ -0,0 +1,7 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_max(__CLC_GENTYPE a, + __CLC_GENTYPE b); + +#ifndef __CLC_SCALAR +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_max(__CLC_GENTYPE a, + __CLC_SCALAR_GENTYPE b); +#endif diff --git a/libclc/clc/include/clc/shared/clc_min.h b/libclc/clc/include/clc/shared/clc_min.h new file mode 100644 index 0000000000000..0b7ee140b8f45 --- /dev/null +++ b/libclc/clc/include/clc/shared/clc_min.h @@ -0,0 +1,12 @@ +#if defined(CLC_CLSPV) || defined(CLC_SPIRV) +// clspv and spir-v targets provide their own OpenCL-compatible min +#define __clc_min min +#else + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif diff --git a/libclc/clc/include/clc/shared/clc_min.inc b/libclc/clc/include/clc/shared/clc_min.inc new file mode 100644 index 0000000000000..3e1da96df43dd --- /dev/null +++ b/libclc/clc/include/clc/shared/clc_min.inc @@ -0,0 +1,7 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_min(__CLC_GENTYPE a, + __CLC_GENTYPE b); + +#ifndef __CLC_SCALAR +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_min(__CLC_GENTYPE a, + __CLC_SCALAR_GENTYPE b); +#endif diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index fa2e4f50b99cd..db523adb63836 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -1 +1,4 @@ geometric/clc_dot.cl +shared/clc_clamp.cl +shared/clc_max.cl +shared/clc_min.cl diff --git a/libclc/clc/lib/generic/shared/clc_clamp.cl b/libclc/clc/lib/generic/shared/clc_clamp.cl new file mode 100644 index 0000000000000..1d40da3cf2296 --- /dev/null +++ b/libclc/clc/lib/generic/shared/clc_clamp.cl @@ -0,0 +1,7 @@ +#include + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libclc/generic/libspirv/core/shared/clc_clamp.inc b/libclc/clc/lib/generic/shared/clc_clamp.inc similarity index 62% rename from libclc/generic/libspirv/core/shared/clc_clamp.inc rename to libclc/clc/lib/generic/shared/clc_clamp.inc index 571f6470e5703..da67cd2ad69db 100644 --- a/libclc/generic/libspirv/core/shared/clc_clamp.inc +++ b/libclc/clc/lib/generic/shared/clc_clamp.inc @@ -1,11 +1,3 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_clamp(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z) { diff --git a/libclc/clc/lib/generic/shared/clc_max.cl b/libclc/clc/lib/generic/shared/clc_max.cl new file mode 100644 index 0000000000000..e1050ed0007ee --- /dev/null +++ b/libclc/clc/lib/generic/shared/clc_max.cl @@ -0,0 +1,7 @@ +#include + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libclc/generic/libspirv/core/shared/clc_max.inc b/libclc/clc/lib/generic/shared/clc_max.inc similarity index 50% rename from libclc/generic/libspirv/core/shared/clc_max.inc rename to libclc/clc/lib/generic/shared/clc_max.inc index 882f29ce30d94..f4234cb359d86 100644 --- a/libclc/generic/libspirv/core/shared/clc_max.inc +++ b/libclc/clc/lib/generic/shared/clc_max.inc @@ -1,11 +1,3 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_max(__CLC_GENTYPE a, __CLC_GENTYPE b) { return (a > b ? a : b); diff --git a/libclc/clc/lib/generic/shared/clc_min.cl b/libclc/clc/lib/generic/shared/clc_min.cl new file mode 100644 index 0000000000000..12a26f5352407 --- /dev/null +++ b/libclc/clc/lib/generic/shared/clc_min.cl @@ -0,0 +1,7 @@ +#include + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libclc/generic/libspirv/core/shared/clc_min.inc b/libclc/clc/lib/generic/shared/clc_min.inc similarity index 50% rename from libclc/generic/libspirv/core/shared/clc_min.inc rename to libclc/clc/lib/generic/shared/clc_min.inc index d8a51291dbc27..e9c85ddd3affa 100644 --- a/libclc/generic/libspirv/core/shared/clc_min.inc +++ b/libclc/clc/lib/generic/shared/clc_min.inc @@ -1,11 +1,3 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_min(__CLC_GENTYPE a, __CLC_GENTYPE b) { return (b < a ? b : a); diff --git a/libclc/generic/lib/common/smoothstep.cl b/libclc/generic/lib/common/smoothstep.cl index 8d18024c446d6..99553cac901d8 100644 --- a/libclc/generic/lib/common/smoothstep.cl +++ b/libclc/generic/lib/common/smoothstep.cl @@ -49,7 +49,7 @@ _CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, smoothstep, double, _CLC_V_S_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, smoothstep, double, double, double); -#if !defined(CLC_SPIRV) && !defined(CLC_SPIRV64) +#if !defined(CLC_SPIRV) SMOOTH_STEP_DEF(float, double, SMOOTH_STEP_IMPL_D); SMOOTH_STEP_DEF(double, float, SMOOTH_STEP_IMPL_D); diff --git a/libclc/generic/lib/common/step.cl b/libclc/generic/lib/common/step.cl index f8cbd125c0638..46e86249b57e4 100644 --- a/libclc/generic/lib/common/step.cl +++ b/libclc/generic/lib/common/step.cl @@ -44,7 +44,7 @@ STEP_DEF(double, double); _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, step, double, double); _CLC_V_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, step, double, double); -#if !defined(CLC_SPIRV) && !defined(CLC_SPIRV64) +#if !defined(CLC_SPIRV) STEP_DEF(float, double); STEP_DEF(double, float); diff --git a/libclc/generic/lib/math/clc_ldexp.cl b/libclc/generic/lib/math/clc_ldexp.cl index ae6117b7b2922..438c31835a364 100644 --- a/libclc/generic/lib/math/clc_ldexp.cl +++ b/libclc/generic/lib/math/clc_ldexp.cl @@ -20,76 +20,78 @@ * THE SOFTWARE. */ -#include -#include "config.h" #include "../clcmacro.h" +#include "config.h" #include "math.h" +#include +#include _CLC_DEF _CLC_OVERLOAD float __clc_ldexp(float x, int n) { - if (!__clc_fp32_subnormals_supported()) { - - // This treats subnormals as zeros - int i = as_int(x); - int e = (i >> 23) & 0xff; - int m = i & 0x007fffff; - int s = i & 0x80000000; - int v = add_sat(e, n); - v = clamp(v, 0, 0xff); - int mr = e == 0 | v == 0 | v == 0xff ? 0 : m; - int c = e == 0xff; - mr = c ? m : mr; - int er = c ? e : v; - er = e ? er : e; - return as_float( s | (er << 23) | mr ); - } - - /* supports denormal values */ - const int multiplier = 24; - float val_f; - uint val_ui; - uint sign; - int exponent; - val_ui = as_uint(x); - sign = val_ui & 0x80000000; - val_ui = val_ui & 0x7fffffff;/* remove the sign bit */ - int val_x = val_ui; - - exponent = val_ui >> 23; /* get the exponent */ - int dexp = exponent; - - /* denormal support */ - int fbh = 127 - (as_uint((float)(as_float(val_ui | 0x3f800000) - 1.0f)) >> 23); - int dexponent = 25 - fbh; - uint dval_ui = (( (val_ui << fbh) & 0x007fffff) | (dexponent << 23)); - int ex = dexponent + n - multiplier; - dexponent = ex; - uint val = sign | (ex << 23) | (dval_ui & 0x007fffff); - int ex1 = dexponent + multiplier; - ex1 = -ex1 +25; - dval_ui = (((dval_ui & 0x007fffff )| 0x800000) >> ex1); - dval_ui = dexponent > 0 ? val :dval_ui; - dval_ui = dexponent > 254 ? 0x7f800000 :dval_ui; /*overflow*/ - dval_ui = dexponent < -multiplier ? 0 : dval_ui; /*underflow*/ - dval_ui = dval_ui | sign; - val_f = as_float(dval_ui); - - exponent += n; - - val = sign | (exponent << 23) | (val_ui & 0x007fffff); - ex1 = exponent + multiplier; - ex1 = -ex1 +25; - val_ui = (((val_ui & 0x007fffff )| 0x800000) >> ex1); - val_ui = exponent > 0 ? val :val_ui; - val_ui = exponent > 254 ? 0x7f800000 :val_ui; /*overflow*/ - val_ui = exponent < -multiplier ? 0 : val_ui; /*underflow*/ - val_ui = val_ui | sign; - - val_ui = dexp == 0? dval_ui : val_ui; - val_f = as_float(val_ui); - - val_f = isnan(x) | isinf(x) | val_x == 0 ? x : val_f; - return val_f; + if (!__clc_fp32_subnormals_supported()) { + + // This treats subnormals as zeros + int i = as_int(x); + int e = (i >> 23) & 0xff; + int m = i & 0x007fffff; + int s = i & 0x80000000; + int v = add_sat(e, n); + v = __clc_clamp(v, 0, 0xff); + int mr = e == 0 | v == 0 | v == 0xff ? 0 : m; + int c = e == 0xff; + mr = c ? m : mr; + int er = c ? e : v; + er = e ? er : e; + return as_float(s | (er << 23) | mr); + } + + /* supports denormal values */ + const int multiplier = 24; + float val_f; + uint val_ui; + uint sign; + int exponent; + val_ui = as_uint(x); + sign = val_ui & 0x80000000; + val_ui = val_ui & 0x7fffffff; /* remove the sign bit */ + int val_x = val_ui; + + exponent = val_ui >> 23; /* get the exponent */ + int dexp = exponent; + + /* denormal support */ + int fbh = + 127 - (as_uint((float)(as_float(val_ui | 0x3f800000) - 1.0f)) >> 23); + int dexponent = 25 - fbh; + uint dval_ui = (((val_ui << fbh) & 0x007fffff) | (dexponent << 23)); + int ex = dexponent + n - multiplier; + dexponent = ex; + uint val = sign | (ex << 23) | (dval_ui & 0x007fffff); + int ex1 = dexponent + multiplier; + ex1 = -ex1 + 25; + dval_ui = (((dval_ui & 0x007fffff) | 0x800000) >> ex1); + dval_ui = dexponent > 0 ? val : dval_ui; + dval_ui = dexponent > 254 ? 0x7f800000 : dval_ui; /*overflow*/ + dval_ui = dexponent < -multiplier ? 0 : dval_ui; /*underflow*/ + dval_ui = dval_ui | sign; + val_f = as_float(dval_ui); + + exponent += n; + + val = sign | (exponent << 23) | (val_ui & 0x007fffff); + ex1 = exponent + multiplier; + ex1 = -ex1 + 25; + val_ui = (((val_ui & 0x007fffff) | 0x800000) >> ex1); + val_ui = exponent > 0 ? val : val_ui; + val_ui = exponent > 254 ? 0x7f800000 : val_ui; /*overflow*/ + val_ui = exponent < -multiplier ? 0 : val_ui; /*underflow*/ + val_ui = val_ui | sign; + + val_ui = dexp == 0 ? dval_ui : val_ui; + val_f = as_float(val_ui); + + val_f = isnan(x) | isinf(x) | val_x == 0 ? x : val_f; + return val_f; } #ifdef cl_khr_fp64 @@ -97,32 +99,32 @@ _CLC_DEF _CLC_OVERLOAD float __clc_ldexp(float x, int n) { #pragma OPENCL EXTENSION cl_khr_fp64 : enable _CLC_DEF _CLC_OVERLOAD double __clc_ldexp(double x, int n) { - long l = as_ulong(x); - int e = (l >> 52) & 0x7ff; - long s = l & 0x8000000000000000; + long l = as_ulong(x); + int e = (l >> 52) & 0x7ff; + long s = l & 0x8000000000000000; - ulong ux = as_ulong(x * 0x1.0p+53); - int de = ((int)(ux >> 52) & 0x7ff) - 53; - int c = e == 0; - e = c ? de: e; + ulong ux = as_ulong(x * 0x1.0p+53); + int de = ((int)(ux >> 52) & 0x7ff) - 53; + int c = e == 0; + e = c ? de : e; - ux = c ? ux : l; + ux = c ? ux : l; - int v = e + n; - v = clamp(v, -0x7ff, 0x7ff); + int v = e + n; + v = __clc_clamp(v, -0x7ff, 0x7ff); - ux &= ~EXPBITS_DP64; + ux &= ~EXPBITS_DP64; - double mr = as_double(ux | ((ulong)(v+53) << 52)); - mr = mr * 0x1.0p-53; + double mr = as_double(ux | ((ulong)(v + 53) << 52)); + mr = mr * 0x1.0p-53; - mr = v > 0 ? as_double(ux | ((ulong)v << 52)) : mr; + mr = v > 0 ? as_double(ux | ((ulong)v << 52)) : mr; - mr = v == 0x7ff ? as_double(s | PINFBITPATT_DP64) : mr; - mr = v < -53 ? as_double(s) : mr; + mr = v == 0x7ff ? as_double(s | PINFBITPATT_DP64) : mr; + mr = v < -53 ? as_double(s) : mr; - mr = ((n == 0) | isinf(x) | (x == 0) ) ? x : mr; - return mr; + mr = ((n == 0) | isinf(x) | (x == 0)) ? x : mr; + return mr; } #endif @@ -132,7 +134,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_ldexp(double x, int n) { #pragma OPENCL EXTENSION cl_khr_fp16 : enable _CLC_OVERLOAD _CLC_DEF half __clc_ldexp(half x, int n) { - return (half)__clc_ldexp((float)x, n); + return (half)__clc_ldexp((float)x, n); } _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __clc_ldexp, half, int); diff --git a/libclc/generic/lib/math/math.h b/libclc/generic/lib/math/math.h index 62a4c925db51b..b1f82b7991ea3 100644 --- a/libclc/generic/lib/math/math.h +++ b/libclc/generic/lib/math/math.h @@ -40,7 +40,7 @@ #if (defined __AMDGCN__ || defined __R600__) && !defined __HAS_FMAF__ #define HAVE_HW_FMA32() (0) -#elif defined CLC_SPIRV || defined CLC_SPIRV64 +#elif defined(CLC_SPIRV) bool __attribute__((noinline)) __clc_runtime_has_hw_fma32(void); #define HAVE_HW_FMA32() __clc_runtime_has_hw_fma32() #else diff --git a/libclc/generic/lib/shared/clamp.cl b/libclc/generic/lib/shared/clamp.cl index 51a009281be29..f470fc822f756 100644 --- a/libclc/generic/lib/shared/clamp.cl +++ b/libclc/generic/lib/shared/clamp.cl @@ -1,6 +1,5 @@ #include -#include -#include +#include #define __CLC_BODY #include diff --git a/libclc/generic/lib/shared/max.cl b/libclc/generic/lib/shared/max.cl index a753b702f658d..2266d5905afd6 100644 --- a/libclc/generic/lib/shared/max.cl +++ b/libclc/generic/lib/shared/max.cl @@ -1,6 +1,5 @@ #include -#include -#include +#include #define __CLC_BODY #include diff --git a/libclc/generic/lib/shared/max.inc b/libclc/generic/lib/shared/max.inc index 3c83e29c3f2d9..ec433a89c6e92 100644 --- a/libclc/generic/lib/shared/max.inc +++ b/libclc/generic/lib/shared/max.inc @@ -3,7 +3,8 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE max(__CLC_GENTYPE a, __CLC_GENTYPE b) { } #ifndef __CLC_SCALAR -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE max(__CLC_GENTYPE a, __CLC_SCALAR_GENTYPE b) { +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE max(__CLC_GENTYPE a, + __CLC_SCALAR_GENTYPE b) { return __clc_max(a, b); } #endif diff --git a/libclc/generic/lib/shared/min.cl b/libclc/generic/lib/shared/min.cl index 31c47872f6aa4..f5c4d57f4b8d8 100644 --- a/libclc/generic/lib/shared/min.cl +++ b/libclc/generic/lib/shared/min.cl @@ -1,6 +1,5 @@ #include -#include -#include +#include #define __CLC_BODY #include diff --git a/libclc/generic/lib/shared/min.inc b/libclc/generic/lib/shared/min.inc index f70956479d6df..6a00944cbe35e 100644 --- a/libclc/generic/lib/shared/min.inc +++ b/libclc/generic/lib/shared/min.inc @@ -3,7 +3,8 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE min(__CLC_GENTYPE a, __CLC_GENTYPE b) { } #ifndef __CLC_SCALAR -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE min(__CLC_GENTYPE a, __CLC_SCALAR_GENTYPE b) { +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE min(__CLC_GENTYPE a, + __CLC_SCALAR_GENTYPE b) { return __clc_min(a, b); } #endif diff --git a/libclc/generic/libspirv/SOURCES b/libclc/generic/libspirv/SOURCES index a222a1f7281a3..10d86bd5a67f8 100644 --- a/libclc/generic/libspirv/SOURCES +++ b/libclc/generic/libspirv/SOURCES @@ -39,9 +39,6 @@ core/integer/clc_mul_hi.cl core/integer/clc_rhadd.cl core/integer/clc_sub_sat.cl core/integer/clc_upsample.cl -core/shared/clc_clamp.cl -core/shared/clc_max.cl -core/shared/clc_min.cl geometric/cross.cl geometric/distance.cl geometric/dot.cl diff --git a/libclc/generic/libspirv/core/shared/clc_clamp.cl b/libclc/generic/libspirv/core/shared/clc_clamp.cl deleted file mode 100644 index d85c0a420f167..0000000000000 --- a/libclc/generic/libspirv/core/shared/clc_clamp.cl +++ /dev/null @@ -1,15 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include - -#define __CLC_BODY -#include - -#define __CLC_BODY -#include diff --git a/libclc/generic/libspirv/core/shared/clc_max.cl b/libclc/generic/libspirv/core/shared/clc_max.cl deleted file mode 100644 index 211c2c3ebba1b..0000000000000 --- a/libclc/generic/libspirv/core/shared/clc_max.cl +++ /dev/null @@ -1,15 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include - -#define __CLC_BODY -#include - -#define __CLC_BODY -#include diff --git a/libclc/generic/libspirv/core/shared/clc_min.cl b/libclc/generic/libspirv/core/shared/clc_min.cl deleted file mode 100644 index 9c7ea87286d8f..0000000000000 --- a/libclc/generic/libspirv/core/shared/clc_min.cl +++ /dev/null @@ -1,15 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include - -#define __CLC_BODY -#include - -#define __CLC_BODY -#include diff --git a/libclc/generic/libspirv/math/clc_hypot.cl b/libclc/generic/libspirv/math/clc_hypot.cl index b34a5e5107b4a..2e0c2737e1642 100644 --- a/libclc/generic/libspirv/math/clc_hypot.cl +++ b/libclc/generic/libspirv/math/clc_hypot.cl @@ -8,82 +8,84 @@ #include +#include +#include #include #include #include #include -// Returns sqrt(x*x + y*y) with no overflow or underflow unless the result warrants it -_CLC_DEF _CLC_OVERLOAD float __clc_hypot(float x, float y) -{ - uint ux = as_uint(x); - uint aux = ux & EXSIGNBIT_SP32; - uint uy = as_uint(y); - uint auy = uy & EXSIGNBIT_SP32; - float retval; - int c = aux > auy; - ux = c ? aux : auy; - uy = c ? auy : aux; - - int xexp = __spirv_ocl_s_clamp( - (int)(ux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32, -126, 126); - float fx_exp = as_float((xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); - float fi_exp = as_float((-xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); - float fx = as_float(ux) * fi_exp; - float fy = as_float(uy) * fi_exp; - retval = __spirv_ocl_sqrt(__spirv_ocl_mad(fx, fx, fy * fy)) * fx_exp; - - retval = ux > PINFBITPATT_SP32 || uy == 0 ? as_float(ux) : retval; - retval = ux == PINFBITPATT_SP32 || uy == PINFBITPATT_SP32 - ? as_float(PINFBITPATT_SP32) - : retval; - return retval; +// Returns sqrt(x*x + y*y) with no overflow or underflow unless the result +// warrants it +_CLC_DEF _CLC_OVERLOAD float __clc_hypot(float x, float y) { + uint ux = as_uint(x); + uint aux = ux & EXSIGNBIT_SP32; + uint uy = as_uint(y); + uint auy = uy & EXSIGNBIT_SP32; + float retval; + int c = aux > auy; + ux = c ? aux : auy; + uy = c ? auy : aux; + + int xexp = + __clc_clamp((int)(ux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32, -126, 126); + float fx_exp = as_float((xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); + float fi_exp = as_float((-xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); + float fx = as_float(ux) * fi_exp; + float fy = as_float(uy) * fi_exp; + retval = __spirv_ocl_sqrt(__spirv_ocl_mad(fx, fx, fy * fy)) * fx_exp; + + retval = ux > PINFBITPATT_SP32 || uy == 0 ? as_float(ux) : retval; + retval = ux == PINFBITPATT_SP32 || uy == PINFBITPATT_SP32 + ? as_float(PINFBITPATT_SP32) + : retval; + return retval; } _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_hypot, float, float) #ifdef cl_khr_fp64 -_CLC_DEF _CLC_OVERLOAD double __clc_hypot(double x, double y) -{ - ulong ux = as_ulong(x) & ~SIGNBIT_DP64; - int xexp = ux >> EXPSHIFTBITS_DP64; - x = as_double(ux); +_CLC_DEF _CLC_OVERLOAD double __clc_hypot(double x, double y) { + ulong ux = as_ulong(x) & ~SIGNBIT_DP64; + int xexp = ux >> EXPSHIFTBITS_DP64; + x = as_double(ux); - ulong uy = as_ulong(y) & ~SIGNBIT_DP64; - int yexp = uy >> EXPSHIFTBITS_DP64; - y = as_double(uy); + ulong uy = as_ulong(y) & ~SIGNBIT_DP64; + int yexp = uy >> EXPSHIFTBITS_DP64; + y = as_double(uy); - int c = xexp > EXPBIAS_DP64 + 500 | yexp > EXPBIAS_DP64 + 500; - double preadjust = c ? 0x1.0p-600 : 1.0; - double postadjust = c ? 0x1.0p+600 : 1.0; + int c = xexp > EXPBIAS_DP64 + 500 | yexp > EXPBIAS_DP64 + 500; + double preadjust = c ? 0x1.0p-600 : 1.0; + double postadjust = c ? 0x1.0p+600 : 1.0; - c = xexp < EXPBIAS_DP64 - 500 | yexp < EXPBIAS_DP64 - 500; - preadjust = c ? 0x1.0p+600 : preadjust; - postadjust = c ? 0x1.0p-600 : postadjust; + c = xexp < EXPBIAS_DP64 - 500 | yexp < EXPBIAS_DP64 - 500; + preadjust = c ? 0x1.0p+600 : preadjust; + postadjust = c ? 0x1.0p-600 : postadjust; - double ax = x * preadjust; - double ay = y * preadjust; + double ax = x * preadjust; + double ay = y * preadjust; - // The post adjust may overflow, but this can't be avoided in any case - double r = __spirv_ocl_sqrt(__spirv_ocl_fma(ax, ax, ay * ay)) * postadjust; + // The post adjust may overflow, but this can't be avoided in any case + double r = __spirv_ocl_sqrt(__spirv_ocl_fma(ax, ax, ay * ay)) * postadjust; - // If the difference in exponents between x and y is large - double s = x + y; - c = __spirv_ocl_s_abs(xexp - yexp) > MANTLENGTH_DP64 + 1; - r = c ? s : r; + // If the difference in exponents between x and y is large + double s = x + y; + c = __spirv_ocl_s_abs(xexp - yexp) > MANTLENGTH_DP64 + 1; + r = c ? s : r; - // Check for NaN - //c = x != x | y != y; - c = __spirv_IsNan(x) | __spirv_IsNan(y); - r = c ? as_double(QNANBITPATT_DP64) : r; + // Check for NaN + // c = x != x | y != y; + c = __spirv_IsNan(x) | __spirv_IsNan(y); + r = c ? as_double(QNANBITPATT_DP64) : r; - // If either is Inf, we must return Inf - c = x == as_double(PINFBITPATT_DP64) | y == as_double(PINFBITPATT_DP64); - r = c ? as_double(PINFBITPATT_DP64) : r; + // If either is Inf, we must return Inf + c = x == as_double(PINFBITPATT_DP64) | y == as_double(PINFBITPATT_DP64); + r = c ? as_double(PINFBITPATT_DP64) : r; - return r; + return r; } -_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_hypot, double, double) +_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_hypot, double, + double) #endif #ifdef cl_khr_fp16 From c13b071d00a7d28426ff4ebc913a153054a6f00c Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Tue, 5 Nov 2024 17:13:42 +0000 Subject: [PATCH 04/18] [SYCL][COMPAT] Extended vectorized_binary support to logical operators (#15759) We add support for logical operators with `vectorized_binary` as well as the relevant unit-tests. --- sycl/include/syclcompat/math.hpp | 44 ++++++++++- .../syclcompat/math/math_vectorized.cpp | 74 +++++++++++++++++++ 2 files changed, 114 insertions(+), 4 deletions(-) diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp index b0b8a93d6697c..b7842a8fd99e4 100644 --- a/sycl/include/syclcompat/math.hpp +++ b/sycl/include/syclcompat/math.hpp @@ -118,6 +118,37 @@ class vectorized_binary { } }; +// Vectorized_binary for logical operations +template +class vectorized_binary< + VecT, BinaryOperation, + std::enable_if_t()( + std::declval(), + std::declval()))>>> { +public: + inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op) { + unsigned result = 0; + constexpr size_t elem_size = 8 * sizeof(typename VecT::element_type); + static_assert(elem_size < 32, + "Vector element size must be less than 4 bytes"); + constexpr unsigned bool_mask = (1U << elem_size) - 1; + + for (size_t i = 0; i < a.size(); ++i) { + bool comp_result = binary_op(a[i], b[i]); + result |= (comp_result ? bool_mask : 0U) << (i * elem_size); + } + + VecT v4; + for (size_t i = 0; i < v4.size(); ++i) { + v4[i] = static_cast( + (result >> (i * elem_size)) & bool_mask); + } + + return v4; + } +}; + /// Extend the 'val' to 'bit' size, zero extend for unsigned int and signed /// extend for signed int. Returns a signed integer type. template @@ -1040,7 +1071,7 @@ struct average { } // namespace detail -/// Compute vectorized binary operation value for two values, with each value +/// Compute vectorized binary operation value for two/four values, with each /// treated as a vector type \p VecT. /// \tparam [in] VecT The type of the vector /// \tparam [in] BinaryOperation The binary operation class @@ -1052,14 +1083,19 @@ struct average { template inline unsigned vectorized_binary(unsigned a, unsigned b, const BinaryOperation binary_op, - bool need_relu = false) { + [[maybe_unused]] bool need_relu = false) { sycl::vec v0{a}, v1{b}; auto v2 = v0.as(); auto v3 = v1.as(); auto v4 = detail::vectorized_binary()(v2, v3, binary_op); - if (need_relu) - v4 = relu(v4); + if constexpr (!std::is_same_v< + bool, decltype(std::declval()( + std::declval(), + std::declval()))>) { + if (need_relu) + v4 = relu(v4); + } v0 = v4.template as>(); return v0; } diff --git a/sycl/test-e2e/syclcompat/math/math_vectorized.cpp b/sycl/test-e2e/syclcompat/math/math_vectorized.cpp index 9c57c88ce445b..630d4b9c9f154 100644 --- a/sycl/test-e2e/syclcompat/math/math_vectorized.cpp +++ b/sycl/test-e2e/syclcompat/math/math_vectorized.cpp @@ -48,6 +48,18 @@ void test_vectorized_binary(unsigned op1, unsigned op2, unsigned expected, op1, op2, expected, need_relu); } +template +void test_vectorized_binary_logical(unsigned op1, unsigned op2, + unsigned expected) { + std::cout << __PRETTY_FUNCTION__ << std::endl; + constexpr syclcompat::dim3 grid{1}; + constexpr syclcompat::dim3 threads{1}; + + BinaryOpTestLauncher(grid, threads) + .template launch_test>( + op1, op2, expected, false); +} + template void vectorized_unary_kernel(unsigned *a, unsigned *r) { *r = syclcompat::vectorized_unary(*a, UnaryOp()); @@ -203,5 +215,67 @@ int main() { test_vectorized_binary_with_pred( 0x80010002, 0x00040002, 0x00040002, false, true); + // Logical Binary Operators v2 + test_vectorized_binary_logical, sycl::short2>( + 0xFFF00002, 0xFFF00001, 0xFFFF0000); + test_vectorized_binary_logical, sycl::short2>( + 0x0001F00F, 0x0003F00F, 0x0000FFFF); + + test_vectorized_binary_logical, sycl::short2>( + 0xFFF00002, 0xFFF00001, 0x0000FFFF); + test_vectorized_binary_logical, sycl::short2>( + 0x0001F00F, 0x0003F00F, 0xFFFF0000); + + test_vectorized_binary_logical, sycl::short2>( + 0xFFF00002, 0xFFF00001, 0xFFFFFFFF); + test_vectorized_binary_logical, sycl::short2>( + 0x0001F00F, 0x0003F001, 0x0000FFFF); + + test_vectorized_binary_logical, sycl::short2>( + 0xFFF00002, 0xFFF00001, 0x0000FFFF); + test_vectorized_binary_logical, sycl::short2>( + 0x0003F00F, 0x0001F00F, 0xFFFF0000); + + test_vectorized_binary_logical, sycl::short2>( + 0xFFF00001, 0xF0F00002, 0x0000FFFF); + test_vectorized_binary_logical, sycl::short2>( + 0x0001FF0F, 0x0003F00F, 0xFFFF0000); + + test_vectorized_binary_logical, sycl::short2>( + 0xFFF00001, 0xFFF00002, 0x0000FFFF); + test_vectorized_binary_logical, sycl::short2>( + 0x0001F00F, 0x0003F00F, 0xFFFF0000); + + // Logical Binary Operators v4 + test_vectorized_binary_logical, sycl::uchar4>( + 0x0001F00F, 0x0003F00F, 0xFF00FFFF); + test_vectorized_binary_logical, sycl::uchar4>( + 0x0102F0F0, 0x0202F0FF, 0x00FFFF00); + + test_vectorized_binary_logical, sycl::uchar4>( + 0x0001F00F, 0xFF01F10F, 0xFF00FF00); + test_vectorized_binary_logical, sycl::uchar4>( + 0x0201F0F0, 0x0202F0FF, 0x00FF00FF); + + test_vectorized_binary_logical, sycl::uchar4>( + 0xFFF00002, 0xFFF10101, 0xFF0000FF); + test_vectorized_binary_logical, sycl::uchar4>( + 0x0001F1F0, 0x0103F001, 0x0000FFFF); + + test_vectorized_binary_logical, sycl::uchar4>( + 0xFFF00002, 0xF0F00001, 0xFF0000FF); + test_vectorized_binary_logical, sycl::uchar4>( + 0x0103F0F1, 0x0102F0F0, 0x00FF00FF); + + test_vectorized_binary_logical, sycl::uchar4>( + 0xFFF10001, 0xFFF00100, 0xFF00FF00); + test_vectorized_binary_logical, sycl::uchar4>( + 0x0101F1F0, 0x0003F0F1, 0x00FF00FF); + + test_vectorized_binary_logical, sycl::uchar4>( + 0xFFF10001, 0xFFF20100, 0x00FFFF00); + test_vectorized_binary_logical, sycl::uchar4>( + 0x0101F1F0, 0x0102F1F1, 0x00FF00FF); + return 0; } From 0eb08d9931b09c25fd72b73daf3e99f5f76e0a1e Mon Sep 17 00:00:00 2001 From: David Garcia Orozco Date: Tue, 5 Nov 2024 10:26:46 -0700 Subject: [PATCH 05/18] [SYCL][E2E] Replace `uint` with `unsigned int` in `Matrix/slm_utils.hpp` (#15981) Using `uint` here caused failures in internal testing on windows due to deprecated warnings. This patch replaces it with `unsigned int`. --- sycl/test-e2e/Matrix/slm_utils.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/test-e2e/Matrix/slm_utils.hpp b/sycl/test-e2e/Matrix/slm_utils.hpp index 6618f704754d5..1c481336c01f4 100644 --- a/sycl/test-e2e/Matrix/slm_utils.hpp +++ b/sycl/test-e2e/Matrix/slm_utils.hpp @@ -49,13 +49,13 @@ slm_read_write(multi_ptr pA, // NCache2*KCache2/(SGs*SG_SIZE) = 16 size_t elemsPerLoadB = NCache2 * KCache2 / (SGs * sgSize); size_t sgsPerRow = (NCache2 * vnniFactor) / (elemsPerLoadB * sgSize); - size_t GlOffsetB = - (k2 * (KCache2 / vnniFactor) + (uint)(sg.get_group_id() / sgsPerRow)) * - (colsB * vnniFactor) + - n2 * NCache2 * vnniFactor + - (sg.get_group_id() % sgsPerRow) * (elemsPerLoadB * sgSize); + size_t GlOffsetB = (k2 * (KCache2 / vnniFactor) + + (unsigned int)(sg.get_group_id() / sgsPerRow)) * + (colsB * vnniFactor) + + n2 * NCache2 * vnniFactor + + (sg.get_group_id() % sgsPerRow) * (elemsPerLoadB * sgSize); size_t LocOffsetB = - ((uint)(sg.get_group_id() / sgsPerRow)) * NCache2 * vnniFactor + + ((unsigned int)(sg.get_group_id() / sgsPerRow)) * NCache2 * vnniFactor + (sg.get_group_id() % sgsPerRow) * elemsPerLoadB * sgSize; auto SrcB = pB + GlOffsetB; auto DstB = From 22fcde27d4380da694cfb9d85fc6add6e9c35903 Mon Sep 17 00:00:00 2001 From: Dounia Khaldi Date: Tue, 5 Nov 2024 11:28:27 -0600 Subject: [PATCH 06/18] [SYCL][Joint Matrix] Add more cases in common JM tests functions (#15712) --- sycl/source/detail/device_info.hpp | 13 +++++++------ sycl/test-e2e/Matrix/common.hpp | 6 +++--- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index 2551e6c03b635..38e5988a37456 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -841,8 +841,8 @@ struct get_device_info_impl< }; else if ((architecture::intel_gpu_pvc == DeviceArch) || (architecture::intel_gpu_bmg_g21 == DeviceArch) || - (architecture::intel_gpu_lnl_m == DeviceArch)) - return { + (architecture::intel_gpu_lnl_m == DeviceArch)) { + std::vector pvc_combs = { {8, 0, 0, 0, 16, 32, matrix_type::uint8, matrix_type::uint8, matrix_type::sint32, matrix_type::sint32}, {8, 0, 0, 0, 16, 32, matrix_type::uint8, matrix_type::sint8, @@ -950,10 +950,11 @@ struct get_device_info_impl< {8, 0, 0, 0, 16, 8, matrix_type::tf32, matrix_type::tf32, matrix_type::fp32, matrix_type::fp32}, }; - else if ((architecture::intel_gpu_dg2_g10 == DeviceArch) || - (architecture::intel_gpu_dg2_g11 == DeviceArch) || - (architecture::intel_gpu_dg2_g12 == DeviceArch) || - (architecture::intel_gpu_arl_h == DeviceArch)) + return pvc_combs; + } else if ((architecture::intel_gpu_dg2_g10 == DeviceArch) || + (architecture::intel_gpu_dg2_g11 == DeviceArch) || + (architecture::intel_gpu_dg2_g12 == DeviceArch) || + (architecture::intel_gpu_arl_h == DeviceArch)) return { {8, 0, 0, 0, 8, 32, matrix_type::uint8, matrix_type::uint8, matrix_type::sint32, matrix_type::sint32}, diff --git a/sycl/test-e2e/Matrix/common.hpp b/sycl/test-e2e/Matrix/common.hpp index 8d2e460116353..90f5508d97cf8 100644 --- a/sycl/test-e2e/Matrix/common.hpp +++ b/sycl/test-e2e/Matrix/common.hpp @@ -63,8 +63,7 @@ void matrix_multiply_ref(Ta *A, Tb *B, Tc *C, int M, int N, int K, if constexpr (std::is_same_v && std::is_same_v) acc += make_fp32(va[i]) * make_fp32(vb[i]); - else if constexpr (std::is_same_v && - std::is_same_v) + else if constexpr (std::is_same_v) acc += (float)va[i] * (float)vb[i]; else if constexpr (std::is_same_v && std::is_same_v || @@ -135,7 +134,8 @@ void matrix_rand(unsigned int rows, unsigned int cols, T *src, T val) { for (unsigned int i = 0; i < rows; i++) { for (unsigned int j = 0; j < cols; j++) { - if constexpr (std::is_same_v || std::is_same_v || + if constexpr (std::is_same_v || + std::is_same_v || std::is_same_v || std::is_same_v) { src[i * cols + j] = T(fdistr(dev)); } else if constexpr (std::is_integral_v) { From e15a0ab7606ab37030749e91caec9d790dc738ed Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Tue, 5 Nov 2024 17:44:48 +0000 Subject: [PATCH 07/18] [UR] Bump UR tag to get L0 default init refactor (#15987) Bump UR to pull in https://github.com/oneapi-src/unified-runtime/pull/2260 --- sycl/cmake/modules/UnifiedRuntimeTag.cmake | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 01bd0cd4d9586..b81556a92ec2c 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit b0a9e2be61ad42d3447f1f246120ab25119a03e0 -# Merge: fa8cc8ec a0cf2ce2 +# commit f01741af022cfe82afcb026b9aa0be251eb6a497 +# Merge: 004d2474 85bb5f62 # Author: Callum Fare -# Date: Mon Nov 4 10:00:08 2024 +0000 -# Merge pull request #2165 from aarongreig/aaron/makeUSMPoolsOptional -# Make USM pools optional with a device query to report support. -set(UNIFIED_RUNTIME_TAG b0a9e2be61ad42d3447f1f246120ab25119a03e0) +# Date: Tue Nov 5 13:39:53 2024 +0000 +# Merge pull request #2260 from nrspruit/refactor_l0_default_init +# [L0] Refactor to remove default constructor inits +set(UNIFIED_RUNTIME_TAG f01741af022cfe82afcb026b9aa0be251eb6a497) From 4d000877c2b16a43ef2dba5f80d0fbd73423b7bd Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Tue, 5 Nov 2024 11:46:43 -0800 Subject: [PATCH 08/18] [Clang][NFC] Remove duplicated subdirectory (#15906) clang-nvlink-wrapper is added twice in clang/tools/CMakeLists.txt. --- clang/tools/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/tools/CMakeLists.txt b/clang/tools/CMakeLists.txt index adbb108129a25..9f238dd44109d 100644 --- a/clang/tools/CMakeLists.txt +++ b/clang/tools/CMakeLists.txt @@ -7,7 +7,6 @@ add_clang_subdirectory(clang-diff) add_clang_subdirectory(clang-format) add_clang_subdirectory(clang-fuzzer) add_clang_subdirectory(clang-import-test) -add_clang_subdirectory(clang-nvlink-wrapper) add_clang_subdirectory(clang-linker-wrapper) add_clang_subdirectory(clang-nvlink-wrapper) add_clang_subdirectory(clang-offload-packager) From 52084843bd1bdd153f1e0800371a42213fd053de Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Wed, 6 Nov 2024 08:51:58 +0100 Subject: [PATCH 09/18] [SYCL] Lift restrictions on free-function kernels when compiling at runtime (#15892) In order to be able to generate correct and complete information for the integration header, the current implementation places some restrictions on free-function kernels and their parameters. For example, parameters of free function kernels need to be forward-declarable. However, when compiling SYCL code at runtime (RTC), e.g., through the `kernel_compiler` extension, host code is typically not relevant, so the integration header is not as relevant and some restrictions on free-function kernels can be lifted. This PR introduces a `-fsycl-rtc-mode` flag (and it's negative equivalent) to deactivate some restrictions on free-function kernels and omit some information for free-function kernels from the integration header. --------- Signed-off-by: Lukas Sommer --- clang/include/clang/Basic/LangOptions.def | 1 + clang/include/clang/Driver/Options.td | 5 ++ clang/lib/Driver/ToolChains/Clang.cpp | 3 + clang/lib/Sema/SemaSYCL.cpp | 12 +++ .../free_function_int_header_rtc_mode.cpp | 80 +++++++++++++++++++ clang/test/Driver/sycl-rtc-mode.cpp | 32 ++++++++ 6 files changed, 133 insertions(+) create mode 100644 clang/test/CodeGenSYCL/free_function_int_header_rtc_mode.cpp create mode 100644 clang/test/Driver/sycl-rtc-mode.cpp diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 2b99cc35e0f1d..8bf06d5a3a8e0 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -326,6 +326,7 @@ LANGOPT(SYCLExperimentalRangeRounding, 1, 0, "Use experimental parallel for rang LANGOPT(SYCLEnableIntHeaderDiags, 1, 0, "Enable diagnostics that require the " "SYCL integration header") LANGOPT(SYCLIsNativeCPU , 1, 0, "Generate code for SYCL Native CPU") +LANGOPT(SYCLRTCMode, 1, 0, "Compile in RTC mode") LANGOPT(HIPUseNewLaunchAPI, 1, 0, "Use new kernel launching API for HIP") LANGOPT(OffloadUniformBlock, 1, 0, "Assume that kernels are launched with uniform block sizes (default true for CUDA/HIP and false otherwise)") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 8ff41673575ac..053bf5e63d9ad 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6877,6 +6877,11 @@ defm sycl_esimd_force_stateless_mem : BoolFOption<"sycl-esimd-force-stateless-me NegFlag, BothFlags<[], [ClangOption, CLOption, CC1Option], "">>; +defm sycl_rtc_mode: BoolFOption<"sycl-rtc-mode", + LangOpts<"SYCLRTCMode">, DefaultFalse, + PosFlag, + NegFlag, + BothFlags<[HelpHidden], [ClangOption, CC1Option], " RTC mode in SYCL.">>; // TODO: Remove this option once ESIMD headers are updated to // guard vectors to be device only. def fno_sycl_esimd_build_host_code : Flag<["-"], "fno-sycl-esimd-build-host-code">, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index bb3492887c8e9..3fa80d479936f 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5589,6 +5589,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.AddLastArg(CmdArgs, options::OPT_fsycl_decompose_functor, options::OPT_fno_sycl_decompose_functor); + Args.AddLastArg(CmdArgs, options::OPT_fsycl_rtc_mode, + options::OPT_fno_sycl_rtc_mode); + // Forward -fsycl-instrument-device-code option to cc1. This option will // only be used for SPIR/SPIR-V based targets. if (Triple.isSPIROrSPIRV()) diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 996443bcbc8fd..e45b038273d77 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -2040,6 +2040,11 @@ class SyclKernelFieldChecker : public SyclKernelFieldHandler { } bool handleStructType(ParmVarDecl *PD, QualType ParamTy) final { + if (SemaSYCLRef.getLangOpts().SYCLRTCMode) { + // When compiling in RTC mode, the restriction regarding forward + // declarations doesn't apply, as we don't need the integration header. + return isValid(); + } CXXRecordDecl *RD = ParamTy->getAsCXXRecordDecl(); // For free functions all struct/class kernel arguments are forward declared // in integration header, that adds additional restrictions for kernel @@ -6453,6 +6458,13 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) { O << "} // namespace _V1\n"; O << "} // namespace sycl\n"; + // The rest of this function only applies to free-function kernels. However, + // in RTC mode, we do not need integration header information for + // free-function kernels, so we can return early here. + if (S.getLangOpts().SYCLRTCMode) { + return; + } + unsigned ShimCounter = 1; int FreeFunctionCount = 0; for (const KernelDesc &K : KernelDescs) { diff --git a/clang/test/CodeGenSYCL/free_function_int_header_rtc_mode.cpp b/clang/test/CodeGenSYCL/free_function_int_header_rtc_mode.cpp new file mode 100644 index 0000000000000..214318b563fa8 --- /dev/null +++ b/clang/test/CodeGenSYCL/free_function_int_header_rtc_mode.cpp @@ -0,0 +1,80 @@ +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple spir64-unknown-unknown -sycl-std=2020 -fsycl-rtc-mode -fsycl-int-header=%t.rtc.h %s +// RUN: FileCheck -input-file=%t.rtc.h --check-prefixes=CHECK,CHECK-RTC %s + +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple spir64-unknown-unknown -sycl-std=2020 -fno-sycl-rtc-mode -fsycl-int-header=%t.nortc.h %s +// RUN: FileCheck -input-file=%t.nortc.h --check-prefixes=CHECK,CHECK-NORTC %s + +// This test checks that free-function kernel information is included or +// excluded from the integration header, depending on the '-fsycl-rtc-mode' +// flag. + +#include "sycl.hpp" + +[[__sycl_detail__::add_ir_attributes_function("sycl-single-task-kernel", 2)]] +void free_function_single(int* ptr, int start, int end){ + for(int i = start; i < end; ++i){ + ptr[i] = start + 66; + } +} + +[[__sycl_detail__::add_ir_attributes_function("sycl-nd-range-kernel", 2)]] +void free_function_nd_range(int* ptr, int start, int end){ + for(int i = start; i < end; ++i){ + ptr[i] = start + 66; + } +} + +template +__attribute__((sycl_kernel)) void kernel(const KernelFunc &kernelFunc){ + kernelFunc(); +} + +int main(){ + sycl::accessor accessorA; + kernel( + [=]() { + accessorA.use(); + }); + return 0; +} + + +// CHECK: const char* const kernel_names[] = { +// CHECK-NEXT: "{{.*}}__sycl_kernel_free_function_singlePiii", +// CHECK-NEXT: "{{.*}}__sycl_kernel_free_function_nd_rangePiii", +// CHECK-NEXT: "{{.*}}Kernel_Function", + + +// CHECK: static constexpr const char* getName() { return "{{.*}}__sycl_kernel_free_function_singlePiii"; } +// CHECK: static constexpr const char* getName() { return "{{.*}}__sycl_kernel_free_function_nd_rangePiii"; } +// CHECK: static constexpr const char* getName() { return "{{.*}}Kernel_Function"; } + +// CHECK-RTC-NOT: free_function_single_kernel +// CHECK-RTC-NOT: free_function_nd_range + +// CHECK-NORTC: void free_function_single(int *ptr, int start, int end); +// CHECK-NORTC: static constexpr auto __sycl_shim[[#FIRST:]]() +// CHECK-NORTC-NEXT: return (void (*)(int *, int, int))free_function_single; + +// CHECK-NORTC: struct ext::oneapi::experimental::is_kernel<__sycl_shim[[#FIRST]]()> { +// CHECK-NORTC-NEXT: static constexpr bool value = true; + +// CHECK-NORTC: struct ext::oneapi::experimental::is_single_task_kernel<__sycl_shim[[#FIRST]]()> { +// CHECK-NORTC-NEXT: static constexpr bool value = true; + + +// CHECK-NORTC: void free_function_nd_range(int *ptr, int start, int end); +// CHECK-NORTC: static constexpr auto __sycl_shim[[#SECOND:]]() { +// CHECK-NORTC-NEXT: return (void (*)(int *, int, int))free_function_nd_range; + +// CHECK-NORTC: struct ext::oneapi::experimental::is_kernel<__sycl_shim[[#SECOND]]()> { +// CHECK-NORTC-NEXT: static constexpr bool value = true; + +// CHECK-NORTC: struct ext::oneapi::experimental::is_nd_range_kernel<__sycl_shim2(), 2> { +// CHECK-NORTC-NEXT: static constexpr bool value = true; + +// CHECK-NORTC: kernel_id ext::oneapi::experimental::get_kernel_id<__sycl_shim[[#FIRST]]()>() { +// CHECK-NORTC-NEXT: return sycl::detail::get_kernel_id_impl(std::string_view{"{{.*}}__sycl_kernel_free_function_singlePiii"}); + +// CHECK-NORTC: kernel_id ext::oneapi::experimental::get_kernel_id<__sycl_shim[[#SECOND]]()>() { +// CHECK-NORTC-NEXT: return sycl::detail::get_kernel_id_impl(std::string_view{"{{.*}}__sycl_kernel_free_function_nd_rangePiii"}); diff --git a/clang/test/Driver/sycl-rtc-mode.cpp b/clang/test/Driver/sycl-rtc-mode.cpp new file mode 100644 index 0000000000000..4a68c97db6409 --- /dev/null +++ b/clang/test/Driver/sycl-rtc-mode.cpp @@ -0,0 +1,32 @@ +/// +/// Perform driver test for SYCL RTC mode. +/// + +/// Check that the '-fsycl-rtc-mode' is correctly forwarded to the device +/// compilation and only to the device compilation. + +// RUN: %clangxx -fsycl -fsycl-rtc-mode --no-offload-new-driver %s -### 2>&1 \ +// RUN: | FileCheck %s + +// RUN: %clangxx -fsycl -fsycl-rtc-mode --offload-new-driver %s -### 2>&1 \ +// RUN: | FileCheck %s + +// CHECK: clang{{.*}} "-fsycl-is-device" +// CHECK-SAME: -fsycl-rtc-mode +// CHECK: clang{{.*}} "-fsycl-is-host" +// CHECK-NOT: -fsycl-rtc-mode + + +/// Check that the '-fno-sycl-rtc-mode' is correctly forwarded to the device +/// compilation and only to the device compilation. + +// RUN: %clangxx -fsycl -fno-sycl-rtc-mode --no-offload-new-driver %s -### 2>&1 \ +// RUN: | FileCheck %s --check-prefix=NEGATIVE + +// RUN: %clangxx -fsycl -fno-sycl-rtc-mode --offload-new-driver %s -### 2>&1 \ +// RUN: | FileCheck %s --check-prefix=NEGATIVE + +// NEGATIVE: clang{{.*}} "-fsycl-is-device" +// NEGATIVE-SAME: -fno-sycl-rtc-mode +// NEGATIVE: clang{{.*}} "-fsycl-is-host" +// NEGATIVE-NOT: -fsycl-rtc-mode From f77c81f4de23500dc9c7657f89d2b99cccdcaff3 Mon Sep 17 00:00:00 2001 From: Nikita Kornev Date: Wed, 6 Nov 2024 10:20:27 +0000 Subject: [PATCH 10/18] [SYCL-CTS][CI] Turn off test_multi_ptr (#15989) Waiting for https://github.com/intel/llvm/pull/15389. The test also needs to be updated then. --- devops/cts_exclude_filter_L0_GPU | 2 ++ devops/cts_exclude_filter_OCL_CPU | 2 ++ 2 files changed, 4 insertions(+) diff --git a/devops/cts_exclude_filter_L0_GPU b/devops/cts_exclude_filter_L0_GPU index dfd8b4623bae1..e11eb8767ed33 100644 --- a/devops/cts_exclude_filter_L0_GPU +++ b/devops/cts_exclude_filter_L0_GPU @@ -3,3 +3,5 @@ kernel_bundle marray # fix: https://github.com/KhronosGroup/SYCL-CTS/pull/964 accessor_legacy +# CMPLRLLVM-61839 +multi_ptr diff --git a/devops/cts_exclude_filter_OCL_CPU b/devops/cts_exclude_filter_OCL_CPU index 24f4a5c9eb41b..d8317bae46917 100644 --- a/devops/cts_exclude_filter_OCL_CPU +++ b/devops/cts_exclude_filter_OCL_CPU @@ -7,3 +7,5 @@ math_builtin_api hierarchical # fix: https://github.com/KhronosGroup/SYCL-CTS/pull/964 accessor_legacy +# CMPLRLLVM-61839 +multi_ptr From 37b339e2992b6807f09295f90ec85ce2fa20bb0e Mon Sep 17 00:00:00 2001 From: Justin Cai Date: Wed, 6 Nov 2024 03:06:57 -0800 Subject: [PATCH 11/18] [SYCL] Update aspect propagation for virtual functions (#15703) Spec & design: https://github.com/intel/llvm/pull/10540 --- .../SYCLLowerIR/SYCLPropagateAspectsUsage.cpp | 39 +++++++++++++- .../VirtualFunctions/virtual-functions-1.ll | 20 ++++++++ .../VirtualFunctions/virtual-functions-2.ll | 36 +++++++++++++ .../VirtualFunctions/virtual-functions-3.ll | 51 +++++++++++++++++++ .../VirtualFunctions/virtual-functions-4.ll | 26 ++++++++++ 5 files changed, 171 insertions(+), 1 deletion(-) create mode 100644 llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-1.ll create mode 100644 llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-2.ll create mode 100644 llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-3.ll create mode 100644 llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-4.ll diff --git a/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp b/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp index 6d8c248a81607..126a03bdf03bf 100644 --- a/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp +++ b/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp @@ -647,6 +647,38 @@ void setSyclFixedTargetsMD(const std::vector &EntryPoints, F->setMetadata("sycl_fixed_targets", MDN); } +void collectVirtualFunctionSetInfo( + Function &F, StringMap> &VirtualFunctionSets) { + if (!F.hasFnAttribute("indirectly-callable")) + return; + Attribute IndirectlyCallableAttr = F.getFnAttribute("indirectly-callable"); + StringRef SetName = IndirectlyCallableAttr.getValueAsString(); + VirtualFunctionSets[SetName].push_back(&F); +} + +// For each set S of virtual functions that F declares, +// propagate S through the CG and then add the aspects +// used by S to F. +void processDeclaredVirtualFunctionSets( + Function *F, CallGraphTy &CG, FunctionToAspectsMapTy &AspectsMap, + SmallPtrSet &Visited, + StringMap> &VirtualFunctionSets) { + if (!F->hasFnAttribute("calls-indirectly")) + return; + Attribute CallsIndirectlyAttr = F->getFnAttribute("calls-indirectly"); + SmallVector DeclaredVirtualFunctionSetNames; + CallsIndirectlyAttr.getValueAsString().split(DeclaredVirtualFunctionSetNames, + ","); + auto &AspectsF = AspectsMap[F]; + for (auto Name : DeclaredVirtualFunctionSetNames) { + for (auto VFn : VirtualFunctionSets[Name]) { + propagateAspectsThroughCG(VFn, CG, AspectsMap, Visited); + for (auto Aspect : AspectsMap[VFn]) + AspectsF.insert(Aspect); + } + } +} + /// Returns a map of functions with corresponding used aspects. std::pair buildFunctionsToAspectsMap(Module &M, TypeToAspectsMapTy &TypesWithAspects, @@ -655,16 +687,21 @@ buildFunctionsToAspectsMap(Module &M, TypeToAspectsMapTy &TypesWithAspects, bool ValidateAspects, bool FP64ConvEmu) { FunctionToAspectsMapTy FunctionToUsedAspects; FunctionToAspectsMapTy FunctionToDeclaredAspects; + StringMap> VirtualFunctionSets; CallGraphTy CG; for (Function &F : M.functions()) { processFunction(F, FunctionToUsedAspects, FunctionToDeclaredAspects, TypesWithAspects, CG, AspectValues, FP64ConvEmu); + collectVirtualFunctionSetInfo(F, VirtualFunctionSets); } SmallPtrSet Visited; - for (Function *F : EntryPoints) + for (Function *F : EntryPoints) { propagateAspectsThroughCG(F, CG, FunctionToUsedAspects, Visited); + processDeclaredVirtualFunctionSets(F, CG, FunctionToUsedAspects, Visited, + VirtualFunctionSets); + } if (ValidateAspects) validateUsedAspectsForFunctions(FunctionToUsedAspects, AspectValues, diff --git a/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-1.ll b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-1.ll new file mode 100644 index 0000000000000..709ca33eae3b0 --- /dev/null +++ b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-1.ll @@ -0,0 +1,20 @@ +; RUN: opt -passes=sycl-propagate-aspects-usage < %s -S | FileCheck %s + +; CHECK: @vfn() #0 !sycl_used_aspects ![[#aspects:]] +define spir_func void @vfn() #0 { + %tmp = alloca double + ret void +} + +; CHECK: @foo() #1 !sycl_used_aspects ![[#aspects]] +define spir_kernel void @foo() #1 { + ret void +} + +; CHECK: ![[#aspects]] = !{i32 6} + +attributes #0 = { "indirectly-callable"="_ZTSv" } +attributes #1 = { "calls-indirectly"="_ZTSv" } + +!sycl_aspects = !{!0} +!0 = !{!"fp64", i32 6} diff --git a/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-2.ll b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-2.ll new file mode 100644 index 0000000000000..ae600413378f1 --- /dev/null +++ b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-2.ll @@ -0,0 +1,36 @@ +; RUN: opt -passes=sycl-propagate-aspects-usage < %s -S | FileCheck %s + +%Foo = type { i32 } +%Bar = type { i32 } + +; CHECK: @vfnFoo() #0 !sycl_used_aspects ![[#aspectsFoo:]] +define spir_func void @vfnFoo() #0 { + %tmp = alloca %Foo + ret void +} + +; CHECK: @vfnBar() #1 !sycl_used_aspects ![[#aspectsBar:]] +define spir_func void @vfnBar() #1 { + %tmp = alloca %Bar + ret void +} + +; CHECK: @kernel() #2 !sycl_used_aspects ![[#aspectsKernel:]] +define spir_kernel void @kernel() #2 { + ret void +} + +; CHECK: ![[#aspectsFoo]] = !{i32 1} +; CHECK: ![[#aspectsBar]] = !{i32 2} +; CHECK: ![[#aspectsKernel]] = !{i32 1, i32 2} + +attributes #0 = { "indirectly-callable"="setFoo" } +attributes #1 = { "indirectly-callable"="setBar" } +attributes #2 = { "calls-indirectly"="setFoo,setBar" } + +!sycl_aspects = !{!0} +!0 = !{!"fp64", i32 6} + +!sycl_types_that_use_aspects = !{!1, !2} +!1 = !{!"Foo", i32 1} +!2 = !{!"Bar", i32 2} diff --git a/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-3.ll b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-3.ll new file mode 100644 index 0000000000000..ada0f533ced56 --- /dev/null +++ b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-3.ll @@ -0,0 +1,51 @@ +; RUN: opt -passes=sycl-propagate-aspects-usage < %s -S | FileCheck %s + +%Foo = type { i32 } +%Bar = type { i32 } + +; CHECK: @vfnFoo() #0 !sycl_used_aspects ![[#aspectsFoo:]] +define spir_func void @vfnFoo() #0 { + call void @subFoo() + ret void +} + +define spir_func void @subFoo() { + %tmp = alloca %Foo + ret void +} + +; CHECK: @vfnBar() #1 !sycl_used_aspects ![[#aspectsBar:]] +define spir_func void @vfnBar() #1 { + call void @subBar() + ret void +} + +define spir_func void @subBar() { + %tmp = alloca %Bar + ret void +} + +; CHECK: @kernelA() #2 !sycl_used_aspects ![[#aspectsFoo]] +define spir_kernel void @kernelA() #2 { + ret void +} + +; CHECK: @kernelB() #3 !sycl_used_aspects ![[#aspectsBar]] +define spir_kernel void @kernelB() #3 { + ret void +} + +; CHECK: ![[#aspectsFoo]] = !{i32 1} +; CHECK: ![[#aspectsBar]] = !{i32 2} + +attributes #0 = { "indirectly-callable"="setFoo" } +attributes #1 = { "indirectly-callable"="setBar" } +attributes #2 = { "calls-indirectly"="setFoo" } +attributes #3 = { "calls-indirectly"="setBar" } + +!sycl_aspects = !{!0} +!0 = !{!"fp64", i32 6} + +!sycl_types_that_use_aspects = !{!1, !2} +!1 = !{!"Foo", i32 1} +!2 = !{!"Bar", i32 2} diff --git a/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-4.ll b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-4.ll new file mode 100644 index 0000000000000..700a2dc7a8b79 --- /dev/null +++ b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-4.ll @@ -0,0 +1,26 @@ +; RUN: opt -passes=sycl-propagate-aspects-usage < %s -S | FileCheck %s +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" +target triple = "spir64-unknown-unknown" + +@vtable = linkonce_odr dso_local unnamed_addr addrspace(1) constant { [3 x ptr addrspace(4)] } { [3 x ptr addrspace(4)] [ptr addrspace(4) null, ptr addrspace(4) null, ptr addrspace(4) addrspacecast (ptr @foo to ptr addrspace(4))] }, align 8 + +; CHECK: @foo() #0 !sycl_used_aspects ![[#aspects:]] +define linkonce_odr spir_func void @foo() #0 { +entry: + %tmp = alloca double + ret void +} + +; CHECK-NOT: @construct({{.*}}){{.*}}!sycl_used_aspects +define weak_odr dso_local spir_kernel void @construct(ptr addrspace(1) noundef align 8 %_arg_StorageAcc) { +entry: + store ptr addrspace(1) getelementptr inbounds inrange(-16, 8) (i8, ptr addrspace(1) @vtable, i64 16), ptr addrspace(1) %_arg_StorageAcc, align 8 + ret void +} + +; CHECK: ![[#aspects]] = !{i32 6} + +attributes #0 = { "indirectly-callable"="set-foo" } + +!sycl_aspects = !{!0} +!0 = !{!"fp64", i32 6} \ No newline at end of file From cda38dedd238a0ce9acd3d860be58ae247429885 Mon Sep 17 00:00:00 2001 From: Benjamin Tracy Date: Wed, 6 Nov 2024 14:46:20 +0000 Subject: [PATCH 12/18] [SYCL][Graph]Fix and add E2E tests for using local accessors in graphs (#15920) - Update UR tag for fix for updating local accessors on CUDA/HIP - Add e2e tests covering local accessor usage --------- Co-authored-by: Callum Fare Co-authored-by: Ewan Crawford --- sycl/cmake/modules/UnifiedRuntimeTag.cmake | 14 +-- .../Graph/Explicit/local_accessor.cpp | 10 ++ sycl/test-e2e/Graph/Inputs/local_accessor.cpp | 54 +++++++++++ .../Graph/Inputs/whole_update_local_acc.cpp | 93 +++++++++++++++++++ .../Graph/RecordReplay/local_accessor.cpp | 10 ++ .../Explicit/whole_update_local_acc.cpp | 10 ++ .../RecordReplay/whole_update_local_acc.cpp | 10 ++ 7 files changed, 194 insertions(+), 7 deletions(-) create mode 100644 sycl/test-e2e/Graph/Explicit/local_accessor.cpp create mode 100644 sycl/test-e2e/Graph/Inputs/local_accessor.cpp create mode 100644 sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp create mode 100644 sycl/test-e2e/Graph/RecordReplay/local_accessor.cpp create mode 100644 sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc.cpp create mode 100644 sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc.cpp diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index b81556a92ec2c..15985fb0cc0b0 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit f01741af022cfe82afcb026b9aa0be251eb6a497 -# Merge: 004d2474 85bb5f62 -# Author: Callum Fare -# Date: Tue Nov 5 13:39:53 2024 +0000 -# Merge pull request #2260 from nrspruit/refactor_l0_default_init -# [L0] Refactor to remove default constructor inits -set(UNIFIED_RUNTIME_TAG f01741af022cfe82afcb026b9aa0be251eb6a497) +# commit 3edf99755ce2af3b53102a7d8438e0fe969efac3 +# Merge: 5955bad3 0b968661 +# Author: Ross Brunton +# Date: Wed Nov 6 11:07:29 2024 +0000 +# Merge pull request #2082 from RossBrunton/ross/multiadapt +# [CI] Add "loader" support to conformance testing +set(UNIFIED_RUNTIME_TAG 3edf99755ce2af3b53102a7d8438e0fe969efac3) diff --git a/sycl/test-e2e/Graph/Explicit/local_accessor.cpp b/sycl/test-e2e/Graph/Explicit/local_accessor.cpp new file mode 100644 index 0000000000000..fbeb2c6a5ef5c --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/local_accessor.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/local_accessor.cpp" diff --git a/sycl/test-e2e/Graph/Inputs/local_accessor.cpp b/sycl/test-e2e/Graph/Inputs/local_accessor.cpp new file mode 100644 index 0000000000000..b3ac9fde67b6e --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/local_accessor.cpp @@ -0,0 +1,54 @@ +// Tests basic adding of nodes with local accessors, +// and submission of the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + using T = int; + + const size_t LocalSize = 128; + + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 10); + + std::vector ReferenceA(DataA); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.wait_and_throw(); + + auto node = add_node(Graph, Queue, [&](handler &CGH) { + local_accessor LocalMem(LocalSize, CGH); + + CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) { + LocalMem[Item.get_local_linear_id()] = Item.get_global_linear_id() * 2; + PtrA[Item.get_global_linear_id()] += LocalMem[Item.get_local_linear_id()]; + }); + }); + + auto GraphExec = Graph.finalize(); + + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + + for (size_t i = 0; i < Size; i++) { + T Ref = 10 + i + (i * 2); + check_value(i, Ref, ReferenceA[i], "PtrA"); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp b/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp new file mode 100644 index 0000000000000..100792a2e4762 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp @@ -0,0 +1,93 @@ +// Tests whole graph update of nodes with local accessors, +// and submission of the graph. + +#include "../graph_common.hpp" + +using T = int; + +auto add_graph_node( + exp_ext::command_graph &Graph, + queue &Queue, size_t Size, size_t LocalSize, T *Ptr) { + return add_node(Graph, Queue, [&](handler &CGH) { + local_accessor LocalMem(LocalSize, CGH); + + CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) { + LocalMem[Item.get_local_linear_id()] = Item.get_global_linear_id() * 2; + Ptr[Item.get_global_linear_id()] += + LocalMem[Item.get_local_linear_id()] + Item.get_local_range(0); + }); + }); +} +int main() { + queue Queue{}; + + const size_t LocalSize = 128; + + std::vector DataA(Size), DataB(Size); + + std::iota(DataA.begin(), DataA.end(), 10); + std::iota(DataB.begin(), DataB.end(), 10); + + std::vector ReferenceA(DataA), ReferenceB(DataB); + + exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + T *PtrB = malloc_device(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.wait_and_throw(); + + auto NodeA = add_graph_node(GraphA, Queue, Size, LocalSize / 2, PtrA); + + auto GraphExecA = GraphA.finalize(exp_ext::property::graph::updatable{}); + + // Create second graph for whole graph update with a different local size + exp_ext::command_graph GraphB{Queue.get_context(), Queue.get_device()}; + auto NodeB = add_graph_node(GraphB, Queue, Size, LocalSize, PtrB); + + // Execute graphs before updating and check outputs + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExecA); }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.wait_and_throw(); + + for (size_t i = 0; i < Size; i++) { + T RefA = 10 + i + (i * 2) + LocalSize / 2; + T RefB = 10 + i; + check_value(i, RefA, ReferenceA[i], "PtrA"); + check_value(i, RefB, ReferenceB[i], "PtrB"); + } + + // Update GraphExecA using whole graph update + + GraphExecA.update(GraphB); + + // Execute graphs again and check outputs + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExecA); }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.wait_and_throw(); + + for (size_t i = 0; i < Size; i++) { + T RefA = 10 + i + (i * 2) + LocalSize / 2; + T RefB = 10 + i + (i * 2) + LocalSize; + check_value(i, RefA, ReferenceA[i], "PtrA"); + check_value(i, RefB, ReferenceB[i], "PtrB"); + } + + free(PtrA, Queue); + free(PtrB, Queue); + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/local_accessor.cpp b/sycl/test-e2e/Graph/RecordReplay/local_accessor.cpp new file mode 100644 index 0000000000000..245983f67da4a --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/local_accessor.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/local_accessor.cpp" diff --git a/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc.cpp b/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc.cpp new file mode 100644 index 0000000000000..1db9905457ae7 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../../Inputs/whole_update_local_acc.cpp" diff --git a/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc.cpp b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc.cpp new file mode 100644 index 0000000000000..03645b2f19bfd --- /dev/null +++ b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../../Inputs/whole_update_local_acc.cpp" From dc181bbb888044dab158a7cbe9644ea53a42d926 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Wed, 6 Nov 2024 15:08:21 +0000 Subject: [PATCH 13/18] [SYCL][COMPAT] Ensure `launch`ed kernels are fully inlined (#15941) This PR defines & uses a custom `syclcompat::detail::apply_helper` with `[[clang::always_inline]]` to ensure kernels are inlined. --- sycl/include/syclcompat/launch_policy.hpp | 23 ++++- .../syclcompat/launch/kernel_properties.cpp | 2 +- .../syclcompat/launch/launch_inlining.cpp | 97 +++++++++++++++++++ 3 files changed, 117 insertions(+), 5 deletions(-) create mode 100644 sycl/test/syclcompat/launch/launch_inlining.cpp diff --git a/sycl/include/syclcompat/launch_policy.hpp b/sycl/include/syclcompat/launch_policy.hpp index 1c5f6ed3e97d6..f6a30fc46db1e 100644 --- a/sycl/include/syclcompat/launch_policy.hpp +++ b/sycl/include/syclcompat/launch_policy.hpp @@ -192,6 +192,17 @@ launch_policy(dim3, dim3, Ts...) -> launch_policy< detail::has_type>::value>; namespace detail { +// Custom std::apply helpers to enable inlining +template +__syclcompat_inline__ constexpr void apply_expand(F f, Tuple t, + std::index_sequence) { + [[clang::always_inline]] f(get(t)...); +} + +template +__syclcompat_inline__ constexpr void apply_helper(F f, Tuple t) { + apply_expand(f, t, std::make_index_sequence{}>{}); +} template @@ -211,12 +222,16 @@ struct KernelFunctor { operator()(syclcompat::detail::range_to_item_t) const { if constexpr (HasLocalMem) { char *local_mem_ptr = static_cast( - _local_acc.template get_multi_ptr().get()); - std::apply( - [lmem_ptr = local_mem_ptr](auto &&...args) { F(args..., lmem_ptr); }, + _local_acc.template get_multi_ptr() + .get()); + apply_helper( + [lmem_ptr = local_mem_ptr](auto &&...args) { + [[clang::always_inline]] F(args..., lmem_ptr); + }, _argument_tuple); } else { - std::apply([](auto &&...args) { F(args...); }, _argument_tuple); + apply_helper([](auto &&...args) { [[clang::always_inline]] F(args...); }, + _argument_tuple); } } diff --git a/sycl/test/syclcompat/launch/kernel_properties.cpp b/sycl/test/syclcompat/launch/kernel_properties.cpp index f17571fae0c2d..78920c62c5347 100644 --- a/sycl/test/syclcompat/launch/kernel_properties.cpp +++ b/sycl/test/syclcompat/launch/kernel_properties.cpp @@ -23,7 +23,7 @@ // We need hardware which can support at least 2 sub-group sizes, since that // hardware (presumably) supports the `intel_reqd_sub_group_size` attribute. // REQUIRES: sg-32 && sg-16 -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} -o - | FileCheck %s +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} %s -o - | FileCheck %s #include #include #include diff --git a/sycl/test/syclcompat/launch/launch_inlining.cpp b/sycl/test/syclcompat/launch/launch_inlining.cpp new file mode 100644 index 0000000000000..a224837139a56 --- /dev/null +++ b/sycl/test/syclcompat/launch/launch_inlining.cpp @@ -0,0 +1,97 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCLcompat API + * + * launch_inlining.cpp + * + * Description: + * Ensure kernels are inlined + **************************************************************************/ +// RUN: %clangxx -fsycl -fgpu-inline-threshold=0 %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} %s -o - | FileCheck %s +// We set -fgpu-inline-threshold=0 to disable heuristic inlining for the +// purposes of the test +#include +#include +#include +#include + +namespace compat_exp = syclcompat::experimental; +namespace sycl_exp = sycl::ext::oneapi::experimental; +namespace sycl_intel_exp = sycl::ext::intel::experimental; + +static constexpr int LOCAL_MEM_SIZE = 1024; + +// CHECK: define {{.*}}spir_kernel{{.*}}write_mem_kernel{{.*}} { +// CHECK-NOT: call {{.*}}write_mem_kernel +// CHECK: } + +template void write_mem_kernel(T *data, int num_elements) { + const int id = + sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_global_id(0); + if (id < num_elements) { + data[id] = static_cast(id); + } +}; + +// CHECK: define {{.*}}spir_kernel{{.*}}dynamic_local_mem_typed_kernel{{.*}} { +// CHECK-NOT: call {{.*}}dynamic_local_mem_typed_kernel +// CHECK: } +template +void dynamic_local_mem_typed_kernel(T *data, char *local_mem) { + constexpr size_t num_elements = LOCAL_MEM_SIZE / sizeof(T); + T *typed_local_mem = reinterpret_cast(local_mem); + + const int id = + sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_global_id(0); + if (id < num_elements) { + typed_local_mem[id] = static_cast(id); + } + sycl::group_barrier(sycl::ext::oneapi::this_work_item::get_work_group<1>()); + if (id < num_elements) { + data[id] = typed_local_mem[num_elements - id - 1]; + } +}; + +int test_write_mem() { + compat_exp::launch_policy my_dim3_config(syclcompat::dim3{32}, + syclcompat::dim3{32}); + + const int memsize = 1024; + int *d_a = (int *)syclcompat::malloc(memsize); + compat_exp::launch>(my_dim3_config, d_a, + memsize / sizeof(int)) + .wait(); + + syclcompat::free(d_a); + return 0; +} + +int test_lmem_launch() { + int local_mem_size = LOCAL_MEM_SIZE; + + size_t num_elements = local_mem_size / sizeof(int); + int *d_a = (int *)syclcompat::malloc(local_mem_size); + + compat_exp::launch_policy my_config( + sycl::nd_range<1>{{256}, {256}}, + compat_exp::local_mem_size(local_mem_size)); + + compat_exp::launch>(my_config, d_a) + .wait(); + + syclcompat::free(d_a); + + return 0; +} From e7e3b9687a260df0f432ac91417c4a4e430a17b8 Mon Sep 17 00:00:00 2001 From: Udit Agarwal Date: Wed, 6 Nov 2024 10:20:44 -0800 Subject: [PATCH 14/18] [E2E] Fix no_zstd_warning.cpp when built with `clang-cl` (#16002) Currently, when this test is built with a compiler that accepts MSVC-style flags, compilation fails because `-O0` is unrecognized. This PR updates the test to replace `-O0` with `%O0`. Internal bug report: CMPLRTST-26037 --- sycl/test-e2e/Compression/no_zstd_warning.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/Compression/no_zstd_warning.cpp b/sycl/test-e2e/Compression/no_zstd_warning.cpp index 8a4460f9b8643..c87f2fe480096 100644 --- a/sycl/test-e2e/Compression/no_zstd_warning.cpp +++ b/sycl/test-e2e/Compression/no_zstd_warning.cpp @@ -1,4 +1,4 @@ // using --offload-compress without zstd should throw an error. // REQUIRES: !zstd -// RUN: not %{build} -O0 -g --offload-compress %S/Inputs/single_kernel.cpp -o %t_compress.out 2>&1 | FileCheck %s +// RUN: not %{build} %O0 -g --offload-compress %S/Inputs/single_kernel.cpp -o %t_compress.out 2>&1 | FileCheck %s // CHECK: '--offload-compress' option is specified but zstd is not available. The device image will not be compressed. From 66867d4faf87e03b855e3dc3d004f6b39c7553cd Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Wed, 6 Nov 2024 12:54:20 -0600 Subject: [PATCH 15/18] [SYCL][Graph] Adding new graph enqueue function to spec (#15677) replacing https://github.com/intel/llvm/pull/15385 after offline discussion. --- ...sycl_ext_oneapi_enqueue_functions.asciidoc | 28 +++++++++++++++++++ .../sycl_ext_oneapi_graph.asciidoc | 4 ++- .../oneapi/experimental/enqueue_functions.hpp | 12 ++++++++ .../ext_oneapi_enqueue_functions.cpp | 2 +- .../ext_oneapi_enqueue_functions_submit.cpp | 2 +- ...pi_enqueue_functions_submit_with_event.cpp | 5 ++-- 6 files changed, 48 insertions(+), 5 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc index ad8d6a7f50194..933a6aabd2bd4 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc @@ -716,6 +716,34 @@ optimize such partial barriers. _{endnote}_] |==== +==== Command Graph + +The functions in this section are only available if the +link:./sycl_ext_oneapi_graph.asciidoc[ + sycl_ext_oneapi_graph] extension is supported. + +|==== +a| +[frame=all,grid=none] +!==== +a! +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +void execute_graph(sycl::queue q, command_graph &g); + +void execute_graph(sycl::handler &h, command_graph &g); + +} +---- +!==== +_Constraints_: Device and context associated with queue need to be identical +to device and context provided at command graph creation. + +_Effects_: Submits an executable command graph to the `sycl::queue` or `sycl::handler`. + +|==== == Issues diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 56f09c04d3055..5dff0396f07fb 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -1975,7 +1975,9 @@ Removing this restriction is something we may look at for future revisions of The command submission functions defined in link:../experimental/sycl_ext_oneapi_enqueue_functions.asciidoc[sycl_ext_oneapi_enqueue_functions] -can be used to add nodes to a graph when creating a graph from queue recording. +can be used adding nodes to a graph when creating a graph from queue recording. +New methods are also defined that enable submitting an executable graph, +e.g. directly to a queue without returning an event. == Examples and Usage Guide diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index b3c758aaa891d..7ecf5ce4c8b14 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -383,6 +384,17 @@ inline void partial_barrier(queue Q, const std::vector &Events, submit(Q, [&](handler &CGH) { partial_barrier(CGH, Events); }, CodeLoc); } +inline void execute_graph(queue Q, command_graph &G, + const sycl::detail::code_location &CodeLoc = + sycl::detail::code_location::current()) { + Q.ext_oneapi_graph(G, CodeLoc); +} + +inline void execute_graph(handler &CGH, + command_graph &G) { + CGH.ext_oneapi_graph(G); +} + } // namespace ext::oneapi::experimental } // namespace _V1 } // namespace sycl diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions.cpp index ef3b440790c6b..6c6fe20337dab 100644 --- a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions.cpp @@ -43,7 +43,7 @@ int main() { auto GraphExec = Graph.finalize(); - InOrderQueue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + exp_ext::execute_graph(InOrderQueue, GraphExec); InOrderQueue.wait_and_throw(); free(PtrA, InOrderQueue); diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_submit.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_submit.cpp index 8eccce2ea8ef9..623d6fc817879 100644 --- a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_submit.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_submit.cpp @@ -60,7 +60,7 @@ int main() { auto GraphExec = Graph.finalize(); - Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + exp_ext::execute_graph(Queue, GraphExec); Queue.wait_and_throw(); } diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_submit_with_event.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_submit_with_event.cpp index f66731d745bd2..4b8294be7e989 100644 --- a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_submit_with_event.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_submit_with_event.cpp @@ -52,8 +52,9 @@ int main() { auto GraphExec = Graph.finalize(); - Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); - Queue.wait_and_throw(); + exp_ext::submit_with_event(Queue, [&](handler &CGH) { + exp_ext::execute_graph(CGH, GraphExec); + }).wait(); free(PtrA, Queue); free(PtrB, Queue); From b3fb5addeb3d1b8368226814811bf83cdd4cd8fa Mon Sep 17 00:00:00 2001 From: Udit Agarwal Date: Wed, 6 Nov 2024 16:15:36 -0800 Subject: [PATCH 16/18] [Clang] Fix passing `-offload-compress` when compiling and linking device images separately (#15997) When compiling and linking device images separately, compression related flags are not propagated correctly to `offload-wrapper`. This PR fixes that and updates the test accordingly. --- clang/lib/Driver/ToolChains/Clang.cpp | 57 ++++++++++++------- .../compression_separate_compile.cpp | 7 +++ 2 files changed, 43 insertions(+), 21 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 3fa80d479936f..f90ba124e5a09 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -10353,33 +10353,48 @@ void OffloadWrapper::ConstructJob(Compilation &C, const JobAction &JA, assert(JA.getInputs().size() == Inputs.size() && "Not have inputs for all dependence actions??"); - // For FPGA, we wrap the host objects before archiving them when using - // -fsycl-link. This allows for better extraction control from the - // archive when we need the host objects for subsequent compilations. if (OffloadingKind == Action::OFK_None && - C.getArgs().hasArg(options::OPT_fintelfpga) && C.getArgs().hasArg(options::OPT_fsycl_link_EQ)) { - // Add offload targets and inputs. - CmdArgs.push_back(C.getArgs().MakeArgString( - Twine("-kind=") + Action::GetOffloadKindName(OffloadingKind))); - CmdArgs.push_back( - TCArgs.MakeArgString(Twine("-target=") + Triple.getTriple())); + // For FPGA, we wrap the host objects before archiving them when using + // -fsycl-link. This allows for better extraction control from the + // archive when we need the host objects for subsequent compilations. + if (C.getArgs().hasArg(options::OPT_fintelfpga)) { - if (Inputs[0].getType() == types::TY_Tempfiletable || - Inputs[0].getType() == types::TY_Tempfilelist) - // Input files are passed via the batch job file table. - CmdArgs.push_back(C.getArgs().MakeArgString("-batch")); + // Add offload targets and inputs. + CmdArgs.push_back(C.getArgs().MakeArgString( + Twine("-kind=") + Action::GetOffloadKindName(OffloadingKind))); + CmdArgs.push_back( + TCArgs.MakeArgString(Twine("-target=") + Triple.getTriple())); - // Add input. - assert(Inputs[0].isFilename() && "Invalid input."); - CmdArgs.push_back(TCArgs.MakeArgString(Inputs[0].getFilename())); + if (Inputs[0].getType() == types::TY_Tempfiletable || + Inputs[0].getType() == types::TY_Tempfilelist) + // Input files are passed via the batch job file table. + CmdArgs.push_back(C.getArgs().MakeArgString("-batch")); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::None(), - TCArgs.MakeArgString(getToolChain().GetProgramPath(getShortName())), - CmdArgs, Inputs)); - return; + // Add input. + assert(Inputs[0].isFilename() && "Invalid input."); + CmdArgs.push_back(TCArgs.MakeArgString(Inputs[0].getFilename())); + + C.addCommand(std::make_unique( + JA, *this, ResponseFileSupport::None(), + TCArgs.MakeArgString(getToolChain().GetProgramPath(getShortName())), + CmdArgs, Inputs)); + return; + } else { + // When compiling and linking separately, we need to propagate the + // compression related CLI options to offload-wrapper. Don't propagate + // these options when wrapping objects for FPGA. + if (C.getInputArgs().getLastArg(options::OPT_offload_compress)) { + CmdArgs.push_back( + C.getArgs().MakeArgString(Twine("-offload-compress"))); + // -offload-compression-level=<> + if (Arg *A = C.getInputArgs().getLastArg( + options::OPT_offload_compression_level_EQ)) + CmdArgs.push_back(C.getArgs().MakeArgString( + Twine("-offload-compression-level=") + A->getValue())); + } + } } // Add offload targets and inputs. diff --git a/sycl/test-e2e/Compression/compression_separate_compile.cpp b/sycl/test-e2e/Compression/compression_separate_compile.cpp index 9e47bbebdc875..dab17e3506b4e 100644 --- a/sycl/test-e2e/Compression/compression_separate_compile.cpp +++ b/sycl/test-e2e/Compression/compression_separate_compile.cpp @@ -10,6 +10,13 @@ ////////////////////// Link device images // RUN: %clangxx --offload-compress -fsycl -fsycl-link -fsycl-targets=spir64_x86_64 -fPIC %t_kernel1_aot.o %t_kernel2_aot.o -o %t_compressed_image.o -v +// Make sure the clang-offload-wrapper is called with the --offload-compress +// option. +// RUN: %clangxx --offload-compress -fsycl -fsycl-link -fsycl-targets=spir64_x86_64 -fPIC %t_kernel1_aot.o %t_kernel2_aot.o -o %t_compressed_image.o -### &> %t_driver_opts.txt +// RUN: FileCheck -input-file=%t_driver_opts.txt %s --check-prefix=CHECK-DRIVER-OPTS + +// CHECK-DRIVER-OPTS: clang-offload-wrapper{{.*}} "-offload-compress" + ////////////////////// Compile the host program // RUN: %clangxx -fsycl -std=c++17 -Wno-attributes -Wno-deprecated-declarations -fPIC -c %s -o %t_main.o From 3d26d63cec7eae3fd771e8bdb1f66a029c10f313 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 7 Nov 2024 06:38:44 +0000 Subject: [PATCH 17/18] [SYCL][COMPAT] Improve non-e2e tests (#15992) Use `-fsyntax-only` where possible, and avoid passing a specific target triple. --- .../syclcompat/launch/kernel_properties.cpp | 2 +- .../launch/launch_policy_lmem_neg.cpp | 2 +- .../syclcompat/launch/launch_policy_neg.cpp | 24 +++++++++---------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/sycl/test/syclcompat/launch/kernel_properties.cpp b/sycl/test/syclcompat/launch/kernel_properties.cpp index 78920c62c5347..6beefce73d14b 100644 --- a/sycl/test/syclcompat/launch/kernel_properties.cpp +++ b/sycl/test/syclcompat/launch/kernel_properties.cpp @@ -23,7 +23,7 @@ // We need hardware which can support at least 2 sub-group sizes, since that // hardware (presumably) supports the `intel_reqd_sub_group_size` attribute. // REQUIRES: sg-32 && sg-16 -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} %s -o - | FileCheck %s +// RUN: %clangxx -fsycl -fsycl-device-only -Xclang -fsycl-is-device %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} %s -o - | FileCheck %s #include #include #include diff --git a/sycl/test/syclcompat/launch/launch_policy_lmem_neg.cpp b/sycl/test/syclcompat/launch/launch_policy_lmem_neg.cpp index 98222d6cc374f..5c2750e86b705 100644 --- a/sycl/test/syclcompat/launch/launch_policy_lmem_neg.cpp +++ b/sycl/test/syclcompat/launch/launch_policy_lmem_neg.cpp @@ -22,7 +22,7 @@ * templates as tests in launch_policy_neg.cpp **************************************************************************/ -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out 2>&1 | FileCheck -vv %s +// RUN: not %clangxx -fsycl -fsyntax-only %s 2>&1 | FileCheck -vv %s #include #include diff --git a/sycl/test/syclcompat/launch/launch_policy_neg.cpp b/sycl/test/syclcompat/launch/launch_policy_neg.cpp index 558864084ff62..cee796471f23f 100644 --- a/sycl/test/syclcompat/launch/launch_policy_neg.cpp +++ b/sycl/test/syclcompat/launch/launch_policy_neg.cpp @@ -20,18 +20,18 @@ * Negative tests for new launch_policy. **************************************************************************/ -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK1 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK1 -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK2 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK2 -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK3 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK3 -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK4 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK4 -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK5 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK5 -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK6 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK6 -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK7 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK7 -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK8 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK8 -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK9 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK9 -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK10 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK10 -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK11 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK11 -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK12 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK12 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK1 2>&1 | FileCheck -vv %s --check-prefixes=CHECK1 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK2 2>&1 | FileCheck -vv %s --check-prefixes=CHECK2 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK3 2>&1 | FileCheck -vv %s --check-prefixes=CHECK3 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK4 2>&1 | FileCheck -vv %s --check-prefixes=CHECK4 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK5 2>&1 | FileCheck -vv %s --check-prefixes=CHECK5 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK6 2>&1 | FileCheck -vv %s --check-prefixes=CHECK6 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK7 2>&1 | FileCheck -vv %s --check-prefixes=CHECK7 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK8 2>&1 | FileCheck -vv %s --check-prefixes=CHECK8 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK9 2>&1 | FileCheck -vv %s --check-prefixes=CHECK9 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK10 2>&1 | FileCheck -vv %s --check-prefixes=CHECK10 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK11 2>&1 | FileCheck -vv %s --check-prefixes=CHECK11 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK12 2>&1 | FileCheck -vv %s --check-prefixes=CHECK12 #include #include From dd2390c50259f24278a0ab866cf7898952a3c04a Mon Sep 17 00:00:00 2001 From: Rafal Bielski Date: Thu, 7 Nov 2024 09:34:45 +0000 Subject: [PATCH 18/18] [UR][HIP] Set the right HIP device before creating base event counter (#15964) Update UR tag to include https://github.com/oneapi-src/unified-runtime/pull/2276 --------- Co-authored-by: Callum Fare --- sycl/cmake/modules/UnifiedRuntimeTag.cmake | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 15985fb0cc0b0..a7b68befa96bf 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit 3edf99755ce2af3b53102a7d8438e0fe969efac3 -# Merge: 5955bad3 0b968661 -# Author: Ross Brunton -# Date: Wed Nov 6 11:07:29 2024 +0000 -# Merge pull request #2082 from RossBrunton/ross/multiadapt -# [CI] Add "loader" support to conformance testing -set(UNIFIED_RUNTIME_TAG 3edf99755ce2af3b53102a7d8438e0fe969efac3) +# commit 2858a8a28d0b6524a3b2b0e25a597d1c8295ce9d +# Merge: a5a649f3 34b66fda +# Author: Callum Fare +# Date: Wed Nov 6 16:45:49 2024 +0000 +# Merge pull request #2276 from rafbiels/rafbiels/fix-hip-evbase +# Set the right HIP device before creating base event counter +set(UNIFIED_RUNTIME_TAG 2858a8a28d0b6524a3b2b0e25a597d1c8295ce9d)