diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 2b99cc35e0f1..8bf06d5a3a8e 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -326,6 +326,7 @@ LANGOPT(SYCLExperimentalRangeRounding, 1, 0, "Use experimental parallel for rang LANGOPT(SYCLEnableIntHeaderDiags, 1, 0, "Enable diagnostics that require the " "SYCL integration header") LANGOPT(SYCLIsNativeCPU , 1, 0, "Generate code for SYCL Native CPU") +LANGOPT(SYCLRTCMode, 1, 0, "Compile in RTC mode") LANGOPT(HIPUseNewLaunchAPI, 1, 0, "Use new kernel launching API for HIP") LANGOPT(OffloadUniformBlock, 1, 0, "Assume that kernels are launched with uniform block sizes (default true for CUDA/HIP and false otherwise)") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 8ff41673575a..053bf5e63d9a 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6877,6 +6877,11 @@ defm sycl_esimd_force_stateless_mem : BoolFOption<"sycl-esimd-force-stateless-me NegFlag, BothFlags<[], [ClangOption, CLOption, CC1Option], "">>; +defm sycl_rtc_mode: BoolFOption<"sycl-rtc-mode", + LangOpts<"SYCLRTCMode">, DefaultFalse, + PosFlag, + NegFlag, + BothFlags<[HelpHidden], [ClangOption, CC1Option], " RTC mode in SYCL.">>; // TODO: Remove this option once ESIMD headers are updated to // guard vectors to be device only. def fno_sycl_esimd_build_host_code : Flag<["-"], "fno-sycl-esimd-build-host-code">, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index bb3492887c8e..f90ba124e5a0 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5589,6 +5589,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.AddLastArg(CmdArgs, options::OPT_fsycl_decompose_functor, options::OPT_fno_sycl_decompose_functor); + Args.AddLastArg(CmdArgs, options::OPT_fsycl_rtc_mode, + options::OPT_fno_sycl_rtc_mode); + // Forward -fsycl-instrument-device-code option to cc1. This option will // only be used for SPIR/SPIR-V based targets. if (Triple.isSPIROrSPIRV()) @@ -10350,33 +10353,48 @@ void OffloadWrapper::ConstructJob(Compilation &C, const JobAction &JA, assert(JA.getInputs().size() == Inputs.size() && "Not have inputs for all dependence actions??"); - // For FPGA, we wrap the host objects before archiving them when using - // -fsycl-link. This allows for better extraction control from the - // archive when we need the host objects for subsequent compilations. if (OffloadingKind == Action::OFK_None && - C.getArgs().hasArg(options::OPT_fintelfpga) && C.getArgs().hasArg(options::OPT_fsycl_link_EQ)) { - // Add offload targets and inputs. - CmdArgs.push_back(C.getArgs().MakeArgString( - Twine("-kind=") + Action::GetOffloadKindName(OffloadingKind))); - CmdArgs.push_back( - TCArgs.MakeArgString(Twine("-target=") + Triple.getTriple())); + // For FPGA, we wrap the host objects before archiving them when using + // -fsycl-link. This allows for better extraction control from the + // archive when we need the host objects for subsequent compilations. + if (C.getArgs().hasArg(options::OPT_fintelfpga)) { - if (Inputs[0].getType() == types::TY_Tempfiletable || - Inputs[0].getType() == types::TY_Tempfilelist) - // Input files are passed via the batch job file table. - CmdArgs.push_back(C.getArgs().MakeArgString("-batch")); + // Add offload targets and inputs. + CmdArgs.push_back(C.getArgs().MakeArgString( + Twine("-kind=") + Action::GetOffloadKindName(OffloadingKind))); + CmdArgs.push_back( + TCArgs.MakeArgString(Twine("-target=") + Triple.getTriple())); - // Add input. - assert(Inputs[0].isFilename() && "Invalid input."); - CmdArgs.push_back(TCArgs.MakeArgString(Inputs[0].getFilename())); + if (Inputs[0].getType() == types::TY_Tempfiletable || + Inputs[0].getType() == types::TY_Tempfilelist) + // Input files are passed via the batch job file table. + CmdArgs.push_back(C.getArgs().MakeArgString("-batch")); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::None(), - TCArgs.MakeArgString(getToolChain().GetProgramPath(getShortName())), - CmdArgs, Inputs)); - return; + // Add input. + assert(Inputs[0].isFilename() && "Invalid input."); + CmdArgs.push_back(TCArgs.MakeArgString(Inputs[0].getFilename())); + + C.addCommand(std::make_unique( + JA, *this, ResponseFileSupport::None(), + TCArgs.MakeArgString(getToolChain().GetProgramPath(getShortName())), + CmdArgs, Inputs)); + return; + } else { + // When compiling and linking separately, we need to propagate the + // compression related CLI options to offload-wrapper. Don't propagate + // these options when wrapping objects for FPGA. + if (C.getInputArgs().getLastArg(options::OPT_offload_compress)) { + CmdArgs.push_back( + C.getArgs().MakeArgString(Twine("-offload-compress"))); + // -offload-compression-level=<> + if (Arg *A = C.getInputArgs().getLastArg( + options::OPT_offload_compression_level_EQ)) + CmdArgs.push_back(C.getArgs().MakeArgString( + Twine("-offload-compression-level=") + A->getValue())); + } + } } // Add offload targets and inputs. diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp index b333bb7b476d..6435618ae7f6 100644 --- a/clang/lib/Driver/ToolChains/SYCL.cpp +++ b/clang/lib/Driver/ToolChains/SYCL.cpp @@ -1328,6 +1328,9 @@ StringRef SYCL::gen::resolveGenDevice(StringRef DeviceName) { .Case("amd_gpu_gfx700", "gfx700") .Case("amd_gpu_gfx701", "gfx701") .Case("amd_gpu_gfx702", "gfx702") + .Case("amd_gpu_gfx703", "gfx703") + .Case("amd_gpu_gfx704", "gfx704") + .Case("amd_gpu_gfx705", "gfx705") .Case("amd_gpu_gfx801", "gfx801") .Case("amd_gpu_gfx802", "gfx802") .Case("amd_gpu_gfx803", "gfx803") @@ -1415,6 +1418,9 @@ SmallString<64> SYCL::gen::getGenDeviceMacro(StringRef DeviceName) { .Case("gfx700", "AMD_GPU_GFX700") .Case("gfx701", "AMD_GPU_GFX701") .Case("gfx702", "AMD_GPU_GFX702") + .Case("gfx703", "AMD_GPU_GFX703") + .Case("gfx704", "AMD_GPU_GFX704") + .Case("gfx705", "AMD_GPU_GFX705") .Case("gfx801", "AMD_GPU_GFX801") .Case("gfx802", "AMD_GPU_GFX802") .Case("gfx803", "AMD_GPU_GFX803") diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 996443bcbc8f..e45b038273d7 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -2040,6 +2040,11 @@ class SyclKernelFieldChecker : public SyclKernelFieldHandler { } bool handleStructType(ParmVarDecl *PD, QualType ParamTy) final { + if (SemaSYCLRef.getLangOpts().SYCLRTCMode) { + // When compiling in RTC mode, the restriction regarding forward + // declarations doesn't apply, as we don't need the integration header. + return isValid(); + } CXXRecordDecl *RD = ParamTy->getAsCXXRecordDecl(); // For free functions all struct/class kernel arguments are forward declared // in integration header, that adds additional restrictions for kernel @@ -6453,6 +6458,13 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) { O << "} // namespace _V1\n"; O << "} // namespace sycl\n"; + // The rest of this function only applies to free-function kernels. However, + // in RTC mode, we do not need integration header information for + // free-function kernels, so we can return early here. + if (S.getLangOpts().SYCLRTCMode) { + return; + } + unsigned ShimCounter = 1; int FreeFunctionCount = 0; for (const KernelDesc &K : KernelDescs) { diff --git a/clang/test/CodeGenSYCL/free_function_int_header_rtc_mode.cpp b/clang/test/CodeGenSYCL/free_function_int_header_rtc_mode.cpp new file mode 100644 index 000000000000..214318b563fa --- /dev/null +++ b/clang/test/CodeGenSYCL/free_function_int_header_rtc_mode.cpp @@ -0,0 +1,80 @@ +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple spir64-unknown-unknown -sycl-std=2020 -fsycl-rtc-mode -fsycl-int-header=%t.rtc.h %s +// RUN: FileCheck -input-file=%t.rtc.h --check-prefixes=CHECK,CHECK-RTC %s + +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple spir64-unknown-unknown -sycl-std=2020 -fno-sycl-rtc-mode -fsycl-int-header=%t.nortc.h %s +// RUN: FileCheck -input-file=%t.nortc.h --check-prefixes=CHECK,CHECK-NORTC %s + +// This test checks that free-function kernel information is included or +// excluded from the integration header, depending on the '-fsycl-rtc-mode' +// flag. + +#include "sycl.hpp" + +[[__sycl_detail__::add_ir_attributes_function("sycl-single-task-kernel", 2)]] +void free_function_single(int* ptr, int start, int end){ + for(int i = start; i < end; ++i){ + ptr[i] = start + 66; + } +} + +[[__sycl_detail__::add_ir_attributes_function("sycl-nd-range-kernel", 2)]] +void free_function_nd_range(int* ptr, int start, int end){ + for(int i = start; i < end; ++i){ + ptr[i] = start + 66; + } +} + +template +__attribute__((sycl_kernel)) void kernel(const KernelFunc &kernelFunc){ + kernelFunc(); +} + +int main(){ + sycl::accessor accessorA; + kernel( + [=]() { + accessorA.use(); + }); + return 0; +} + + +// CHECK: const char* const kernel_names[] = { +// CHECK-NEXT: "{{.*}}__sycl_kernel_free_function_singlePiii", +// CHECK-NEXT: "{{.*}}__sycl_kernel_free_function_nd_rangePiii", +// CHECK-NEXT: "{{.*}}Kernel_Function", + + +// CHECK: static constexpr const char* getName() { return "{{.*}}__sycl_kernel_free_function_singlePiii"; } +// CHECK: static constexpr const char* getName() { return "{{.*}}__sycl_kernel_free_function_nd_rangePiii"; } +// CHECK: static constexpr const char* getName() { return "{{.*}}Kernel_Function"; } + +// CHECK-RTC-NOT: free_function_single_kernel +// CHECK-RTC-NOT: free_function_nd_range + +// CHECK-NORTC: void free_function_single(int *ptr, int start, int end); +// CHECK-NORTC: static constexpr auto __sycl_shim[[#FIRST:]]() +// CHECK-NORTC-NEXT: return (void (*)(int *, int, int))free_function_single; + +// CHECK-NORTC: struct ext::oneapi::experimental::is_kernel<__sycl_shim[[#FIRST]]()> { +// CHECK-NORTC-NEXT: static constexpr bool value = true; + +// CHECK-NORTC: struct ext::oneapi::experimental::is_single_task_kernel<__sycl_shim[[#FIRST]]()> { +// CHECK-NORTC-NEXT: static constexpr bool value = true; + + +// CHECK-NORTC: void free_function_nd_range(int *ptr, int start, int end); +// CHECK-NORTC: static constexpr auto __sycl_shim[[#SECOND:]]() { +// CHECK-NORTC-NEXT: return (void (*)(int *, int, int))free_function_nd_range; + +// CHECK-NORTC: struct ext::oneapi::experimental::is_kernel<__sycl_shim[[#SECOND]]()> { +// CHECK-NORTC-NEXT: static constexpr bool value = true; + +// CHECK-NORTC: struct ext::oneapi::experimental::is_nd_range_kernel<__sycl_shim2(), 2> { +// CHECK-NORTC-NEXT: static constexpr bool value = true; + +// CHECK-NORTC: kernel_id ext::oneapi::experimental::get_kernel_id<__sycl_shim[[#FIRST]]()>() { +// CHECK-NORTC-NEXT: return sycl::detail::get_kernel_id_impl(std::string_view{"{{.*}}__sycl_kernel_free_function_singlePiii"}); + +// CHECK-NORTC: kernel_id ext::oneapi::experimental::get_kernel_id<__sycl_shim[[#SECOND]]()>() { +// CHECK-NORTC-NEXT: return sycl::detail::get_kernel_id_impl(std::string_view{"{{.*}}__sycl_kernel_free_function_nd_rangePiii"}); diff --git a/clang/test/Driver/sycl-device-traits-macros-amdgcn.cpp b/clang/test/Driver/sycl-device-traits-macros-amdgcn.cpp index 8dacdd21f9b2..f7b8ae550f6e 100644 --- a/clang/test/Driver/sycl-device-traits-macros-amdgcn.cpp +++ b/clang/test/Driver/sycl-device-traits-macros-amdgcn.cpp @@ -9,6 +9,15 @@ // RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx702 \ // RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-DEVICE-TRIPLE +// RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx703 \ +// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-DEVICE-TRIPLE +// RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx704 \ +// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-DEVICE-TRIPLE +// RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx705 \ +// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-DEVICE-TRIPLE // RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx801 \ // RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-DEVICE-TRIPLE @@ -120,6 +129,15 @@ // RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx702 \ // RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-OFFLOAD-ARCH +// RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx703 \ +// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-OFFLOAD-ARCH +// RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx704 \ +// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-OFFLOAD-ARCH +// RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx705 \ +// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-OFFLOAD-ARCH // RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx801 \ // RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefix=CHECK-SYCL-AMDGCN-AMD-AMDHSA-OFFLOAD-ARCH diff --git a/clang/test/Driver/sycl-oneapi-gpu-amdgpu.cpp b/clang/test/Driver/sycl-oneapi-gpu-amdgpu.cpp index bdc108c6a0f5..e6c8530c5313 100644 --- a/clang/test/Driver/sycl-oneapi-gpu-amdgpu.cpp +++ b/clang/test/Driver/sycl-oneapi-gpu-amdgpu.cpp @@ -9,6 +9,15 @@ // RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx702 \ // RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefixes=DEVICE_AMD,MACRO_AMD -DDEV_STR=gfx702 -DMAC_STR=GFX702 +// RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx703 \ +// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefixes=DEVICE_AMD,MACRO_AMD -DDEV_STR=gfx703 -DMAC_STR=GFX703 +// RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx704 \ +// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefixes=DEVICE_AMD,MACRO_AMD -DDEV_STR=gfx704 -DMAC_STR=GFX704 +// RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx705 \ +// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefixes=DEVICE_AMD,MACRO_AMD -DDEV_STR=gfx705 -DMAC_STR=GFX705 // RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx801 \ // RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefixes=DEVICE_AMD,MACRO_AMD -DDEV_STR=gfx801 -DMAC_STR=GFX801 diff --git a/clang/test/Driver/sycl-rtc-mode.cpp b/clang/test/Driver/sycl-rtc-mode.cpp new file mode 100644 index 000000000000..4a68c97db640 --- /dev/null +++ b/clang/test/Driver/sycl-rtc-mode.cpp @@ -0,0 +1,32 @@ +/// +/// Perform driver test for SYCL RTC mode. +/// + +/// Check that the '-fsycl-rtc-mode' is correctly forwarded to the device +/// compilation and only to the device compilation. + +// RUN: %clangxx -fsycl -fsycl-rtc-mode --no-offload-new-driver %s -### 2>&1 \ +// RUN: | FileCheck %s + +// RUN: %clangxx -fsycl -fsycl-rtc-mode --offload-new-driver %s -### 2>&1 \ +// RUN: | FileCheck %s + +// CHECK: clang{{.*}} "-fsycl-is-device" +// CHECK-SAME: -fsycl-rtc-mode +// CHECK: clang{{.*}} "-fsycl-is-host" +// CHECK-NOT: -fsycl-rtc-mode + + +/// Check that the '-fno-sycl-rtc-mode' is correctly forwarded to the device +/// compilation and only to the device compilation. + +// RUN: %clangxx -fsycl -fno-sycl-rtc-mode --no-offload-new-driver %s -### 2>&1 \ +// RUN: | FileCheck %s --check-prefix=NEGATIVE + +// RUN: %clangxx -fsycl -fno-sycl-rtc-mode --offload-new-driver %s -### 2>&1 \ +// RUN: | FileCheck %s --check-prefix=NEGATIVE + +// NEGATIVE: clang{{.*}} "-fsycl-is-device" +// NEGATIVE-SAME: -fno-sycl-rtc-mode +// NEGATIVE: clang{{.*}} "-fsycl-is-host" +// NEGATIVE-NOT: -fsycl-rtc-mode diff --git a/clang/tools/CMakeLists.txt b/clang/tools/CMakeLists.txt index adbb108129a2..9f238dd44109 100644 --- a/clang/tools/CMakeLists.txt +++ b/clang/tools/CMakeLists.txt @@ -7,7 +7,6 @@ add_clang_subdirectory(clang-diff) add_clang_subdirectory(clang-format) add_clang_subdirectory(clang-fuzzer) add_clang_subdirectory(clang-import-test) -add_clang_subdirectory(clang-nvlink-wrapper) add_clang_subdirectory(clang-linker-wrapper) add_clang_subdirectory(clang-nvlink-wrapper) add_clang_subdirectory(clang-offload-packager) diff --git a/devops/cts_exclude_filter_L0_GPU b/devops/cts_exclude_filter_L0_GPU index dfd8b4623bae..e11eb8767ed3 100644 --- a/devops/cts_exclude_filter_L0_GPU +++ b/devops/cts_exclude_filter_L0_GPU @@ -3,3 +3,5 @@ kernel_bundle marray # fix: https://github.com/KhronosGroup/SYCL-CTS/pull/964 accessor_legacy +# CMPLRLLVM-61839 +multi_ptr diff --git a/devops/cts_exclude_filter_OCL_CPU b/devops/cts_exclude_filter_OCL_CPU index 24f4a5c9eb41..d8317bae4691 100644 --- a/devops/cts_exclude_filter_OCL_CPU +++ b/devops/cts_exclude_filter_OCL_CPU @@ -7,3 +7,5 @@ math_builtin_api hierarchical # fix: https://github.com/KhronosGroup/SYCL-CTS/pull/964 accessor_legacy +# CMPLRLLVM-61839 +multi_ptr diff --git a/devops/dependencies.json b/devops/dependencies.json index 25283242aeed..944a3ffb7e8d 100644 --- a/devops/dependencies.json +++ b/devops/dependencies.json @@ -19,9 +19,9 @@ "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" }, "level_zero": { - "github_tag": "v1.18.3", - "version": "v1.18.3", - "url": "https://github.com/oneapi-src/level-zero/releases/tag/v1.18.3", + "github_tag": "v1.18.5", + "version": "v1.18.5", + "url": "https://github.com/oneapi-src/level-zero/releases/tag/v1.18.5", "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" }, "tbb": { diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index ab1a18f49557..72687d6f1863 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -424,9 +424,19 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) set ( has_distinct_generic_addrspace TRUE ) if ( ARCH STREQUAL spirv OR ARCH STREQUAL spirv64 ) set( opt_flags -O3 ) + list( APPEND build_flags -DCLC_SPIRV ) set( spvflags --spirv-max-version=1.1 ) + set( MACRO_ARCH SPIRV32 ) + if( ARCH STREQUAL spirv64 ) + set( MACRO_ARCH SPIRV64 ) + endif() elseif( ARCH STREQUAL clspv OR ARCH STREQUAL clspv64 ) set( opt_flags -O3 ) + list( APPEND build_flags -DCLC_CLSPV ) + set( MACRO_ARCH CLSPV32 ) + if( ARCH STREQUAL clspv64 ) + set( MACRO_ARCH CLSPV64 ) + endif() elseif( ARCH STREQUAL nvptx OR ARCH STREQUAL nvptx64 ) set( opt_flags -O3 "--nvvm-reflect-enable=false" ) set( has_distinct_generic_addrspace FALSE ) @@ -437,6 +447,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) set( has_distinct_generic_addrspace FALSE ) else() set( opt_flags -O3 ) + set( MACRO_ARCH ${ARCH} ) endif() # Enable SPIR-V builtin function declarations, so they don't @@ -483,7 +494,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) list(APPEND build_flags -D__unix__) endif() - string( TOUPPER "CLC_${ARCH}" CLC_TARGET_DEFINE ) + string( TOUPPER "CLC_${MACRO_ARCH}" CLC_TARGET_DEFINE ) list( APPEND build_flags -D__CLC_INTERNAL diff --git a/libclc/clc/include/clc/clcfunc.h b/libclc/clc/include/clc/clcfunc.h index e04a405a94be..6c95a07b1c18 100644 --- a/libclc/clc/include/clc/clcfunc.h +++ b/libclc/clc/include/clc/clcfunc.h @@ -10,9 +10,9 @@ // avoid inlines for SPIR-V related targets since we'll optimise later in the // chain -#if defined(CLC_SPIRV) || defined(CLC_SPIRV64) +#if defined(CLC_SPIRV) #define _CLC_DEF -#elif defined(CLC_CLSPV) || defined(CLC_CLSPV64) +#elif defined(CLC_CLSPV) #define _CLC_DEF __attribute__((noinline)) __attribute__((clspv_libclc_builtin)) #else #define _CLC_DEF __attribute__((always_inline)) diff --git a/libclc/generic/include/clc/integer/gentype.inc b/libclc/clc/include/clc/integer/gentype.inc similarity index 99% rename from libclc/generic/include/clc/integer/gentype.inc rename to libclc/clc/include/clc/integer/gentype.inc index 032bdc0cadba..e8ca005d4ccb 100644 --- a/libclc/generic/include/clc/integer/gentype.inc +++ b/libclc/clc/include/clc/integer/gentype.inc @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -//These 2 defines only change when switching between data sizes or base types to -//keep this file manageable. +// These 2 defines only change when switching between data sizes or base types +// to keep this file manageable. #define __CLC_GENSIZE 8 #define __CLC_SCALAR_GENTYPE char diff --git a/libclc/generic/include/clc/math/gentype.inc b/libclc/clc/include/clc/math/gentype.inc similarity index 100% rename from libclc/generic/include/clc/math/gentype.inc rename to libclc/clc/include/clc/math/gentype.inc diff --git a/libclc/clc/include/clc/shared/clc_clamp.h b/libclc/clc/include/clc/shared/clc_clamp.h new file mode 100644 index 000000000000..5c044c9a1a51 --- /dev/null +++ b/libclc/clc/include/clc/shared/clc_clamp.h @@ -0,0 +1,15 @@ +#if defined(CLC_CLSPV) || defined(CLC_SPIRV) +// clspv and spir-v targets provide their own OpenCL-compatible clamp +#define __clc_clamp clamp +#else + +#include +#include + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif diff --git a/libclc/clc/include/clc/shared/clc_clamp.inc b/libclc/clc/include/clc/shared/clc_clamp.inc new file mode 100644 index 000000000000..cf6b0b2789bc --- /dev/null +++ b/libclc/clc/include/clc/shared/clc_clamp.inc @@ -0,0 +1,9 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_clamp(__CLC_GENTYPE x, + __CLC_GENTYPE y, + __CLC_GENTYPE z); + +#ifndef __CLC_SCALAR +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_clamp(__CLC_GENTYPE x, + __CLC_SCALAR_GENTYPE y, + __CLC_SCALAR_GENTYPE z); +#endif diff --git a/libclc/clc/include/clc/shared/clc_max.h b/libclc/clc/include/clc/shared/clc_max.h new file mode 100644 index 000000000000..2825640f6c29 --- /dev/null +++ b/libclc/clc/include/clc/shared/clc_max.h @@ -0,0 +1,12 @@ +#if defined(CLC_CLSPV) || defined(CLC_SPIRV) +// clspv and spir-v targets provide their own OpenCL-compatible max +#define __clc_max max +#else + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif diff --git a/libclc/clc/include/clc/shared/clc_max.inc b/libclc/clc/include/clc/shared/clc_max.inc new file mode 100644 index 000000000000..bddb3fa3d920 --- /dev/null +++ b/libclc/clc/include/clc/shared/clc_max.inc @@ -0,0 +1,7 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_max(__CLC_GENTYPE a, + __CLC_GENTYPE b); + +#ifndef __CLC_SCALAR +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_max(__CLC_GENTYPE a, + __CLC_SCALAR_GENTYPE b); +#endif diff --git a/libclc/clc/include/clc/shared/clc_min.h b/libclc/clc/include/clc/shared/clc_min.h new file mode 100644 index 000000000000..0b7ee140b8f4 --- /dev/null +++ b/libclc/clc/include/clc/shared/clc_min.h @@ -0,0 +1,12 @@ +#if defined(CLC_CLSPV) || defined(CLC_SPIRV) +// clspv and spir-v targets provide their own OpenCL-compatible min +#define __clc_min min +#else + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif diff --git a/libclc/clc/include/clc/shared/clc_min.inc b/libclc/clc/include/clc/shared/clc_min.inc new file mode 100644 index 000000000000..3e1da96df43d --- /dev/null +++ b/libclc/clc/include/clc/shared/clc_min.inc @@ -0,0 +1,7 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_min(__CLC_GENTYPE a, + __CLC_GENTYPE b); + +#ifndef __CLC_SCALAR +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_min(__CLC_GENTYPE a, + __CLC_SCALAR_GENTYPE b); +#endif diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index fa2e4f50b99c..db523adb6383 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -1 +1,4 @@ geometric/clc_dot.cl +shared/clc_clamp.cl +shared/clc_max.cl +shared/clc_min.cl diff --git a/libclc/clc/lib/generic/shared/clc_clamp.cl b/libclc/clc/lib/generic/shared/clc_clamp.cl new file mode 100644 index 000000000000..1d40da3cf229 --- /dev/null +++ b/libclc/clc/lib/generic/shared/clc_clamp.cl @@ -0,0 +1,7 @@ +#include + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libclc/generic/libspirv/core/shared/clc_clamp.inc b/libclc/clc/lib/generic/shared/clc_clamp.inc similarity index 62% rename from libclc/generic/libspirv/core/shared/clc_clamp.inc rename to libclc/clc/lib/generic/shared/clc_clamp.inc index 571f6470e570..da67cd2ad69d 100644 --- a/libclc/generic/libspirv/core/shared/clc_clamp.inc +++ b/libclc/clc/lib/generic/shared/clc_clamp.inc @@ -1,11 +1,3 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_clamp(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z) { diff --git a/libclc/clc/lib/generic/shared/clc_max.cl b/libclc/clc/lib/generic/shared/clc_max.cl new file mode 100644 index 000000000000..e1050ed0007e --- /dev/null +++ b/libclc/clc/lib/generic/shared/clc_max.cl @@ -0,0 +1,7 @@ +#include + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libclc/generic/libspirv/core/shared/clc_max.inc b/libclc/clc/lib/generic/shared/clc_max.inc similarity index 50% rename from libclc/generic/libspirv/core/shared/clc_max.inc rename to libclc/clc/lib/generic/shared/clc_max.inc index 882f29ce30d9..f4234cb359d8 100644 --- a/libclc/generic/libspirv/core/shared/clc_max.inc +++ b/libclc/clc/lib/generic/shared/clc_max.inc @@ -1,11 +1,3 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_max(__CLC_GENTYPE a, __CLC_GENTYPE b) { return (a > b ? a : b); diff --git a/libclc/clc/lib/generic/shared/clc_min.cl b/libclc/clc/lib/generic/shared/clc_min.cl new file mode 100644 index 000000000000..12a26f535240 --- /dev/null +++ b/libclc/clc/lib/generic/shared/clc_min.cl @@ -0,0 +1,7 @@ +#include + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libclc/generic/libspirv/core/shared/clc_min.inc b/libclc/clc/lib/generic/shared/clc_min.inc similarity index 50% rename from libclc/generic/libspirv/core/shared/clc_min.inc rename to libclc/clc/lib/generic/shared/clc_min.inc index d8a51291dbc2..e9c85ddd3aff 100644 --- a/libclc/generic/libspirv/core/shared/clc_min.inc +++ b/libclc/clc/lib/generic/shared/clc_min.inc @@ -1,11 +1,3 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_min(__CLC_GENTYPE a, __CLC_GENTYPE b) { return (b < a ? b : a); diff --git a/libclc/generic/lib/common/smoothstep.cl b/libclc/generic/lib/common/smoothstep.cl index 8d18024c446d..99553cac901d 100644 --- a/libclc/generic/lib/common/smoothstep.cl +++ b/libclc/generic/lib/common/smoothstep.cl @@ -49,7 +49,7 @@ _CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, smoothstep, double, _CLC_V_S_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, smoothstep, double, double, double); -#if !defined(CLC_SPIRV) && !defined(CLC_SPIRV64) +#if !defined(CLC_SPIRV) SMOOTH_STEP_DEF(float, double, SMOOTH_STEP_IMPL_D); SMOOTH_STEP_DEF(double, float, SMOOTH_STEP_IMPL_D); diff --git a/libclc/generic/lib/common/step.cl b/libclc/generic/lib/common/step.cl index f8cbd125c063..46e86249b57e 100644 --- a/libclc/generic/lib/common/step.cl +++ b/libclc/generic/lib/common/step.cl @@ -44,7 +44,7 @@ STEP_DEF(double, double); _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, step, double, double); _CLC_V_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, step, double, double); -#if !defined(CLC_SPIRV) && !defined(CLC_SPIRV64) +#if !defined(CLC_SPIRV) STEP_DEF(float, double); STEP_DEF(double, float); diff --git a/libclc/generic/lib/math/clc_ldexp.cl b/libclc/generic/lib/math/clc_ldexp.cl index ae6117b7b292..438c31835a36 100644 --- a/libclc/generic/lib/math/clc_ldexp.cl +++ b/libclc/generic/lib/math/clc_ldexp.cl @@ -20,76 +20,78 @@ * THE SOFTWARE. */ -#include -#include "config.h" #include "../clcmacro.h" +#include "config.h" #include "math.h" +#include +#include _CLC_DEF _CLC_OVERLOAD float __clc_ldexp(float x, int n) { - if (!__clc_fp32_subnormals_supported()) { - - // This treats subnormals as zeros - int i = as_int(x); - int e = (i >> 23) & 0xff; - int m = i & 0x007fffff; - int s = i & 0x80000000; - int v = add_sat(e, n); - v = clamp(v, 0, 0xff); - int mr = e == 0 | v == 0 | v == 0xff ? 0 : m; - int c = e == 0xff; - mr = c ? m : mr; - int er = c ? e : v; - er = e ? er : e; - return as_float( s | (er << 23) | mr ); - } - - /* supports denormal values */ - const int multiplier = 24; - float val_f; - uint val_ui; - uint sign; - int exponent; - val_ui = as_uint(x); - sign = val_ui & 0x80000000; - val_ui = val_ui & 0x7fffffff;/* remove the sign bit */ - int val_x = val_ui; - - exponent = val_ui >> 23; /* get the exponent */ - int dexp = exponent; - - /* denormal support */ - int fbh = 127 - (as_uint((float)(as_float(val_ui | 0x3f800000) - 1.0f)) >> 23); - int dexponent = 25 - fbh; - uint dval_ui = (( (val_ui << fbh) & 0x007fffff) | (dexponent << 23)); - int ex = dexponent + n - multiplier; - dexponent = ex; - uint val = sign | (ex << 23) | (dval_ui & 0x007fffff); - int ex1 = dexponent + multiplier; - ex1 = -ex1 +25; - dval_ui = (((dval_ui & 0x007fffff )| 0x800000) >> ex1); - dval_ui = dexponent > 0 ? val :dval_ui; - dval_ui = dexponent > 254 ? 0x7f800000 :dval_ui; /*overflow*/ - dval_ui = dexponent < -multiplier ? 0 : dval_ui; /*underflow*/ - dval_ui = dval_ui | sign; - val_f = as_float(dval_ui); - - exponent += n; - - val = sign | (exponent << 23) | (val_ui & 0x007fffff); - ex1 = exponent + multiplier; - ex1 = -ex1 +25; - val_ui = (((val_ui & 0x007fffff )| 0x800000) >> ex1); - val_ui = exponent > 0 ? val :val_ui; - val_ui = exponent > 254 ? 0x7f800000 :val_ui; /*overflow*/ - val_ui = exponent < -multiplier ? 0 : val_ui; /*underflow*/ - val_ui = val_ui | sign; - - val_ui = dexp == 0? dval_ui : val_ui; - val_f = as_float(val_ui); - - val_f = isnan(x) | isinf(x) | val_x == 0 ? x : val_f; - return val_f; + if (!__clc_fp32_subnormals_supported()) { + + // This treats subnormals as zeros + int i = as_int(x); + int e = (i >> 23) & 0xff; + int m = i & 0x007fffff; + int s = i & 0x80000000; + int v = add_sat(e, n); + v = __clc_clamp(v, 0, 0xff); + int mr = e == 0 | v == 0 | v == 0xff ? 0 : m; + int c = e == 0xff; + mr = c ? m : mr; + int er = c ? e : v; + er = e ? er : e; + return as_float(s | (er << 23) | mr); + } + + /* supports denormal values */ + const int multiplier = 24; + float val_f; + uint val_ui; + uint sign; + int exponent; + val_ui = as_uint(x); + sign = val_ui & 0x80000000; + val_ui = val_ui & 0x7fffffff; /* remove the sign bit */ + int val_x = val_ui; + + exponent = val_ui >> 23; /* get the exponent */ + int dexp = exponent; + + /* denormal support */ + int fbh = + 127 - (as_uint((float)(as_float(val_ui | 0x3f800000) - 1.0f)) >> 23); + int dexponent = 25 - fbh; + uint dval_ui = (((val_ui << fbh) & 0x007fffff) | (dexponent << 23)); + int ex = dexponent + n - multiplier; + dexponent = ex; + uint val = sign | (ex << 23) | (dval_ui & 0x007fffff); + int ex1 = dexponent + multiplier; + ex1 = -ex1 + 25; + dval_ui = (((dval_ui & 0x007fffff) | 0x800000) >> ex1); + dval_ui = dexponent > 0 ? val : dval_ui; + dval_ui = dexponent > 254 ? 0x7f800000 : dval_ui; /*overflow*/ + dval_ui = dexponent < -multiplier ? 0 : dval_ui; /*underflow*/ + dval_ui = dval_ui | sign; + val_f = as_float(dval_ui); + + exponent += n; + + val = sign | (exponent << 23) | (val_ui & 0x007fffff); + ex1 = exponent + multiplier; + ex1 = -ex1 + 25; + val_ui = (((val_ui & 0x007fffff) | 0x800000) >> ex1); + val_ui = exponent > 0 ? val : val_ui; + val_ui = exponent > 254 ? 0x7f800000 : val_ui; /*overflow*/ + val_ui = exponent < -multiplier ? 0 : val_ui; /*underflow*/ + val_ui = val_ui | sign; + + val_ui = dexp == 0 ? dval_ui : val_ui; + val_f = as_float(val_ui); + + val_f = isnan(x) | isinf(x) | val_x == 0 ? x : val_f; + return val_f; } #ifdef cl_khr_fp64 @@ -97,32 +99,32 @@ _CLC_DEF _CLC_OVERLOAD float __clc_ldexp(float x, int n) { #pragma OPENCL EXTENSION cl_khr_fp64 : enable _CLC_DEF _CLC_OVERLOAD double __clc_ldexp(double x, int n) { - long l = as_ulong(x); - int e = (l >> 52) & 0x7ff; - long s = l & 0x8000000000000000; + long l = as_ulong(x); + int e = (l >> 52) & 0x7ff; + long s = l & 0x8000000000000000; - ulong ux = as_ulong(x * 0x1.0p+53); - int de = ((int)(ux >> 52) & 0x7ff) - 53; - int c = e == 0; - e = c ? de: e; + ulong ux = as_ulong(x * 0x1.0p+53); + int de = ((int)(ux >> 52) & 0x7ff) - 53; + int c = e == 0; + e = c ? de : e; - ux = c ? ux : l; + ux = c ? ux : l; - int v = e + n; - v = clamp(v, -0x7ff, 0x7ff); + int v = e + n; + v = __clc_clamp(v, -0x7ff, 0x7ff); - ux &= ~EXPBITS_DP64; + ux &= ~EXPBITS_DP64; - double mr = as_double(ux | ((ulong)(v+53) << 52)); - mr = mr * 0x1.0p-53; + double mr = as_double(ux | ((ulong)(v + 53) << 52)); + mr = mr * 0x1.0p-53; - mr = v > 0 ? as_double(ux | ((ulong)v << 52)) : mr; + mr = v > 0 ? as_double(ux | ((ulong)v << 52)) : mr; - mr = v == 0x7ff ? as_double(s | PINFBITPATT_DP64) : mr; - mr = v < -53 ? as_double(s) : mr; + mr = v == 0x7ff ? as_double(s | PINFBITPATT_DP64) : mr; + mr = v < -53 ? as_double(s) : mr; - mr = ((n == 0) | isinf(x) | (x == 0) ) ? x : mr; - return mr; + mr = ((n == 0) | isinf(x) | (x == 0)) ? x : mr; + return mr; } #endif @@ -132,7 +134,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_ldexp(double x, int n) { #pragma OPENCL EXTENSION cl_khr_fp16 : enable _CLC_OVERLOAD _CLC_DEF half __clc_ldexp(half x, int n) { - return (half)__clc_ldexp((float)x, n); + return (half)__clc_ldexp((float)x, n); } _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __clc_ldexp, half, int); diff --git a/libclc/generic/lib/math/math.h b/libclc/generic/lib/math/math.h index 62a4c925db51..b1f82b7991ea 100644 --- a/libclc/generic/lib/math/math.h +++ b/libclc/generic/lib/math/math.h @@ -40,7 +40,7 @@ #if (defined __AMDGCN__ || defined __R600__) && !defined __HAS_FMAF__ #define HAVE_HW_FMA32() (0) -#elif defined CLC_SPIRV || defined CLC_SPIRV64 +#elif defined(CLC_SPIRV) bool __attribute__((noinline)) __clc_runtime_has_hw_fma32(void); #define HAVE_HW_FMA32() __clc_runtime_has_hw_fma32() #else diff --git a/libclc/generic/lib/shared/clamp.cl b/libclc/generic/lib/shared/clamp.cl index 51a009281be2..f470fc822f75 100644 --- a/libclc/generic/lib/shared/clamp.cl +++ b/libclc/generic/lib/shared/clamp.cl @@ -1,6 +1,5 @@ #include -#include -#include +#include #define __CLC_BODY #include diff --git a/libclc/generic/lib/shared/max.cl b/libclc/generic/lib/shared/max.cl index a753b702f658..2266d5905afd 100644 --- a/libclc/generic/lib/shared/max.cl +++ b/libclc/generic/lib/shared/max.cl @@ -1,6 +1,5 @@ #include -#include -#include +#include #define __CLC_BODY #include diff --git a/libclc/generic/lib/shared/max.inc b/libclc/generic/lib/shared/max.inc index 3c83e29c3f2d..ec433a89c6e9 100644 --- a/libclc/generic/lib/shared/max.inc +++ b/libclc/generic/lib/shared/max.inc @@ -3,7 +3,8 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE max(__CLC_GENTYPE a, __CLC_GENTYPE b) { } #ifndef __CLC_SCALAR -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE max(__CLC_GENTYPE a, __CLC_SCALAR_GENTYPE b) { +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE max(__CLC_GENTYPE a, + __CLC_SCALAR_GENTYPE b) { return __clc_max(a, b); } #endif diff --git a/libclc/generic/lib/shared/min.cl b/libclc/generic/lib/shared/min.cl index 31c47872f6aa..f5c4d57f4b8d 100644 --- a/libclc/generic/lib/shared/min.cl +++ b/libclc/generic/lib/shared/min.cl @@ -1,6 +1,5 @@ #include -#include -#include +#include #define __CLC_BODY #include diff --git a/libclc/generic/lib/shared/min.inc b/libclc/generic/lib/shared/min.inc index f70956479d6d..6a00944cbe35 100644 --- a/libclc/generic/lib/shared/min.inc +++ b/libclc/generic/lib/shared/min.inc @@ -3,7 +3,8 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE min(__CLC_GENTYPE a, __CLC_GENTYPE b) { } #ifndef __CLC_SCALAR -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE min(__CLC_GENTYPE a, __CLC_SCALAR_GENTYPE b) { +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE min(__CLC_GENTYPE a, + __CLC_SCALAR_GENTYPE b) { return __clc_min(a, b); } #endif diff --git a/libclc/generic/libspirv/SOURCES b/libclc/generic/libspirv/SOURCES index a222a1f7281a..10d86bd5a67f 100644 --- a/libclc/generic/libspirv/SOURCES +++ b/libclc/generic/libspirv/SOURCES @@ -39,9 +39,6 @@ core/integer/clc_mul_hi.cl core/integer/clc_rhadd.cl core/integer/clc_sub_sat.cl core/integer/clc_upsample.cl -core/shared/clc_clamp.cl -core/shared/clc_max.cl -core/shared/clc_min.cl geometric/cross.cl geometric/distance.cl geometric/dot.cl diff --git a/libclc/generic/libspirv/core/shared/clc_clamp.cl b/libclc/generic/libspirv/core/shared/clc_clamp.cl deleted file mode 100644 index d85c0a420f16..000000000000 --- a/libclc/generic/libspirv/core/shared/clc_clamp.cl +++ /dev/null @@ -1,15 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include - -#define __CLC_BODY -#include - -#define __CLC_BODY -#include diff --git a/libclc/generic/libspirv/core/shared/clc_max.cl b/libclc/generic/libspirv/core/shared/clc_max.cl deleted file mode 100644 index 211c2c3ebba1..000000000000 --- a/libclc/generic/libspirv/core/shared/clc_max.cl +++ /dev/null @@ -1,15 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include - -#define __CLC_BODY -#include - -#define __CLC_BODY -#include diff --git a/libclc/generic/libspirv/core/shared/clc_min.cl b/libclc/generic/libspirv/core/shared/clc_min.cl deleted file mode 100644 index 9c7ea87286d8..000000000000 --- a/libclc/generic/libspirv/core/shared/clc_min.cl +++ /dev/null @@ -1,15 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include - -#define __CLC_BODY -#include - -#define __CLC_BODY -#include diff --git a/libclc/generic/libspirv/math/clc_hypot.cl b/libclc/generic/libspirv/math/clc_hypot.cl index b34a5e5107b4..2e0c2737e164 100644 --- a/libclc/generic/libspirv/math/clc_hypot.cl +++ b/libclc/generic/libspirv/math/clc_hypot.cl @@ -8,82 +8,84 @@ #include +#include +#include #include #include #include #include -// Returns sqrt(x*x + y*y) with no overflow or underflow unless the result warrants it -_CLC_DEF _CLC_OVERLOAD float __clc_hypot(float x, float y) -{ - uint ux = as_uint(x); - uint aux = ux & EXSIGNBIT_SP32; - uint uy = as_uint(y); - uint auy = uy & EXSIGNBIT_SP32; - float retval; - int c = aux > auy; - ux = c ? aux : auy; - uy = c ? auy : aux; - - int xexp = __spirv_ocl_s_clamp( - (int)(ux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32, -126, 126); - float fx_exp = as_float((xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); - float fi_exp = as_float((-xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); - float fx = as_float(ux) * fi_exp; - float fy = as_float(uy) * fi_exp; - retval = __spirv_ocl_sqrt(__spirv_ocl_mad(fx, fx, fy * fy)) * fx_exp; - - retval = ux > PINFBITPATT_SP32 || uy == 0 ? as_float(ux) : retval; - retval = ux == PINFBITPATT_SP32 || uy == PINFBITPATT_SP32 - ? as_float(PINFBITPATT_SP32) - : retval; - return retval; +// Returns sqrt(x*x + y*y) with no overflow or underflow unless the result +// warrants it +_CLC_DEF _CLC_OVERLOAD float __clc_hypot(float x, float y) { + uint ux = as_uint(x); + uint aux = ux & EXSIGNBIT_SP32; + uint uy = as_uint(y); + uint auy = uy & EXSIGNBIT_SP32; + float retval; + int c = aux > auy; + ux = c ? aux : auy; + uy = c ? auy : aux; + + int xexp = + __clc_clamp((int)(ux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32, -126, 126); + float fx_exp = as_float((xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); + float fi_exp = as_float((-xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); + float fx = as_float(ux) * fi_exp; + float fy = as_float(uy) * fi_exp; + retval = __spirv_ocl_sqrt(__spirv_ocl_mad(fx, fx, fy * fy)) * fx_exp; + + retval = ux > PINFBITPATT_SP32 || uy == 0 ? as_float(ux) : retval; + retval = ux == PINFBITPATT_SP32 || uy == PINFBITPATT_SP32 + ? as_float(PINFBITPATT_SP32) + : retval; + return retval; } _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_hypot, float, float) #ifdef cl_khr_fp64 -_CLC_DEF _CLC_OVERLOAD double __clc_hypot(double x, double y) -{ - ulong ux = as_ulong(x) & ~SIGNBIT_DP64; - int xexp = ux >> EXPSHIFTBITS_DP64; - x = as_double(ux); +_CLC_DEF _CLC_OVERLOAD double __clc_hypot(double x, double y) { + ulong ux = as_ulong(x) & ~SIGNBIT_DP64; + int xexp = ux >> EXPSHIFTBITS_DP64; + x = as_double(ux); - ulong uy = as_ulong(y) & ~SIGNBIT_DP64; - int yexp = uy >> EXPSHIFTBITS_DP64; - y = as_double(uy); + ulong uy = as_ulong(y) & ~SIGNBIT_DP64; + int yexp = uy >> EXPSHIFTBITS_DP64; + y = as_double(uy); - int c = xexp > EXPBIAS_DP64 + 500 | yexp > EXPBIAS_DP64 + 500; - double preadjust = c ? 0x1.0p-600 : 1.0; - double postadjust = c ? 0x1.0p+600 : 1.0; + int c = xexp > EXPBIAS_DP64 + 500 | yexp > EXPBIAS_DP64 + 500; + double preadjust = c ? 0x1.0p-600 : 1.0; + double postadjust = c ? 0x1.0p+600 : 1.0; - c = xexp < EXPBIAS_DP64 - 500 | yexp < EXPBIAS_DP64 - 500; - preadjust = c ? 0x1.0p+600 : preadjust; - postadjust = c ? 0x1.0p-600 : postadjust; + c = xexp < EXPBIAS_DP64 - 500 | yexp < EXPBIAS_DP64 - 500; + preadjust = c ? 0x1.0p+600 : preadjust; + postadjust = c ? 0x1.0p-600 : postadjust; - double ax = x * preadjust; - double ay = y * preadjust; + double ax = x * preadjust; + double ay = y * preadjust; - // The post adjust may overflow, but this can't be avoided in any case - double r = __spirv_ocl_sqrt(__spirv_ocl_fma(ax, ax, ay * ay)) * postadjust; + // The post adjust may overflow, but this can't be avoided in any case + double r = __spirv_ocl_sqrt(__spirv_ocl_fma(ax, ax, ay * ay)) * postadjust; - // If the difference in exponents between x and y is large - double s = x + y; - c = __spirv_ocl_s_abs(xexp - yexp) > MANTLENGTH_DP64 + 1; - r = c ? s : r; + // If the difference in exponents between x and y is large + double s = x + y; + c = __spirv_ocl_s_abs(xexp - yexp) > MANTLENGTH_DP64 + 1; + r = c ? s : r; - // Check for NaN - //c = x != x | y != y; - c = __spirv_IsNan(x) | __spirv_IsNan(y); - r = c ? as_double(QNANBITPATT_DP64) : r; + // Check for NaN + // c = x != x | y != y; + c = __spirv_IsNan(x) | __spirv_IsNan(y); + r = c ? as_double(QNANBITPATT_DP64) : r; - // If either is Inf, we must return Inf - c = x == as_double(PINFBITPATT_DP64) | y == as_double(PINFBITPATT_DP64); - r = c ? as_double(PINFBITPATT_DP64) : r; + // If either is Inf, we must return Inf + c = x == as_double(PINFBITPATT_DP64) | y == as_double(PINFBITPATT_DP64); + r = c ? as_double(PINFBITPATT_DP64) : r; - return r; + return r; } -_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_hypot, double, double) +_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_hypot, double, + double) #endif #ifdef cl_khr_fp16 diff --git a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td index 686c1de58a9f..5fc1cf79a1ca 100644 --- a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td +++ b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td @@ -320,6 +320,9 @@ def : HipTargetInfo<"amd_gpu_gfx906", !listconcat(HipMinAspects, AllUSMAspects), def : HipTargetInfo<"amd_gpu_gfx700", HipMinAspects, HipSubgroupSizesGCN2>; def : HipTargetInfo<"amd_gpu_gfx701", HipMinAspects, HipSubgroupSizesGCN2>; def : HipTargetInfo<"amd_gpu_gfx702", HipMinAspects, HipSubgroupSizesGCN2>; +def : HipTargetInfo<"amd_gpu_gfx703", HipMinAspects, HipSubgroupSizesGCN2>; +def : HipTargetInfo<"amd_gpu_gfx704", HipMinAspects, HipSubgroupSizesGCN2>; +def : HipTargetInfo<"amd_gpu_gfx705", HipMinAspects, HipSubgroupSizesGCN2>; def : HipTargetInfo<"amd_gpu_gfx801", HipMinAspects, HipSubgroupSizesGCN3>; def : HipTargetInfo<"amd_gpu_gfx802", HipMinAspects, HipSubgroupSizesGCN3>; def : HipTargetInfo<"amd_gpu_gfx803", HipMinAspects, HipSubgroupSizesGCN3>; diff --git a/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp b/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp index 6d8c248a8160..126a03bdf03b 100644 --- a/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp +++ b/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp @@ -647,6 +647,38 @@ void setSyclFixedTargetsMD(const std::vector &EntryPoints, F->setMetadata("sycl_fixed_targets", MDN); } +void collectVirtualFunctionSetInfo( + Function &F, StringMap> &VirtualFunctionSets) { + if (!F.hasFnAttribute("indirectly-callable")) + return; + Attribute IndirectlyCallableAttr = F.getFnAttribute("indirectly-callable"); + StringRef SetName = IndirectlyCallableAttr.getValueAsString(); + VirtualFunctionSets[SetName].push_back(&F); +} + +// For each set S of virtual functions that F declares, +// propagate S through the CG and then add the aspects +// used by S to F. +void processDeclaredVirtualFunctionSets( + Function *F, CallGraphTy &CG, FunctionToAspectsMapTy &AspectsMap, + SmallPtrSet &Visited, + StringMap> &VirtualFunctionSets) { + if (!F->hasFnAttribute("calls-indirectly")) + return; + Attribute CallsIndirectlyAttr = F->getFnAttribute("calls-indirectly"); + SmallVector DeclaredVirtualFunctionSetNames; + CallsIndirectlyAttr.getValueAsString().split(DeclaredVirtualFunctionSetNames, + ","); + auto &AspectsF = AspectsMap[F]; + for (auto Name : DeclaredVirtualFunctionSetNames) { + for (auto VFn : VirtualFunctionSets[Name]) { + propagateAspectsThroughCG(VFn, CG, AspectsMap, Visited); + for (auto Aspect : AspectsMap[VFn]) + AspectsF.insert(Aspect); + } + } +} + /// Returns a map of functions with corresponding used aspects. std::pair buildFunctionsToAspectsMap(Module &M, TypeToAspectsMapTy &TypesWithAspects, @@ -655,16 +687,21 @@ buildFunctionsToAspectsMap(Module &M, TypeToAspectsMapTy &TypesWithAspects, bool ValidateAspects, bool FP64ConvEmu) { FunctionToAspectsMapTy FunctionToUsedAspects; FunctionToAspectsMapTy FunctionToDeclaredAspects; + StringMap> VirtualFunctionSets; CallGraphTy CG; for (Function &F : M.functions()) { processFunction(F, FunctionToUsedAspects, FunctionToDeclaredAspects, TypesWithAspects, CG, AspectValues, FP64ConvEmu); + collectVirtualFunctionSetInfo(F, VirtualFunctionSets); } SmallPtrSet Visited; - for (Function *F : EntryPoints) + for (Function *F : EntryPoints) { propagateAspectsThroughCG(F, CG, FunctionToUsedAspects, Visited); + processDeclaredVirtualFunctionSets(F, CG, FunctionToUsedAspects, Visited, + VirtualFunctionSets); + } if (ValidateAspects) validateUsedAspectsForFunctions(FunctionToUsedAspects, AspectValues, diff --git a/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-1.ll b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-1.ll new file mode 100644 index 000000000000..709ca33eae3b --- /dev/null +++ b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-1.ll @@ -0,0 +1,20 @@ +; RUN: opt -passes=sycl-propagate-aspects-usage < %s -S | FileCheck %s + +; CHECK: @vfn() #0 !sycl_used_aspects ![[#aspects:]] +define spir_func void @vfn() #0 { + %tmp = alloca double + ret void +} + +; CHECK: @foo() #1 !sycl_used_aspects ![[#aspects]] +define spir_kernel void @foo() #1 { + ret void +} + +; CHECK: ![[#aspects]] = !{i32 6} + +attributes #0 = { "indirectly-callable"="_ZTSv" } +attributes #1 = { "calls-indirectly"="_ZTSv" } + +!sycl_aspects = !{!0} +!0 = !{!"fp64", i32 6} diff --git a/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-2.ll b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-2.ll new file mode 100644 index 000000000000..ae600413378f --- /dev/null +++ b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-2.ll @@ -0,0 +1,36 @@ +; RUN: opt -passes=sycl-propagate-aspects-usage < %s -S | FileCheck %s + +%Foo = type { i32 } +%Bar = type { i32 } + +; CHECK: @vfnFoo() #0 !sycl_used_aspects ![[#aspectsFoo:]] +define spir_func void @vfnFoo() #0 { + %tmp = alloca %Foo + ret void +} + +; CHECK: @vfnBar() #1 !sycl_used_aspects ![[#aspectsBar:]] +define spir_func void @vfnBar() #1 { + %tmp = alloca %Bar + ret void +} + +; CHECK: @kernel() #2 !sycl_used_aspects ![[#aspectsKernel:]] +define spir_kernel void @kernel() #2 { + ret void +} + +; CHECK: ![[#aspectsFoo]] = !{i32 1} +; CHECK: ![[#aspectsBar]] = !{i32 2} +; CHECK: ![[#aspectsKernel]] = !{i32 1, i32 2} + +attributes #0 = { "indirectly-callable"="setFoo" } +attributes #1 = { "indirectly-callable"="setBar" } +attributes #2 = { "calls-indirectly"="setFoo,setBar" } + +!sycl_aspects = !{!0} +!0 = !{!"fp64", i32 6} + +!sycl_types_that_use_aspects = !{!1, !2} +!1 = !{!"Foo", i32 1} +!2 = !{!"Bar", i32 2} diff --git a/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-3.ll b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-3.ll new file mode 100644 index 000000000000..ada0f533ced5 --- /dev/null +++ b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-3.ll @@ -0,0 +1,51 @@ +; RUN: opt -passes=sycl-propagate-aspects-usage < %s -S | FileCheck %s + +%Foo = type { i32 } +%Bar = type { i32 } + +; CHECK: @vfnFoo() #0 !sycl_used_aspects ![[#aspectsFoo:]] +define spir_func void @vfnFoo() #0 { + call void @subFoo() + ret void +} + +define spir_func void @subFoo() { + %tmp = alloca %Foo + ret void +} + +; CHECK: @vfnBar() #1 !sycl_used_aspects ![[#aspectsBar:]] +define spir_func void @vfnBar() #1 { + call void @subBar() + ret void +} + +define spir_func void @subBar() { + %tmp = alloca %Bar + ret void +} + +; CHECK: @kernelA() #2 !sycl_used_aspects ![[#aspectsFoo]] +define spir_kernel void @kernelA() #2 { + ret void +} + +; CHECK: @kernelB() #3 !sycl_used_aspects ![[#aspectsBar]] +define spir_kernel void @kernelB() #3 { + ret void +} + +; CHECK: ![[#aspectsFoo]] = !{i32 1} +; CHECK: ![[#aspectsBar]] = !{i32 2} + +attributes #0 = { "indirectly-callable"="setFoo" } +attributes #1 = { "indirectly-callable"="setBar" } +attributes #2 = { "calls-indirectly"="setFoo" } +attributes #3 = { "calls-indirectly"="setBar" } + +!sycl_aspects = !{!0} +!0 = !{!"fp64", i32 6} + +!sycl_types_that_use_aspects = !{!1, !2} +!1 = !{!"Foo", i32 1} +!2 = !{!"Bar", i32 2} diff --git a/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-4.ll b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-4.ll new file mode 100644 index 000000000000..700a2dc7a8b7 --- /dev/null +++ b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/VirtualFunctions/virtual-functions-4.ll @@ -0,0 +1,26 @@ +; RUN: opt -passes=sycl-propagate-aspects-usage < %s -S | FileCheck %s +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" +target triple = "spir64-unknown-unknown" + +@vtable = linkonce_odr dso_local unnamed_addr addrspace(1) constant { [3 x ptr addrspace(4)] } { [3 x ptr addrspace(4)] [ptr addrspace(4) null, ptr addrspace(4) null, ptr addrspace(4) addrspacecast (ptr @foo to ptr addrspace(4))] }, align 8 + +; CHECK: @foo() #0 !sycl_used_aspects ![[#aspects:]] +define linkonce_odr spir_func void @foo() #0 { +entry: + %tmp = alloca double + ret void +} + +; CHECK-NOT: @construct({{.*}}){{.*}}!sycl_used_aspects +define weak_odr dso_local spir_kernel void @construct(ptr addrspace(1) noundef align 8 %_arg_StorageAcc) { +entry: + store ptr addrspace(1) getelementptr inbounds inrange(-16, 8) (i8, ptr addrspace(1) @vtable, i64 16), ptr addrspace(1) %_arg_StorageAcc, align 8 + ret void +} + +; CHECK: ![[#aspects]] = !{i32 6} + +attributes #0 = { "indirectly-callable"="set-foo" } + +!sycl_aspects = !{!0} +!0 = !{!"fp64", i32 6} \ No newline at end of file diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 01bd0cd4d958..a7b68befa96b 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit b0a9e2be61ad42d3447f1f246120ab25119a03e0 -# Merge: fa8cc8ec a0cf2ce2 +# commit 2858a8a28d0b6524a3b2b0e25a597d1c8295ce9d +# Merge: a5a649f3 34b66fda # Author: Callum Fare -# Date: Mon Nov 4 10:00:08 2024 +0000 -# Merge pull request #2165 from aarongreig/aaron/makeUSMPoolsOptional -# Make USM pools optional with a device query to report support. -set(UNIFIED_RUNTIME_TAG b0a9e2be61ad42d3447f1f246120ab25119a03e0) +# Date: Wed Nov 6 16:45:49 2024 +0000 +# Merge pull request #2276 from rafbiels/rafbiels/fix-hip-evbase +# Set the right HIP device before creating base event counter +set(UNIFIED_RUNTIME_TAG 2858a8a28d0b6524a3b2b0e25a597d1c8295ce9d) diff --git a/sycl/doc/UsersManual.md b/sycl/doc/UsersManual.md index 3f184edc12de..efdf03249616 100644 --- a/sycl/doc/UsersManual.md +++ b/sycl/doc/UsersManual.md @@ -95,6 +95,9 @@ and not recommended to use in production environment. * amd_gpu_gfx700 - AMD GCN GFX7 (Sea Islands (CI)) architecture * amd_gpu_gfx701 - AMD GCN GFX7 (Sea Islands (CI)) architecture * amd_gpu_gfx702 - AMD GCN GFX7 (Sea Islands (CI)) architecture + * amd_gpu_gfx703 - AMD GCN GFX7 (Sea Islands (CI)) architecture + * amd_gpu_gfx704 - AMD GCN GFX7 (Sea Islands (CI)) architecture + * amd_gpu_gfx705 - AMD GCN GFX7 (Sea Islands (CI)) architecture * amd_gpu_gfx801 - AMD GCN GFX8 (Volcanic Islands (VI)) architecture * amd_gpu_gfx802 - AMD GCN GFX8 (Volcanic Islands (VI)) architecture * amd_gpu_gfx803 - AMD GCN GFX8 (Volcanic Islands (VI)) architecture @@ -104,13 +107,13 @@ and not recommended to use in production environment. * amd_gpu_gfx902 - AMD GCN GFX9 (Vega) architecture * amd_gpu_gfx904 - AMD GCN GFX9 (Vega) architecture * amd_gpu_gfx906 - AMD GCN GFX9 (Vega) architecture - * amd_gpu_gfx908 - AMD GCN GFX9 (Vega) architecture + * amd_gpu_gfx908 - AMD GCN GFX9 (CDNA1) architecture * amd_gpu_gfx909 - AMD GCN GFX9 (Vega) architecture - * amd_gpu_gfx90a - AMD GCN GFX9 (Vega) architecture + * amd_gpu_gfx90a - AMD GCN GFX9 (CDNA2) architecture * amd_gpu_gfx90c - AMD GCN GFX9 (Vega) architecture - * amd_gpu_gfx940 - AMD GCN GFX9 (Vega) architecture - * amd_gpu_gfx941 - AMD GCN GFX9 (Vega) architecture - * amd_gpu_gfx942 - AMD GCN GFX9 (Vega) architecture + * amd_gpu_gfx940 - AMD GCN GFX9 (CDNA3) architecture + * amd_gpu_gfx941 - AMD GCN GFX9 (CDNA3) architecture + * amd_gpu_gfx942 - AMD GCN GFX9 (CDNA3) architecture * amd_gpu_gfx1010 - AMD GCN GFX10.1 (RDNA 1) architecture * amd_gpu_gfx1011 - AMD GCN GFX10.1 (RDNA 1) architecture * amd_gpu_gfx1012 - AMD GCN GFX10.1 (RDNA 1) architecture diff --git a/sycl/doc/design/DeviceIf.md b/sycl/doc/design/DeviceIf.md index 5d26bdab639d..606ad0d3be72 100644 --- a/sycl/doc/design/DeviceIf.md +++ b/sycl/doc/design/DeviceIf.md @@ -88,6 +88,9 @@ recognizes: * `amd_gpu_gfx700` * `amd_gpu_gfx701` * `amd_gpu_gfx702` +* `amd_gpu_gfx703` +* `amd_gpu_gfx704` +* `amd_gpu_gfx705` * `amd_gpu_gfx801` * `amd_gpu_gfx802` * `amd_gpu_gfx803` @@ -187,6 +190,9 @@ one of the following corresponding C++ macro names: * `__SYCL_TARGET_AMD_GPU_GFX700__` * `__SYCL_TARGET_AMD_GPU_GFX701__` * `__SYCL_TARGET_AMD_GPU_GFX702__` +* `__SYCL_TARGET_AMD_GPU_GFX703__` +* `__SYCL_TARGET_AMD_GPU_GFX704__` +* `__SYCL_TARGET_AMD_GPU_GFX705__` * `__SYCL_TARGET_AMD_GPU_GFX801__` * `__SYCL_TARGET_AMD_GPU_GFX802__` * `__SYCL_TARGET_AMD_GPU_GFX803__` diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc index f2ebcc594446..e86661312036 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc @@ -551,6 +551,9 @@ a| amd_gpu_gfx700 amd_gpu_gfx701 amd_gpu_gfx702 +amd_gpu_gfx703 +amd_gpu_gfx704 +amd_gpu_gfx705 ---- |- |AMD GCN 2.0 architecture. diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc index ad8d6a7f5019..933a6aabd2bd 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc @@ -716,6 +716,34 @@ optimize such partial barriers. _{endnote}_] |==== +==== Command Graph + +The functions in this section are only available if the +link:./sycl_ext_oneapi_graph.asciidoc[ + sycl_ext_oneapi_graph] extension is supported. + +|==== +a| +[frame=all,grid=none] +!==== +a! +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +void execute_graph(sycl::queue q, command_graph &g); + +void execute_graph(sycl::handler &h, command_graph &g); + +} +---- +!==== +_Constraints_: Device and context associated with queue need to be identical +to device and context provided at command graph creation. + +_Effects_: Submits an executable command graph to the `sycl::queue` or `sycl::handler`. + +|==== == Issues diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 56f09c04d305..5dff0396f07f 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -1975,7 +1975,9 @@ Removing this restriction is something we may look at for future revisions of The command submission functions defined in link:../experimental/sycl_ext_oneapi_enqueue_functions.asciidoc[sycl_ext_oneapi_enqueue_functions] -can be used to add nodes to a graph when creating a graph from queue recording. +can be used adding nodes to a graph when creating a graph from queue recording. +New methods are also defined that enable submitting an executable graph, +e.g. directly to a queue without returning an event. == Examples and Usage Guide diff --git a/sycl/include/sycl/ext/oneapi/experimental/architectures.def b/sycl/include/sycl/ext/oneapi/experimental/architectures.def index b8148f673814..08ce75a37011 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/architectures.def +++ b/sycl/include/sycl/ext/oneapi/experimental/architectures.def @@ -114,6 +114,9 @@ __SYCL_ARCHITECTURE(nvidia_gpu_sm_90a, 0x01000000000090a0) __SYCL_ARCHITECTURE(amd_gpu_gfx700, 0x0200000000070000) __SYCL_ARCHITECTURE(amd_gpu_gfx701, 0x0200000000070100) __SYCL_ARCHITECTURE(amd_gpu_gfx702, 0x0200000000070200) +__SYCL_ARCHITECTURE(amd_gpu_gfx703, 0x0200000000070300) +__SYCL_ARCHITECTURE(amd_gpu_gfx704, 0x0200000000070400) +__SYCL_ARCHITECTURE(amd_gpu_gfx705, 0x0200000000070500) __SYCL_ARCHITECTURE(amd_gpu_gfx801, 0x0200000000080100) __SYCL_ARCHITECTURE(amd_gpu_gfx802, 0x0200000000080200) __SYCL_ARCHITECTURE(amd_gpu_gfx803, 0x0200000000080300) diff --git a/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp b/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp index 9638ac91ac79..ab6c011413f1 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp @@ -203,6 +203,15 @@ static constexpr ext::oneapi::experimental::architecture #ifndef __SYCL_TARGET_AMD_GPU_GFX702__ #define __SYCL_TARGET_AMD_GPU_GFX702__ 0 #endif +#ifndef __SYCL_TARGET_AMD_GPU_GFX703__ +#define __SYCL_TARGET_AMD_GPU_GFX703__ 0 +#endif +#ifndef __SYCL_TARGET_AMD_GPU_GFX704__ +#define __SYCL_TARGET_AMD_GPU_GFX704__ 0 +#endif +#ifndef __SYCL_TARGET_AMD_GPU_GFX705__ +#define __SYCL_TARGET_AMD_GPU_GFX705__ 0 +#endif #ifndef __SYCL_TARGET_AMD_GPU_GFX801__ #define __SYCL_TARGET_AMD_GPU_GFX801__ 0 #endif @@ -360,6 +369,9 @@ static constexpr bool is_allowable_aot_mode = (__SYCL_TARGET_AMD_GPU_GFX700__ == 1) || (__SYCL_TARGET_AMD_GPU_GFX701__ == 1) || (__SYCL_TARGET_AMD_GPU_GFX702__ == 1) || + (__SYCL_TARGET_AMD_GPU_GFX703__ == 1) || + (__SYCL_TARGET_AMD_GPU_GFX704__ == 1) || + (__SYCL_TARGET_AMD_GPU_GFX705__ == 1) || (__SYCL_TARGET_AMD_GPU_GFX801__ == 1) || (__SYCL_TARGET_AMD_GPU_GFX802__ == 1) || (__SYCL_TARGET_AMD_GPU_GFX803__ == 1) || @@ -538,6 +550,15 @@ get_current_architecture_aot() { #if __SYCL_TARGET_AMD_GPU_GFX702__ return ext::oneapi::experimental::architecture::amd_gpu_gfx702; #endif +#if __SYCL_TARGET_AMD_GPU_GFX703__ + return ext::oneapi::experimental::architecture::amd_gpu_gfx703; +#endif +#if __SYCL_TARGET_AMD_GPU_GFX704__ + return ext::oneapi::experimental::architecture::amd_gpu_gfx704; +#endif +#if __SYCL_TARGET_AMD_GPU_GFX705__ + return ext::oneapi::experimental::architecture::amd_gpu_gfx705; +#endif #if __SYCL_TARGET_AMD_GPU_GFX801__ return ext::oneapi::experimental::architecture::amd_gpu_gfx801; #endif diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index b3c758aaa891..7ecf5ce4c8b1 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -383,6 +384,17 @@ inline void partial_barrier(queue Q, const std::vector &Events, submit(Q, [&](handler &CGH) { partial_barrier(CGH, Events); }, CodeLoc); } +inline void execute_graph(queue Q, command_graph &G, + const sycl::detail::code_location &CodeLoc = + sycl::detail::code_location::current()) { + Q.ext_oneapi_graph(G, CodeLoc); +} + +inline void execute_graph(handler &CGH, + command_graph &G) { + CGH.ext_oneapi_graph(G); +} + } // namespace ext::oneapi::experimental } // namespace _V1 } // namespace sycl diff --git a/sycl/include/syclcompat/launch_policy.hpp b/sycl/include/syclcompat/launch_policy.hpp index 1c5f6ed3e97d..f6a30fc46db1 100644 --- a/sycl/include/syclcompat/launch_policy.hpp +++ b/sycl/include/syclcompat/launch_policy.hpp @@ -192,6 +192,17 @@ launch_policy(dim3, dim3, Ts...) -> launch_policy< detail::has_type>::value>; namespace detail { +// Custom std::apply helpers to enable inlining +template +__syclcompat_inline__ constexpr void apply_expand(F f, Tuple t, + std::index_sequence) { + [[clang::always_inline]] f(get(t)...); +} + +template +__syclcompat_inline__ constexpr void apply_helper(F f, Tuple t) { + apply_expand(f, t, std::make_index_sequence{}>{}); +} template @@ -211,12 +222,16 @@ struct KernelFunctor { operator()(syclcompat::detail::range_to_item_t) const { if constexpr (HasLocalMem) { char *local_mem_ptr = static_cast( - _local_acc.template get_multi_ptr().get()); - std::apply( - [lmem_ptr = local_mem_ptr](auto &&...args) { F(args..., lmem_ptr); }, + _local_acc.template get_multi_ptr() + .get()); + apply_helper( + [lmem_ptr = local_mem_ptr](auto &&...args) { + [[clang::always_inline]] F(args..., lmem_ptr); + }, _argument_tuple); } else { - std::apply([](auto &&...args) { F(args...); }, _argument_tuple); + apply_helper([](auto &&...args) { [[clang::always_inline]] F(args...); }, + _argument_tuple); } } diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp index b0b8a93d6697..b7842a8fd99e 100644 --- a/sycl/include/syclcompat/math.hpp +++ b/sycl/include/syclcompat/math.hpp @@ -118,6 +118,37 @@ class vectorized_binary { } }; +// Vectorized_binary for logical operations +template +class vectorized_binary< + VecT, BinaryOperation, + std::enable_if_t()( + std::declval(), + std::declval()))>>> { +public: + inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op) { + unsigned result = 0; + constexpr size_t elem_size = 8 * sizeof(typename VecT::element_type); + static_assert(elem_size < 32, + "Vector element size must be less than 4 bytes"); + constexpr unsigned bool_mask = (1U << elem_size) - 1; + + for (size_t i = 0; i < a.size(); ++i) { + bool comp_result = binary_op(a[i], b[i]); + result |= (comp_result ? bool_mask : 0U) << (i * elem_size); + } + + VecT v4; + for (size_t i = 0; i < v4.size(); ++i) { + v4[i] = static_cast( + (result >> (i * elem_size)) & bool_mask); + } + + return v4; + } +}; + /// Extend the 'val' to 'bit' size, zero extend for unsigned int and signed /// extend for signed int. Returns a signed integer type. template @@ -1040,7 +1071,7 @@ struct average { } // namespace detail -/// Compute vectorized binary operation value for two values, with each value +/// Compute vectorized binary operation value for two/four values, with each /// treated as a vector type \p VecT. /// \tparam [in] VecT The type of the vector /// \tparam [in] BinaryOperation The binary operation class @@ -1052,14 +1083,19 @@ struct average { template inline unsigned vectorized_binary(unsigned a, unsigned b, const BinaryOperation binary_op, - bool need_relu = false) { + [[maybe_unused]] bool need_relu = false) { sycl::vec v0{a}, v1{b}; auto v2 = v0.as(); auto v3 = v1.as(); auto v4 = detail::vectorized_binary()(v2, v3, binary_op); - if (need_relu) - v4 = relu(v4); + if constexpr (!std::is_same_v< + bool, decltype(std::declval()( + std::declval(), + std::declval()))>) { + if (need_relu) + v4 = relu(v4); + } v0 = v4.template as>(); return v0; } diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index 6eedec5b4f40..38e5988a3745 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -626,6 +626,9 @@ constexpr std::pair NvidiaAmdGPUArchitectures[] = {"9.0", oneapi_exp_arch::nvidia_gpu_sm_90}, {"gfx701", oneapi_exp_arch::amd_gpu_gfx701}, {"gfx702", oneapi_exp_arch::amd_gpu_gfx702}, + {"gfx703", oneapi_exp_arch::amd_gpu_gfx703}, + {"gfx704", oneapi_exp_arch::amd_gpu_gfx704}, + {"gfx705", oneapi_exp_arch::amd_gpu_gfx705}, {"gfx801", oneapi_exp_arch::amd_gpu_gfx801}, {"gfx802", oneapi_exp_arch::amd_gpu_gfx802}, {"gfx803", oneapi_exp_arch::amd_gpu_gfx803}, @@ -838,8 +841,8 @@ struct get_device_info_impl< }; else if ((architecture::intel_gpu_pvc == DeviceArch) || (architecture::intel_gpu_bmg_g21 == DeviceArch) || - (architecture::intel_gpu_lnl_m == DeviceArch)) - return { + (architecture::intel_gpu_lnl_m == DeviceArch)) { + std::vector pvc_combs = { {8, 0, 0, 0, 16, 32, matrix_type::uint8, matrix_type::uint8, matrix_type::sint32, matrix_type::sint32}, {8, 0, 0, 0, 16, 32, matrix_type::uint8, matrix_type::sint8, @@ -947,10 +950,11 @@ struct get_device_info_impl< {8, 0, 0, 0, 16, 8, matrix_type::tf32, matrix_type::tf32, matrix_type::fp32, matrix_type::fp32}, }; - else if ((architecture::intel_gpu_dg2_g10 == DeviceArch) || - (architecture::intel_gpu_dg2_g11 == DeviceArch) || - (architecture::intel_gpu_dg2_g12 == DeviceArch) || - (architecture::intel_gpu_arl_h == DeviceArch)) + return pvc_combs; + } else if ((architecture::intel_gpu_dg2_g10 == DeviceArch) || + (architecture::intel_gpu_dg2_g11 == DeviceArch) || + (architecture::intel_gpu_dg2_g12 == DeviceArch) || + (architecture::intel_gpu_arl_h == DeviceArch)) return { {8, 0, 0, 0, 8, 32, matrix_type::uint8, matrix_type::uint8, matrix_type::sint32, matrix_type::sint32}, diff --git a/sycl/test-e2e/Compression/compression_separate_compile.cpp b/sycl/test-e2e/Compression/compression_separate_compile.cpp index 9e47bbebdc87..dab17e3506b4 100644 --- a/sycl/test-e2e/Compression/compression_separate_compile.cpp +++ b/sycl/test-e2e/Compression/compression_separate_compile.cpp @@ -10,6 +10,13 @@ ////////////////////// Link device images // RUN: %clangxx --offload-compress -fsycl -fsycl-link -fsycl-targets=spir64_x86_64 -fPIC %t_kernel1_aot.o %t_kernel2_aot.o -o %t_compressed_image.o -v +// Make sure the clang-offload-wrapper is called with the --offload-compress +// option. +// RUN: %clangxx --offload-compress -fsycl -fsycl-link -fsycl-targets=spir64_x86_64 -fPIC %t_kernel1_aot.o %t_kernel2_aot.o -o %t_compressed_image.o -### &> %t_driver_opts.txt +// RUN: FileCheck -input-file=%t_driver_opts.txt %s --check-prefix=CHECK-DRIVER-OPTS + +// CHECK-DRIVER-OPTS: clang-offload-wrapper{{.*}} "-offload-compress" + ////////////////////// Compile the host program // RUN: %clangxx -fsycl -std=c++17 -Wno-attributes -Wno-deprecated-declarations -fPIC -c %s -o %t_main.o diff --git a/sycl/test-e2e/Compression/no_zstd_warning.cpp b/sycl/test-e2e/Compression/no_zstd_warning.cpp index 8a4460f9b864..c87f2fe48009 100644 --- a/sycl/test-e2e/Compression/no_zstd_warning.cpp +++ b/sycl/test-e2e/Compression/no_zstd_warning.cpp @@ -1,4 +1,4 @@ // using --offload-compress without zstd should throw an error. // REQUIRES: !zstd -// RUN: not %{build} -O0 -g --offload-compress %S/Inputs/single_kernel.cpp -o %t_compress.out 2>&1 | FileCheck %s +// RUN: not %{build} %O0 -g --offload-compress %S/Inputs/single_kernel.cpp -o %t_compress.out 2>&1 | FileCheck %s // CHECK: '--offload-compress' option is specified but zstd is not available. The device image will not be compressed. diff --git a/sycl/test-e2e/Graph/Explicit/local_accessor.cpp b/sycl/test-e2e/Graph/Explicit/local_accessor.cpp new file mode 100644 index 000000000000..fbeb2c6a5ef5 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/local_accessor.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/local_accessor.cpp" diff --git a/sycl/test-e2e/Graph/Inputs/local_accessor.cpp b/sycl/test-e2e/Graph/Inputs/local_accessor.cpp new file mode 100644 index 000000000000..b3ac9fde67b6 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/local_accessor.cpp @@ -0,0 +1,54 @@ +// Tests basic adding of nodes with local accessors, +// and submission of the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + using T = int; + + const size_t LocalSize = 128; + + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 10); + + std::vector ReferenceA(DataA); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.wait_and_throw(); + + auto node = add_node(Graph, Queue, [&](handler &CGH) { + local_accessor LocalMem(LocalSize, CGH); + + CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) { + LocalMem[Item.get_local_linear_id()] = Item.get_global_linear_id() * 2; + PtrA[Item.get_global_linear_id()] += LocalMem[Item.get_local_linear_id()]; + }); + }); + + auto GraphExec = Graph.finalize(); + + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + + for (size_t i = 0; i < Size; i++) { + T Ref = 10 + i + (i * 2); + check_value(i, Ref, ReferenceA[i], "PtrA"); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp b/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp new file mode 100644 index 000000000000..100792a2e476 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp @@ -0,0 +1,93 @@ +// Tests whole graph update of nodes with local accessors, +// and submission of the graph. + +#include "../graph_common.hpp" + +using T = int; + +auto add_graph_node( + exp_ext::command_graph &Graph, + queue &Queue, size_t Size, size_t LocalSize, T *Ptr) { + return add_node(Graph, Queue, [&](handler &CGH) { + local_accessor LocalMem(LocalSize, CGH); + + CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) { + LocalMem[Item.get_local_linear_id()] = Item.get_global_linear_id() * 2; + Ptr[Item.get_global_linear_id()] += + LocalMem[Item.get_local_linear_id()] + Item.get_local_range(0); + }); + }); +} +int main() { + queue Queue{}; + + const size_t LocalSize = 128; + + std::vector DataA(Size), DataB(Size); + + std::iota(DataA.begin(), DataA.end(), 10); + std::iota(DataB.begin(), DataB.end(), 10); + + std::vector ReferenceA(DataA), ReferenceB(DataB); + + exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + T *PtrB = malloc_device(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.wait_and_throw(); + + auto NodeA = add_graph_node(GraphA, Queue, Size, LocalSize / 2, PtrA); + + auto GraphExecA = GraphA.finalize(exp_ext::property::graph::updatable{}); + + // Create second graph for whole graph update with a different local size + exp_ext::command_graph GraphB{Queue.get_context(), Queue.get_device()}; + auto NodeB = add_graph_node(GraphB, Queue, Size, LocalSize, PtrB); + + // Execute graphs before updating and check outputs + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExecA); }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.wait_and_throw(); + + for (size_t i = 0; i < Size; i++) { + T RefA = 10 + i + (i * 2) + LocalSize / 2; + T RefB = 10 + i; + check_value(i, RefA, ReferenceA[i], "PtrA"); + check_value(i, RefB, ReferenceB[i], "PtrB"); + } + + // Update GraphExecA using whole graph update + + GraphExecA.update(GraphB); + + // Execute graphs again and check outputs + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExecA); }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.wait_and_throw(); + + for (size_t i = 0; i < Size; i++) { + T RefA = 10 + i + (i * 2) + LocalSize / 2; + T RefB = 10 + i + (i * 2) + LocalSize; + check_value(i, RefA, ReferenceA[i], "PtrA"); + check_value(i, RefB, ReferenceB[i], "PtrB"); + } + + free(PtrA, Queue); + free(PtrB, Queue); + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions.cpp index ef3b440790c6..6c6fe20337da 100644 --- a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions.cpp @@ -43,7 +43,7 @@ int main() { auto GraphExec = Graph.finalize(); - InOrderQueue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + exp_ext::execute_graph(InOrderQueue, GraphExec); InOrderQueue.wait_and_throw(); free(PtrA, InOrderQueue); diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_submit.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_submit.cpp index 8eccce2ea8ef..623d6fc81787 100644 --- a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_submit.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_submit.cpp @@ -60,7 +60,7 @@ int main() { auto GraphExec = Graph.finalize(); - Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + exp_ext::execute_graph(Queue, GraphExec); Queue.wait_and_throw(); } diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_submit_with_event.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_submit_with_event.cpp index f66731d745bd..4b8294be7e98 100644 --- a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_submit_with_event.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_submit_with_event.cpp @@ -52,8 +52,9 @@ int main() { auto GraphExec = Graph.finalize(); - Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); - Queue.wait_and_throw(); + exp_ext::submit_with_event(Queue, [&](handler &CGH) { + exp_ext::execute_graph(CGH, GraphExec); + }).wait(); free(PtrA, Queue); free(PtrB, Queue); diff --git a/sycl/test-e2e/Graph/RecordReplay/local_accessor.cpp b/sycl/test-e2e/Graph/RecordReplay/local_accessor.cpp new file mode 100644 index 000000000000..245983f67da4 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/local_accessor.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/local_accessor.cpp" diff --git a/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc.cpp b/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc.cpp new file mode 100644 index 000000000000..1db9905457ae --- /dev/null +++ b/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../../Inputs/whole_update_local_acc.cpp" diff --git a/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc.cpp b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc.cpp new file mode 100644 index 000000000000..03645b2f19bf --- /dev/null +++ b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../../Inputs/whole_update_local_acc.cpp" diff --git a/sycl/test-e2e/Matrix/common.hpp b/sycl/test-e2e/Matrix/common.hpp index 8d2e46011635..90f5508d97cf 100644 --- a/sycl/test-e2e/Matrix/common.hpp +++ b/sycl/test-e2e/Matrix/common.hpp @@ -63,8 +63,7 @@ void matrix_multiply_ref(Ta *A, Tb *B, Tc *C, int M, int N, int K, if constexpr (std::is_same_v && std::is_same_v) acc += make_fp32(va[i]) * make_fp32(vb[i]); - else if constexpr (std::is_same_v && - std::is_same_v) + else if constexpr (std::is_same_v) acc += (float)va[i] * (float)vb[i]; else if constexpr (std::is_same_v && std::is_same_v || @@ -135,7 +134,8 @@ void matrix_rand(unsigned int rows, unsigned int cols, T *src, T val) { for (unsigned int i = 0; i < rows; i++) { for (unsigned int j = 0; j < cols; j++) { - if constexpr (std::is_same_v || std::is_same_v || + if constexpr (std::is_same_v || + std::is_same_v || std::is_same_v || std::is_same_v) { src[i * cols + j] = T(fdistr(dev)); } else if constexpr (std::is_integral_v) { diff --git a/sycl/test-e2e/Matrix/slm_utils.hpp b/sycl/test-e2e/Matrix/slm_utils.hpp index 6618f704754d..1c481336c01f 100644 --- a/sycl/test-e2e/Matrix/slm_utils.hpp +++ b/sycl/test-e2e/Matrix/slm_utils.hpp @@ -49,13 +49,13 @@ slm_read_write(multi_ptr pA, // NCache2*KCache2/(SGs*SG_SIZE) = 16 size_t elemsPerLoadB = NCache2 * KCache2 / (SGs * sgSize); size_t sgsPerRow = (NCache2 * vnniFactor) / (elemsPerLoadB * sgSize); - size_t GlOffsetB = - (k2 * (KCache2 / vnniFactor) + (uint)(sg.get_group_id() / sgsPerRow)) * - (colsB * vnniFactor) + - n2 * NCache2 * vnniFactor + - (sg.get_group_id() % sgsPerRow) * (elemsPerLoadB * sgSize); + size_t GlOffsetB = (k2 * (KCache2 / vnniFactor) + + (unsigned int)(sg.get_group_id() / sgsPerRow)) * + (colsB * vnniFactor) + + n2 * NCache2 * vnniFactor + + (sg.get_group_id() % sgsPerRow) * (elemsPerLoadB * sgSize); size_t LocOffsetB = - ((uint)(sg.get_group_id() / sgsPerRow)) * NCache2 * vnniFactor + + ((unsigned int)(sg.get_group_id() / sgsPerRow)) * NCache2 * vnniFactor + (sg.get_group_id() % sgsPerRow) * elemsPerLoadB * sgSize; auto SrcB = pB + GlOffsetB; auto DstB = diff --git a/sycl/test-e2e/syclcompat/math/math_vectorized.cpp b/sycl/test-e2e/syclcompat/math/math_vectorized.cpp index 9c57c88ce445..630d4b9c9f15 100644 --- a/sycl/test-e2e/syclcompat/math/math_vectorized.cpp +++ b/sycl/test-e2e/syclcompat/math/math_vectorized.cpp @@ -48,6 +48,18 @@ void test_vectorized_binary(unsigned op1, unsigned op2, unsigned expected, op1, op2, expected, need_relu); } +template +void test_vectorized_binary_logical(unsigned op1, unsigned op2, + unsigned expected) { + std::cout << __PRETTY_FUNCTION__ << std::endl; + constexpr syclcompat::dim3 grid{1}; + constexpr syclcompat::dim3 threads{1}; + + BinaryOpTestLauncher(grid, threads) + .template launch_test>( + op1, op2, expected, false); +} + template void vectorized_unary_kernel(unsigned *a, unsigned *r) { *r = syclcompat::vectorized_unary(*a, UnaryOp()); @@ -203,5 +215,67 @@ int main() { test_vectorized_binary_with_pred( 0x80010002, 0x00040002, 0x00040002, false, true); + // Logical Binary Operators v2 + test_vectorized_binary_logical, sycl::short2>( + 0xFFF00002, 0xFFF00001, 0xFFFF0000); + test_vectorized_binary_logical, sycl::short2>( + 0x0001F00F, 0x0003F00F, 0x0000FFFF); + + test_vectorized_binary_logical, sycl::short2>( + 0xFFF00002, 0xFFF00001, 0x0000FFFF); + test_vectorized_binary_logical, sycl::short2>( + 0x0001F00F, 0x0003F00F, 0xFFFF0000); + + test_vectorized_binary_logical, sycl::short2>( + 0xFFF00002, 0xFFF00001, 0xFFFFFFFF); + test_vectorized_binary_logical, sycl::short2>( + 0x0001F00F, 0x0003F001, 0x0000FFFF); + + test_vectorized_binary_logical, sycl::short2>( + 0xFFF00002, 0xFFF00001, 0x0000FFFF); + test_vectorized_binary_logical, sycl::short2>( + 0x0003F00F, 0x0001F00F, 0xFFFF0000); + + test_vectorized_binary_logical, sycl::short2>( + 0xFFF00001, 0xF0F00002, 0x0000FFFF); + test_vectorized_binary_logical, sycl::short2>( + 0x0001FF0F, 0x0003F00F, 0xFFFF0000); + + test_vectorized_binary_logical, sycl::short2>( + 0xFFF00001, 0xFFF00002, 0x0000FFFF); + test_vectorized_binary_logical, sycl::short2>( + 0x0001F00F, 0x0003F00F, 0xFFFF0000); + + // Logical Binary Operators v4 + test_vectorized_binary_logical, sycl::uchar4>( + 0x0001F00F, 0x0003F00F, 0xFF00FFFF); + test_vectorized_binary_logical, sycl::uchar4>( + 0x0102F0F0, 0x0202F0FF, 0x00FFFF00); + + test_vectorized_binary_logical, sycl::uchar4>( + 0x0001F00F, 0xFF01F10F, 0xFF00FF00); + test_vectorized_binary_logical, sycl::uchar4>( + 0x0201F0F0, 0x0202F0FF, 0x00FF00FF); + + test_vectorized_binary_logical, sycl::uchar4>( + 0xFFF00002, 0xFFF10101, 0xFF0000FF); + test_vectorized_binary_logical, sycl::uchar4>( + 0x0001F1F0, 0x0103F001, 0x0000FFFF); + + test_vectorized_binary_logical, sycl::uchar4>( + 0xFFF00002, 0xF0F00001, 0xFF0000FF); + test_vectorized_binary_logical, sycl::uchar4>( + 0x0103F0F1, 0x0102F0F0, 0x00FF00FF); + + test_vectorized_binary_logical, sycl::uchar4>( + 0xFFF10001, 0xFFF00100, 0xFF00FF00); + test_vectorized_binary_logical, sycl::uchar4>( + 0x0101F1F0, 0x0003F0F1, 0x00FF00FF); + + test_vectorized_binary_logical, sycl::uchar4>( + 0xFFF10001, 0xFFF20100, 0x00FFFF00); + test_vectorized_binary_logical, sycl::uchar4>( + 0x0101F1F0, 0x0102F1F1, 0x00FF00FF); + return 0; } diff --git a/sycl/test/syclcompat/launch/kernel_properties.cpp b/sycl/test/syclcompat/launch/kernel_properties.cpp index f17571fae0c2..6beefce73d14 100644 --- a/sycl/test/syclcompat/launch/kernel_properties.cpp +++ b/sycl/test/syclcompat/launch/kernel_properties.cpp @@ -23,7 +23,7 @@ // We need hardware which can support at least 2 sub-group sizes, since that // hardware (presumably) supports the `intel_reqd_sub_group_size` attribute. // REQUIRES: sg-32 && sg-16 -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} -o - | FileCheck %s +// RUN: %clangxx -fsycl -fsycl-device-only -Xclang -fsycl-is-device %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} %s -o - | FileCheck %s #include #include #include diff --git a/sycl/test/syclcompat/launch/launch_inlining.cpp b/sycl/test/syclcompat/launch/launch_inlining.cpp new file mode 100644 index 000000000000..a224837139a5 --- /dev/null +++ b/sycl/test/syclcompat/launch/launch_inlining.cpp @@ -0,0 +1,97 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCLcompat API + * + * launch_inlining.cpp + * + * Description: + * Ensure kernels are inlined + **************************************************************************/ +// RUN: %clangxx -fsycl -fgpu-inline-threshold=0 %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} %s -o - | FileCheck %s +// We set -fgpu-inline-threshold=0 to disable heuristic inlining for the +// purposes of the test +#include +#include +#include +#include + +namespace compat_exp = syclcompat::experimental; +namespace sycl_exp = sycl::ext::oneapi::experimental; +namespace sycl_intel_exp = sycl::ext::intel::experimental; + +static constexpr int LOCAL_MEM_SIZE = 1024; + +// CHECK: define {{.*}}spir_kernel{{.*}}write_mem_kernel{{.*}} { +// CHECK-NOT: call {{.*}}write_mem_kernel +// CHECK: } + +template void write_mem_kernel(T *data, int num_elements) { + const int id = + sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_global_id(0); + if (id < num_elements) { + data[id] = static_cast(id); + } +}; + +// CHECK: define {{.*}}spir_kernel{{.*}}dynamic_local_mem_typed_kernel{{.*}} { +// CHECK-NOT: call {{.*}}dynamic_local_mem_typed_kernel +// CHECK: } +template +void dynamic_local_mem_typed_kernel(T *data, char *local_mem) { + constexpr size_t num_elements = LOCAL_MEM_SIZE / sizeof(T); + T *typed_local_mem = reinterpret_cast(local_mem); + + const int id = + sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_global_id(0); + if (id < num_elements) { + typed_local_mem[id] = static_cast(id); + } + sycl::group_barrier(sycl::ext::oneapi::this_work_item::get_work_group<1>()); + if (id < num_elements) { + data[id] = typed_local_mem[num_elements - id - 1]; + } +}; + +int test_write_mem() { + compat_exp::launch_policy my_dim3_config(syclcompat::dim3{32}, + syclcompat::dim3{32}); + + const int memsize = 1024; + int *d_a = (int *)syclcompat::malloc(memsize); + compat_exp::launch>(my_dim3_config, d_a, + memsize / sizeof(int)) + .wait(); + + syclcompat::free(d_a); + return 0; +} + +int test_lmem_launch() { + int local_mem_size = LOCAL_MEM_SIZE; + + size_t num_elements = local_mem_size / sizeof(int); + int *d_a = (int *)syclcompat::malloc(local_mem_size); + + compat_exp::launch_policy my_config( + sycl::nd_range<1>{{256}, {256}}, + compat_exp::local_mem_size(local_mem_size)); + + compat_exp::launch>(my_config, d_a) + .wait(); + + syclcompat::free(d_a); + + return 0; +} diff --git a/sycl/test/syclcompat/launch/launch_policy_lmem_neg.cpp b/sycl/test/syclcompat/launch/launch_policy_lmem_neg.cpp index 98222d6cc374..5c2750e86b70 100644 --- a/sycl/test/syclcompat/launch/launch_policy_lmem_neg.cpp +++ b/sycl/test/syclcompat/launch/launch_policy_lmem_neg.cpp @@ -22,7 +22,7 @@ * templates as tests in launch_policy_neg.cpp **************************************************************************/ -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out 2>&1 | FileCheck -vv %s +// RUN: not %clangxx -fsycl -fsyntax-only %s 2>&1 | FileCheck -vv %s #include #include diff --git a/sycl/test/syclcompat/launch/launch_policy_neg.cpp b/sycl/test/syclcompat/launch/launch_policy_neg.cpp index 558864084ff6..cee796471f23 100644 --- a/sycl/test/syclcompat/launch/launch_policy_neg.cpp +++ b/sycl/test/syclcompat/launch/launch_policy_neg.cpp @@ -20,18 +20,18 @@ * Negative tests for new launch_policy. **************************************************************************/ -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK1 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK1 -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK2 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK2 -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK3 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK3 -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK4 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK4 -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK5 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK5 -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK6 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK6 -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK7 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK7 -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK8 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK8 -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK9 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK9 -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK10 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK10 -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK11 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK11 -// RUN: not %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DCHECK12 -o %t.out 2>&1 | FileCheck -vv %s --check-prefixes=CHECK12 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK1 2>&1 | FileCheck -vv %s --check-prefixes=CHECK1 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK2 2>&1 | FileCheck -vv %s --check-prefixes=CHECK2 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK3 2>&1 | FileCheck -vv %s --check-prefixes=CHECK3 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK4 2>&1 | FileCheck -vv %s --check-prefixes=CHECK4 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK5 2>&1 | FileCheck -vv %s --check-prefixes=CHECK5 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK6 2>&1 | FileCheck -vv %s --check-prefixes=CHECK6 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK7 2>&1 | FileCheck -vv %s --check-prefixes=CHECK7 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK8 2>&1 | FileCheck -vv %s --check-prefixes=CHECK8 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK9 2>&1 | FileCheck -vv %s --check-prefixes=CHECK9 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK10 2>&1 | FileCheck -vv %s --check-prefixes=CHECK10 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK11 2>&1 | FileCheck -vv %s --check-prefixes=CHECK11 +// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK12 2>&1 | FileCheck -vv %s --check-prefixes=CHECK12 #include #include