diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 51f7f640108cf..089579b1ebd01 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1042,6 +1042,8 @@ void EmitAssemblyHelper::RunOptimizationPipeline( /*FP64ConvEmu=*/CodeGenOpts.FP64ConvEmu, /*ExcludeAspects=*/{"fp64"})); MPM.addPass(SYCLPropagateJointMatrixUsagePass()); + // Lowers static/dynamic local memory builtin calls. + MPM.addPass(SYCLLowerWGLocalMemoryPass()); }); else if (LangOpts.SYCLIsHost && !LangOpts.SYCLESIMDBuildHostCode) PB.registerPipelineStartEPCallback( @@ -1191,10 +1193,6 @@ void EmitAssemblyHelper::RunOptimizationPipeline( MPM.addPass(SPIRITTAnnotationsPass()); } - // Allocate static local memory in SYCL kernel scope for each allocation - // call. - MPM.addPass(SYCLLowerWGLocalMemoryPass()); - // Process properties and annotations MPM.addPass(CompileTimePropertiesPass()); diff --git a/clang/test/CodeGenSYCL/group-local-memory.cpp b/clang/test/CodeGenSYCL/group-local-memory.cpp deleted file mode 100644 index 02610e33760ab..0000000000000 --- a/clang/test/CodeGenSYCL/group-local-memory.cpp +++ /dev/null @@ -1,29 +0,0 @@ -// Check that SYCLLowerWGLocalMemory pass is added to the SYCL device -// compilation pipeline with the inliner pass (new Pass Manager). - -// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -emit-llvm -O2 \ -// RUN: -mdebug-pass Structure %s -o /dev/null 2>&1 \ -// RUN: | FileCheck %s -check-prefixes=CHECK-INL,CHECK - -// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -emit-llvm -O0 \ -// RUN: -mdebug-pass Structure %s -o /dev/null 2>&1 \ -// RUN: | FileCheck %s --check-prefixes=CHECK-ALWINL,CHECK - -// Check that AlwaysInliner pass is always run for compilation of SYCL device -// target code, even if all optimizations are disabled. - -// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -emit-llvm -fno-sycl-early-optimizations \ -// RUN: -mdebug-pass Structure %s -o /dev/null 2>&1 \ -// RUN: | FileCheck %s --check-prefixes=CHECK-ALWINL,CHECK - -// CHECK-INL: Running pass: ModuleInlinerWrapperPass on [module] -// CHECK-ALWINL: Running pass: AlwaysInlinerPass on [module] -// CHECK: Running pass: SYCLLowerWGLocalMemoryPass on [module] - -// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -emit-llvm -disable-llvm-passes \ -// RUN: -mdebug-pass Structure %s -o /dev/null 2>&1 \ -// RUN: | FileCheck %s --check-prefixes=CHECK-NO-PASSES-ALWINL,CHECK-NO-PASSES,CHECK-NO-PASSES-INL - -// CHECK-NO-PASSES-INL-NOT: Running pass: ModuleInlinerWrapperPass on [module] -// CHECK-NO-PASSES-ALWINL-NOT: Running pass: AlwaysInlinerPass on [module] -// CHECK-NO-PASSES-NOT: Running pass: SYCLLowerWGLocalMemoryPass on [module] diff --git a/clang/test/CodeGenSYCL/kernel-early-optimization-pipeline.cpp b/clang/test/CodeGenSYCL/kernel-early-optimization-pipeline.cpp index 17527b58e5a8e..c75e48b9727a9 100644 --- a/clang/test/CodeGenSYCL/kernel-early-optimization-pipeline.cpp +++ b/clang/test/CodeGenSYCL/kernel-early-optimization-pipeline.cpp @@ -2,15 +2,24 @@ // SYCL device target, and can be disabled with -fno-sycl-early-optimizations. // New pass manager doesn't print all passes tree, only module level. // -// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-NEWPM-EARLYOPT -// CHECK-NEWPM-EARLYOPT: ConstantMergePass -// CHECK-NEWPM-EARLYOPT: SYCLMutatePrintfAddrspacePass +// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -o /dev/null 2>&1 | FileCheck %s +// CHECK: SYCLVirtualFunctionsAnalysisPass +// CHECK: ESIMDVerifierPass +// CHECK: SYCLConditionalCallOnDevicePass +// CHECK: SYCLPropagateAspectsUsagePass +// CHECK: SYCLPropagateJointMatrixUsagePass +// CHECK: SYCLLowerWGLocalMemoryPass +// CHECK: InferFunctionAttrsPass +// CHECK: AlwaysInlinerPass +// CHECK: ModuleInlinerWrapperPass +// CHECK: ConstantMergePass +// CHECK: SYCLMutatePrintfAddrspacePass +// CHECK: SYCLPropagateAspectsUsagePass +// CHECK: SYCLAddOptLevelAttributePass +// CHECK: CompileTimePropertiesPass +// CHECK: RecordSYCLAspectNamesPass +// CHECK: CleanupSYCLMetadataPass // -// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -fno-sycl-early-optimizations -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-NEWPM-NOEARLYOPT -// CHECK-NEWPM-NOEARLYOPT-NOT: ConstantMergePass -// CHECK-NEWPM-NOEARLYOPT: SYCLMutatePrintfAddrspacePass - -// Checks that the compile time properties pass is added into the compilation pipeline -// -// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-COMPTIMEPROPS -// CHECK-COMPTIMEPROPS: Running pass: CompileTimePropertiesPass on [module] +// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -fno-sycl-early-optimizations -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-NOEARLYOPT +// CHECK-NOEARLYOPT-NOT: ConstantMergePass1 +// CHECK-NOEARLYOPT: SYCLMutatePrintfAddrspacePass diff --git a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp index 6c46b5c75d5d7..f366e0e61668b 100644 --- a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp +++ b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp @@ -9,15 +9,17 @@ //===----------------------------------------------------------------------===// #include "llvm/SYCLLowerIR/LowerWGLocalMemory.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/Pass.h" #include "llvm/TargetParser/Triple.h" +#include "llvm/Transforms/Utils/Cloning.h" using namespace llvm; -#define DEBUG_TYPE "LowerWGLocalMemory" +#define DEBUG_TYPE "sycllowerwglocalmemory" static constexpr char SYCL_ALLOCLOCALMEM_CALL[] = "__sycl_allocateLocalMemory"; static constexpr char SYCL_DYNAMIC_LOCALMEM_CALL[] = @@ -84,6 +86,42 @@ ModulePass *llvm::createSYCLLowerWGLocalMemoryLegacyPass() { return new SYCLLowerWGLocalMemoryLegacy(); } +// In sycl header __sycl_allocateLocalMemory builtin call is wrapped in +// group_local_memory/group_local_memory_for_overwrite functions, which must be +// inlined first before each __sycl_allocateLocalMemory call can be lowered to a +// distinct global variable. Inlining them here so that this pass doesn't have +// implicit dependency on AlwaysInlinerPass. +// +// syclcompat::local_mem, which represents a distinct allocation, calls +// group_local_memory_for_overwrite. So local_mem should be inlined as well. +static bool inlineGroupLocalMemoryFunc(Module &M) { + Function *ALMFunc = M.getFunction(SYCL_ALLOCLOCALMEM_CALL); + if (!ALMFunc || ALMFunc->use_empty()) + return false; + + SmallVector WorkList{ALMFunc}; + DenseSet Visited; + while (!WorkList.empty()) { + auto *F = WorkList.pop_back_val(); + for (auto *U : make_early_inc_range(F->users())) { + auto *CI = cast(U); + auto *Caller = CI->getFunction(); + if (Caller->hasFnAttribute("sycl-forceinline") && + Visited.insert(Caller).second) + WorkList.push_back(Caller); + if (F != ALMFunc) { + InlineFunctionInfo IFI; + [[maybe_unused]] auto Result = InlineFunction(*CI, IFI); + assert(Result.isSuccess() && "inlining failed"); + } + } + if (F != ALMFunc) + F->eraseFromParent(); + } + + return !Visited.empty(); +} + // TODO: It should be checked that __sycl_allocateLocalMemory (or its source // form - group_local_memory) does not occur: // - in a function (other than user lambda/functor) @@ -317,9 +355,8 @@ static bool dynamicWGLocalMemory(Module &M) { PreservedAnalyses SYCLLowerWGLocalMemoryPass::run(Module &M, ModuleAnalysisManager &) { - bool MadeChanges = allocaWGLocalMemory(M); - MadeChanges = dynamicWGLocalMemory(M) || MadeChanges; - if (MadeChanges) - return PreservedAnalyses::none(); - return PreservedAnalyses::all(); + bool Changed = inlineGroupLocalMemoryFunc(M); + Changed |= allocaWGLocalMemory(M); + Changed |= dynamicWGLocalMemory(M); + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); } diff --git a/llvm/test/SYCLLowerIR/group_local_memory_inline.ll b/llvm/test/SYCLLowerIR/group_local_memory_inline.ll new file mode 100644 index 0000000000000..d5ba4d0d774a4 --- /dev/null +++ b/llvm/test/SYCLLowerIR/group_local_memory_inline.ll @@ -0,0 +1,66 @@ +; RUN: opt < %s -passes=sycllowerwglocalmemory -S | FileCheck %s + +; Check group_local_memory_for_overwrite and group_local_memory functions are inlined. +; Check __sycl_allocateLocalMemory calls are lowered to four separate allocations. + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" +target triple = "spir64-unknown-unknown" + +%"class.sycl::_V1::multi_ptr" = type { ptr addrspace(3) } +%"class.sycl::_V1::group" = type { %"class.sycl::_V1::range", %"class.sycl::_V1::range", %"class.sycl::_V1::range", %"class.sycl::_V1::id" } +%"class.sycl::_V1::range" = type { %"class.sycl::_V1::detail::array" } +%"class.sycl::_V1::detail::array" = type { [1 x i64] } +%"class.sycl::_V1::id" = type { %"class.sycl::_V1::detail::array" } + +; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 +; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 +; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 +; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 + +; Function Attrs: alwaysinline +define internal spir_func void @_ZZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_ENKUlNS0_7nd_itemILi1EEEE_clES5_() #0 { +entry: +; CHECK: define internal spir_func void @_ZZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_ENKUlNS0_7nd_itemILi1EEEE_clES5_( +; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8 +; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8 +; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8 +; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8 + + %Ptr = alloca %"class.sycl::_V1::multi_ptr", align 8 + %agg = alloca %"class.sycl::_V1::group", align 8 + %Ptr.ascast = addrspacecast ptr %Ptr to ptr addrspace(4) + call spir_func void @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg) + call spir_func void @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg) + call spir_func void @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg) + call spir_func void @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg) + ret void +} + +; CHECK-NOT: define {{.*}} @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_( + +; Function Attrs: alwaysinline +define spir_func void @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %agg.result, ptr byval(%"class.sycl::_V1::group") align 8 %g) #1 { +entry: + %AllocatedMem = alloca ptr addrspace(3), align 8 + %AllocatedMem.ascast = addrspacecast ptr %AllocatedMem to ptr addrspace(4) + %call = call spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 4, i64 4) + store ptr addrspace(3) %call, ptr addrspace(4) %AllocatedMem.ascast, align 8 + ret void +} + +; CHECK-NOT: define {{.*}} @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_( + +; Function Attrs: alwaysinline +define spir_func void @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %agg.result, ptr byval(%"class.sycl::_V1::group") align 8 %g) #1 { +entry: + %AllocatedMem = alloca ptr addrspace(3), align 8 + %AllocatedMem.ascast = addrspacecast ptr %AllocatedMem to ptr addrspace(4) + %call = call spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 4, i64 4) + store ptr addrspace(3) %call, ptr addrspace(4) %AllocatedMem.ascast, align 8 + ret void +} + +declare spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 noundef, i64 noundef) + +attributes #0 = { alwaysinline } +attributes #1 = { "sycl-forceinline"="true" } diff --git a/sycl/include/sycl/ext/oneapi/group_local_memory.hpp b/sycl/include/sycl/ext/oneapi/group_local_memory.hpp index 6e65b9acffe8e..8b0b39c20fd39 100644 --- a/sycl/include/sycl/ext/oneapi/group_local_memory.hpp +++ b/sycl/include/sycl/ext/oneapi/group_local_memory.hpp @@ -21,6 +21,9 @@ namespace sycl { inline namespace _V1 { namespace ext::oneapi { template +#ifdef __SYCL_DEVICE_ONLY__ +[[__sycl_detail__::add_ir_attributes_function("sycl-forceinline", true)]] +#endif std::enable_if_t< std::is_trivially_destructible_v && sycl::detail::is_group::value, multi_ptr> @@ -44,6 +47,9 @@ std::enable_if_t< } template +#ifdef __SYCL_DEVICE_ONLY__ +[[__sycl_detail__::add_ir_attributes_function("sycl-forceinline", true)]] +#endif std::enable_if_t< std::is_trivially_destructible_v && sycl::detail::is_group::value, multi_ptr> diff --git a/sycl/include/syclcompat/memory.hpp b/sycl/include/syclcompat/memory.hpp index 30f4151f20960..e2735a20b0e99 100644 --- a/sycl/include/syclcompat/memory.hpp +++ b/sycl/include/syclcompat/memory.hpp @@ -68,7 +68,11 @@ namespace syclcompat { -template auto *local_mem() { +template +#ifdef __SYCL_DEVICE_ONLY__ +[[__sycl_detail__::add_ir_attributes_function("sycl-forceinline", true)]] +#endif +__SYCL_ALWAYS_INLINE auto *local_mem() { sycl::multi_ptr As_multi_ptr = sycl::ext::oneapi::group_local_memory_for_overwrite( diff --git a/sycl/test/check_device_code/extensions/group_local_memory.cpp b/sycl/test/check_device_code/extensions/group_local_memory.cpp new file mode 100644 index 0000000000000..5fda89f8a25bf --- /dev/null +++ b/sycl/test/check_device_code/extensions/group_local_memory.cpp @@ -0,0 +1,38 @@ +// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -o - | FileCheck %s +// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -fno-sycl-early-optimizations -o - | FileCheck %s +// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -O0 -o - | FileCheck %s + +// The test checks that multiple calls to the same template instantiation of a +// group local memory function result in separate allocations. + +// CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 +// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 +// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 +// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 + +#include +#include +#include + +using namespace sycl; + +int main() { + queue Q; + + int **Out = malloc_shared(4, Q); + + Q.submit([&](handler &Cgh) { + Cgh.parallel_for(nd_range<1>({1}, {1}), [=](nd_item<1> Item) { + auto Ptr0 = + ext::oneapi::group_local_memory_for_overwrite(Item.get_group()); + auto Ptr1 = + ext::oneapi::group_local_memory_for_overwrite(Item.get_group()); + auto Ptr2 = ext::oneapi::group_local_memory(Item.get_group()); + auto Ptr3 = ext::oneapi::group_local_memory(Item.get_group()); + Out[0] = Ptr0; + Out[1] = Ptr1; + Out[2] = Ptr2; + Out[3] = Ptr3; + }); + }); +} diff --git a/sycl/test/check_device_code/syclcompat_local_mem.cpp b/sycl/test/check_device_code/syclcompat_local_mem.cpp new file mode 100644 index 0000000000000..b8fdd51cc8629 --- /dev/null +++ b/sycl/test/check_device_code/syclcompat_local_mem.cpp @@ -0,0 +1,27 @@ +// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -o - | FileCheck %s + +// The test checks that multiple calls to the same template instantiation of +// syclcompat local_mem function result in separate allocations. + +// CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 +// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 + +#include +#include + +using namespace sycl; + +int main() { + queue Q; + + int **Out = malloc_shared(2, Q); + + Q.submit([&](handler &Cgh) { + Cgh.parallel_for(nd_range<1>({1}, {1}), [=](nd_item<1> Item) { + auto Ptr0 = syclcompat::local_mem(); + auto Ptr1 = syclcompat::local_mem(); + Out[0] = Ptr0; + Out[1] = Ptr1; + }); + }); +}