From e6ce5846246f3fa0f2e0b4e07a154ec11d6d15d9 Mon Sep 17 00:00:00 2001 From: Wenju He Date: Fri, 13 Dec 2024 08:48:13 +0800 Subject: [PATCH 01/12] [SYCL][SYCLLowerWGLocalMemoryPass] Remove implicit dependency on AlwaysInlinerPass and move to PipelineStart Currently SYCLLowerWGLocalMemoryPass must run after AlwaysInlinerPass because in sycl header __sycl_allocateLocalMemory call is wrapped in group_local_memory/group_local_memory_for_overwrite function. Each call to __sycl_allocateLocalMemory represents a unique local memory, so group_local_memory/group_local_memory_for_overwrite must be inlined. The dependency is implicit and prohibits SYCLLowerWGLocalMemoryPass being moved around in the pass pipeline. Since the pass transforms __sycl_allocateLocalMemory call to access of global variable @WGLocalMem, moving the pass to beginning of pipeline could enable more optimization than the function call does. In addition, intel gpu compiler has a pass to transform global variable in addrspace(3) to alloca that runs after pipeline basic simplification. Therefore, we shall run SYCLLowerWGLocalMemoryPass ealier. --- clang/lib/CodeGen/BackendUtil.cpp | 7 ++-- clang/test/CodeGenSYCL/group-local-memory.cpp | 29 --------------- .../kernel-early-optimization-pipeline.cpp | 31 ++++++++++------ llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp | 35 ++++++++++++++++--- 4 files changed, 54 insertions(+), 48 deletions(-) delete mode 100644 clang/test/CodeGenSYCL/group-local-memory.cpp diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index f617923670204..d849a08404820 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1041,6 +1041,9 @@ void EmitAssemblyHelper::RunOptimizationPipeline( /*FP64ConvEmu=*/CodeGenOpts.FP64ConvEmu, /*ExcludeAspects=*/{"fp64"})); MPM.addPass(SYCLPropagateJointMatrixUsagePass()); + // Allocate static local memory in SYCL kernel scope for each allocation + // call. + MPM.addPass(SYCLLowerWGLocalMemoryPass()); }); else if (LangOpts.SYCLIsHost && !LangOpts.SYCLESIMDBuildHostCode) PB.registerPipelineStartEPCallback( @@ -1184,10 +1187,6 @@ void EmitAssemblyHelper::RunOptimizationPipeline( MPM.addPass(SPIRITTAnnotationsPass()); } - // Allocate static local memory in SYCL kernel scope for each allocation - // call. - MPM.addPass(SYCLLowerWGLocalMemoryPass()); - // Process properties and annotations MPM.addPass(CompileTimePropertiesPass()); diff --git a/clang/test/CodeGenSYCL/group-local-memory.cpp b/clang/test/CodeGenSYCL/group-local-memory.cpp deleted file mode 100644 index 02610e33760ab..0000000000000 --- a/clang/test/CodeGenSYCL/group-local-memory.cpp +++ /dev/null @@ -1,29 +0,0 @@ -// Check that SYCLLowerWGLocalMemory pass is added to the SYCL device -// compilation pipeline with the inliner pass (new Pass Manager). - -// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -emit-llvm -O2 \ -// RUN: -mdebug-pass Structure %s -o /dev/null 2>&1 \ -// RUN: | FileCheck %s -check-prefixes=CHECK-INL,CHECK - -// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -emit-llvm -O0 \ -// RUN: -mdebug-pass Structure %s -o /dev/null 2>&1 \ -// RUN: | FileCheck %s --check-prefixes=CHECK-ALWINL,CHECK - -// Check that AlwaysInliner pass is always run for compilation of SYCL device -// target code, even if all optimizations are disabled. - -// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -emit-llvm -fno-sycl-early-optimizations \ -// RUN: -mdebug-pass Structure %s -o /dev/null 2>&1 \ -// RUN: | FileCheck %s --check-prefixes=CHECK-ALWINL,CHECK - -// CHECK-INL: Running pass: ModuleInlinerWrapperPass on [module] -// CHECK-ALWINL: Running pass: AlwaysInlinerPass on [module] -// CHECK: Running pass: SYCLLowerWGLocalMemoryPass on [module] - -// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -emit-llvm -disable-llvm-passes \ -// RUN: -mdebug-pass Structure %s -o /dev/null 2>&1 \ -// RUN: | FileCheck %s --check-prefixes=CHECK-NO-PASSES-ALWINL,CHECK-NO-PASSES,CHECK-NO-PASSES-INL - -// CHECK-NO-PASSES-INL-NOT: Running pass: ModuleInlinerWrapperPass on [module] -// CHECK-NO-PASSES-ALWINL-NOT: Running pass: AlwaysInlinerPass on [module] -// CHECK-NO-PASSES-NOT: Running pass: SYCLLowerWGLocalMemoryPass on [module] diff --git a/clang/test/CodeGenSYCL/kernel-early-optimization-pipeline.cpp b/clang/test/CodeGenSYCL/kernel-early-optimization-pipeline.cpp index 17527b58e5a8e..c75e48b9727a9 100644 --- a/clang/test/CodeGenSYCL/kernel-early-optimization-pipeline.cpp +++ b/clang/test/CodeGenSYCL/kernel-early-optimization-pipeline.cpp @@ -2,15 +2,24 @@ // SYCL device target, and can be disabled with -fno-sycl-early-optimizations. // New pass manager doesn't print all passes tree, only module level. // -// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-NEWPM-EARLYOPT -// CHECK-NEWPM-EARLYOPT: ConstantMergePass -// CHECK-NEWPM-EARLYOPT: SYCLMutatePrintfAddrspacePass +// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -o /dev/null 2>&1 | FileCheck %s +// CHECK: SYCLVirtualFunctionsAnalysisPass +// CHECK: ESIMDVerifierPass +// CHECK: SYCLConditionalCallOnDevicePass +// CHECK: SYCLPropagateAspectsUsagePass +// CHECK: SYCLPropagateJointMatrixUsagePass +// CHECK: SYCLLowerWGLocalMemoryPass +// CHECK: InferFunctionAttrsPass +// CHECK: AlwaysInlinerPass +// CHECK: ModuleInlinerWrapperPass +// CHECK: ConstantMergePass +// CHECK: SYCLMutatePrintfAddrspacePass +// CHECK: SYCLPropagateAspectsUsagePass +// CHECK: SYCLAddOptLevelAttributePass +// CHECK: CompileTimePropertiesPass +// CHECK: RecordSYCLAspectNamesPass +// CHECK: CleanupSYCLMetadataPass // -// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -fno-sycl-early-optimizations -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-NEWPM-NOEARLYOPT -// CHECK-NEWPM-NOEARLYOPT-NOT: ConstantMergePass -// CHECK-NEWPM-NOEARLYOPT: SYCLMutatePrintfAddrspacePass - -// Checks that the compile time properties pass is added into the compilation pipeline -// -// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-COMPTIMEPROPS -// CHECK-COMPTIMEPROPS: Running pass: CompileTimePropertiesPass on [module] +// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -fno-sycl-early-optimizations -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-NOEARLYOPT +// CHECK-NOEARLYOPT-NOT: ConstantMergePass1 +// CHECK-NOEARLYOPT: SYCLMutatePrintfAddrspacePass diff --git a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp index 1ca82ae078df0..803b05d2c7a99 100644 --- a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp +++ b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp @@ -9,10 +9,12 @@ //===----------------------------------------------------------------------===// #include "llvm/SYCLLowerIR/LowerWGLocalMemory.h" +#include "llvm/Demangle/Demangle.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/Pass.h" +#include "llvm/Transforms/Utils/Cloning.h" using namespace llvm; @@ -52,6 +54,32 @@ ModulePass *llvm::createSYCLLowerWGLocalMemoryLegacyPass() { return new SYCLLowerWGLocalMemoryLegacy(); } +static bool inlineAllocateLocalMemoryFunc(Module &M) { + Function *ALMFunc = M.getFunction(SYCL_ALLOCLOCALMEM_CALL); + if (!ALMFunc) + return false; + + auto *Caller = cast(*ALMFunc->user_begin())->getFunction(); + if (!Caller->hasFnAttribute(Attribute::AlwaysInline)) { + // Already inlined. + return false; + } + std::string FName = llvm::demangle(Caller->getName()); + if (FName.find("sycl::_V1::ext::oneapi::group_local_memory") == + std::string::npos) { + // Already inlined. + return false; + } + for (User *U : make_early_inc_range(Caller->users())) { + auto *CI = cast(U); + InlineFunctionInfo IFI; + [[maybe_unused]] auto Result = InlineFunction(*CI, IFI); + assert(Result.isSuccess() && "inlining failed"); + } + Caller->eraseFromParent(); + return true; +} + // TODO: It should be checked that __sycl_allocateLocalMemory (or its source // form - group_local_memory) does not occur: // - in a function (other than user lambda/functor) @@ -94,7 +122,6 @@ static bool allocaWGLocalMemory(Module &M) { Function *ALMFunc = M.getFunction(SYCL_ALLOCLOCALMEM_CALL); if (!ALMFunc) return false; - assert(ALMFunc->isDeclaration() && "should have declaration only"); SmallVector DelCalls; @@ -118,7 +145,7 @@ static bool allocaWGLocalMemory(Module &M) { PreservedAnalyses SYCLLowerWGLocalMemoryPass::run(Module &M, ModuleAnalysisManager &) { - if (allocaWGLocalMemory(M)) - return PreservedAnalyses::none(); - return PreservedAnalyses::all(); + bool Changed = inlineAllocateLocalMemoryFunc(M); + Changed |= allocaWGLocalMemory(M); + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); } From a4f8382d710a7cdfd42237a93d6c15569d93b9c9 Mon Sep 17 00:00:00 2001 From: Wenju He Date: Fri, 13 Dec 2024 10:15:59 +0800 Subject: [PATCH 02/12] fix inlineGroupLocalMemoryFunc --- clang/lib/CodeGen/BackendUtil.cpp | 3 +- llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp | 50 ++++++++++++--------- 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 7b1b19ab2dcc1..089579b1ebd01 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1042,8 +1042,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline( /*FP64ConvEmu=*/CodeGenOpts.FP64ConvEmu, /*ExcludeAspects=*/{"fp64"})); MPM.addPass(SYCLPropagateJointMatrixUsagePass()); - // Allocate static local memory in SYCL kernel scope for each allocation - // call. + // Lowers static/dynamic local memory builtin calls. MPM.addPass(SYCLLowerWGLocalMemoryPass()); }); else if (LangOpts.SYCLIsHost && !LangOpts.SYCLESIMDBuildHostCode) diff --git a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp index 42596a6ced9c9..78e16211336c0 100644 --- a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp +++ b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp @@ -19,7 +19,7 @@ using namespace llvm; -#define DEBUG_TYPE "LowerWGLocalMemory" +#define DEBUG_TYPE "sycllowerwglocalmemory" static constexpr char SYCL_ALLOCLOCALMEM_CALL[] = "__sycl_allocateLocalMemory"; static constexpr char SYCL_DYNAMIC_LOCALMEM_CALL[] = @@ -86,29 +86,37 @@ ModulePass *llvm::createSYCLLowerWGLocalMemoryLegacyPass() { return new SYCLLowerWGLocalMemoryLegacy(); } -static bool inlineAllocateLocalMemoryFunc(Module &M) { +// In sycl header __sycl_allocateLocalMemory builtin call is wrapped in +// group_local_memory/group_local_memory_for_overwrite functions, which must be +// inlined first before each __sycl_allocateLocalMemory call can be lowered to a +// unique global variable. Inlining them here so that this pass doesn't have +// implicit dependency on AlwaysInlinerPass. +static bool inlineGroupLocalMemoryFunc(Module &M) { Function *ALMFunc = M.getFunction(SYCL_ALLOCLOCALMEM_CALL); - if (!ALMFunc) + if (!ALMFunc || ALMFunc->use_empty()) return false; - auto *Caller = cast(*ALMFunc->user_begin())->getFunction(); - if (!Caller->hasFnAttribute(Attribute::AlwaysInline)) { - // Already inlined. - return false; - } - std::string FName = llvm::demangle(Caller->getName()); - if (FName.find("sycl::_V1::ext::oneapi::group_local_memory") == - std::string::npos) { - // Already inlined. - return false; - } - for (User *U : make_early_inc_range(Caller->users())) { - auto *CI = cast(U); - InlineFunctionInfo IFI; - [[maybe_unused]] auto Result = InlineFunction(*CI, IFI); - assert(Result.isSuccess() && "inlining failed"); + for (auto *U : ALMFunc->users()) { + auto *Caller = cast(U)->getFunction(); + if (!Caller->hasFnAttribute(Attribute::AlwaysInline)) { + // Already inlined. + return false; + } + std::string FName = llvm::demangle(Caller->getName()); + if (FName.find("sycl::_V1::ext::oneapi::group_local_memory") == + std::string::npos) { + // Already inlined. + return false; + } + for (auto *U2 : make_early_inc_range(Caller->users())) { + auto *CI = cast(U2); + InlineFunctionInfo IFI; + [[maybe_unused]] auto Result = InlineFunction(*CI, IFI); + assert(Result.isSuccess() && "inlining failed"); + } + Caller->eraseFromParent(); } - Caller->eraseFromParent(); + return true; } @@ -345,7 +353,7 @@ static bool dynamicWGLocalMemory(Module &M) { PreservedAnalyses SYCLLowerWGLocalMemoryPass::run(Module &M, ModuleAnalysisManager &) { - bool Changed = inlineAllocateLocalMemoryFunc(M); + bool Changed = inlineGroupLocalMemoryFunc(M); Changed |= allocaWGLocalMemory(M); Changed |= dynamicWGLocalMemory(M); return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); From b42fd22b5e17e50d682a49b880eed3685f90130c Mon Sep 17 00:00:00 2001 From: Wenju He Date: Fri, 13 Dec 2024 10:34:28 +0800 Subject: [PATCH 03/12] inlineGroupLocalMemoryFunc: return false -> continue --- llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp index 78e16211336c0..dbdf962b0c84b 100644 --- a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp +++ b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp @@ -96,17 +96,18 @@ static bool inlineGroupLocalMemoryFunc(Module &M) { if (!ALMFunc || ALMFunc->use_empty()) return false; + bool Changed = false; for (auto *U : ALMFunc->users()) { auto *Caller = cast(U)->getFunction(); if (!Caller->hasFnAttribute(Attribute::AlwaysInline)) { // Already inlined. - return false; + continue; } std::string FName = llvm::demangle(Caller->getName()); if (FName.find("sycl::_V1::ext::oneapi::group_local_memory") == std::string::npos) { // Already inlined. - return false; + continue; } for (auto *U2 : make_early_inc_range(Caller->users())) { auto *CI = cast(U2); @@ -115,9 +116,10 @@ static bool inlineGroupLocalMemoryFunc(Module &M) { assert(Result.isSuccess() && "inlining failed"); } Caller->eraseFromParent(); + Changed = true; } - return true; + return Changed; } // TODO: It should be checked that __sycl_allocateLocalMemory (or its source From 44db66ad74ee59f8b7f1c6c7cd451f1182dd6a04 Mon Sep 17 00:00:00 2001 From: Wenju He Date: Mon, 16 Dec 2024 11:21:16 +0800 Subject: [PATCH 04/12] check device code --- .../SYCLLowerIR/group_local_memory_inline.ll | 65 +++++++++++++++++++ .../extensions/group_local_memory.cpp | 40 ++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 llvm/test/SYCLLowerIR/group_local_memory_inline.ll create mode 100644 sycl/test/check_device_code/extensions/group_local_memory.cpp diff --git a/llvm/test/SYCLLowerIR/group_local_memory_inline.ll b/llvm/test/SYCLLowerIR/group_local_memory_inline.ll new file mode 100644 index 0000000000000..9ddd7feee661a --- /dev/null +++ b/llvm/test/SYCLLowerIR/group_local_memory_inline.ll @@ -0,0 +1,65 @@ +; RUN: opt < %s -passes=sycllowerwglocalmemory -S | FileCheck %s + +; Check group_local_memory_for_overwrite and group_local_memory functions are inlined. +; Check __sycl_allocateLocalMemory calls are lowered to four separate allocations. + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" +target triple = "spir64-unknown-unknown" + +%"class.sycl::_V1::multi_ptr" = type { ptr addrspace(3) } +%"class.sycl::_V1::group" = type { %"class.sycl::_V1::range", %"class.sycl::_V1::range", %"class.sycl::_V1::range", %"class.sycl::_V1::id" } +%"class.sycl::_V1::range" = type { %"class.sycl::_V1::detail::array" } +%"class.sycl::_V1::detail::array" = type { [1 x i64] } +%"class.sycl::_V1::id" = type { %"class.sycl::_V1::detail::array" } + +; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 +; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 +; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 +; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 + +; Function Attrs: alwaysinline +define internal spir_func void @_ZZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_ENKUlNS0_7nd_itemILi1EEEE_clES5_() #0 { +entry: +; CHECK: define internal spir_func void @_ZZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_ENKUlNS0_7nd_itemILi1EEEE_clES5_( +; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8 +; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8 +; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8 +; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8 + + %Ptr = alloca %"class.sycl::_V1::multi_ptr", align 8 + %agg = alloca %"class.sycl::_V1::group", align 8 + %Ptr.ascast = addrspacecast ptr %Ptr to ptr addrspace(4) + call spir_func void @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg) + call spir_func void @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg) + call spir_func void @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg) + call spir_func void @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg) + ret void +} + +; CHECK-NOT: define {{.*}} @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_( + +; Function Attrs: alwaysinline +define spir_func void @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %agg.result, ptr byval(%"class.sycl::_V1::group") align 8 %g) #0 { +entry: + %AllocatedMem = alloca ptr addrspace(3), align 8 + %AllocatedMem.ascast = addrspacecast ptr %AllocatedMem to ptr addrspace(4) + %call = call spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 4, i64 4) + store ptr addrspace(3) %call, ptr addrspace(4) %AllocatedMem.ascast, align 8 + ret void +} + +; CHECK-NOT: define {{.*}} @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_( + +; Function Attrs: alwaysinline +define spir_func void @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %agg.result, ptr byval(%"class.sycl::_V1::group") align 8 %g) #0 { +entry: + %AllocatedMem = alloca ptr addrspace(3), align 8 + %AllocatedMem.ascast = addrspacecast ptr %AllocatedMem to ptr addrspace(4) + %call = call spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 4, i64 4) + store ptr addrspace(3) %call, ptr addrspace(4) %AllocatedMem.ascast, align 8 + ret void +} + +declare spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 noundef, i64 noundef) + +attributes #0 = { alwaysinline } diff --git a/sycl/test/check_device_code/extensions/group_local_memory.cpp b/sycl/test/check_device_code/extensions/group_local_memory.cpp new file mode 100644 index 0000000000000..4c09755dabde6 --- /dev/null +++ b/sycl/test/check_device_code/extensions/group_local_memory.cpp @@ -0,0 +1,40 @@ +// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -o - | FileCheck %s +// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -fno-sycl-early-optimizations -o - | FileCheck %s +// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -O0 -o - | FileCheck %s + +// The test checks that multiple calls to the same template instantiation of a +// group local memory function result in separate allocations. + +#include +#include +#include + +using namespace sycl; + +int main() { + queue Q; + + int **Out = malloc_shared(4, Q); + + Q.submit([&](handler &Cgh) { + Cgh.parallel_for(nd_range<1>({1}, {1}), [=](nd_item<1> Item) { + auto Ptr0 = ext::oneapi::group_local_memory_for_overwrite( + Item.get_group()); + auto Ptr1 = ext::oneapi::group_local_memory_for_overwrite( + Item.get_group()); + auto Ptr2 = ext::oneapi::group_local_memory( + Item.get_group()); + auto Ptr3 = ext::oneapi::group_local_memory( + Item.get_group()); + Out[0] = Ptr0; + Out[1] = Ptr1; + Out[2] = Ptr2; + Out[3] = Ptr3; + }); + }); +} + +// CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 +// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 +// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 +// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 From 97add86817c096511f85f26edc10dfcbe551a4ed Mon Sep 17 00:00:00 2001 From: Wenju He Date: Mon, 16 Dec 2024 11:39:15 +0800 Subject: [PATCH 05/12] clang-format --- .../extensions/group_local_memory.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/sycl/test/check_device_code/extensions/group_local_memory.cpp b/sycl/test/check_device_code/extensions/group_local_memory.cpp index 4c09755dabde6..21222a96375ab 100644 --- a/sycl/test/check_device_code/extensions/group_local_memory.cpp +++ b/sycl/test/check_device_code/extensions/group_local_memory.cpp @@ -18,14 +18,12 @@ int main() { Q.submit([&](handler &Cgh) { Cgh.parallel_for(nd_range<1>({1}, {1}), [=](nd_item<1> Item) { - auto Ptr0 = ext::oneapi::group_local_memory_for_overwrite( - Item.get_group()); - auto Ptr1 = ext::oneapi::group_local_memory_for_overwrite( - Item.get_group()); - auto Ptr2 = ext::oneapi::group_local_memory( - Item.get_group()); - auto Ptr3 = ext::oneapi::group_local_memory( - Item.get_group()); + auto Ptr0 = + ext::oneapi::group_local_memory_for_overwrite(Item.get_group()); + auto Ptr1 = + ext::oneapi::group_local_memory_for_overwrite(Item.get_group()); + auto Ptr2 = ext::oneapi::group_local_memory(Item.get_group()); + auto Ptr3 = ext::oneapi::group_local_memory(Item.get_group()); Out[0] = Ptr0; Out[1] = Ptr1; Out[2] = Ptr2; From a4fe9152efda3e8a28d78b736a4b51d02be82157 Mon Sep 17 00:00:00 2001 From: Wenju He Date: Tue, 17 Dec 2024 13:54:09 +0800 Subject: [PATCH 06/12] add ir attribute sycl_forceinline to group_local_memory --- llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp | 8 +------- llvm/test/SYCLLowerIR/group_local_memory_inline.ll | 5 +++-- sycl/include/sycl/ext/oneapi/group_local_memory.hpp | 10 ++++++++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp index dbdf962b0c84b..a272d077509d1 100644 --- a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp +++ b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp @@ -99,13 +99,7 @@ static bool inlineGroupLocalMemoryFunc(Module &M) { bool Changed = false; for (auto *U : ALMFunc->users()) { auto *Caller = cast(U)->getFunction(); - if (!Caller->hasFnAttribute(Attribute::AlwaysInline)) { - // Already inlined. - continue; - } - std::string FName = llvm::demangle(Caller->getName()); - if (FName.find("sycl::_V1::ext::oneapi::group_local_memory") == - std::string::npos) { + if (!Caller->hasFnAttribute("sycl_forceinline")) { // Already inlined. continue; } diff --git a/llvm/test/SYCLLowerIR/group_local_memory_inline.ll b/llvm/test/SYCLLowerIR/group_local_memory_inline.ll index 9ddd7feee661a..29c5add1aef47 100644 --- a/llvm/test/SYCLLowerIR/group_local_memory_inline.ll +++ b/llvm/test/SYCLLowerIR/group_local_memory_inline.ll @@ -39,7 +39,7 @@ entry: ; CHECK-NOT: define {{.*}} @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_( ; Function Attrs: alwaysinline -define spir_func void @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %agg.result, ptr byval(%"class.sycl::_V1::group") align 8 %g) #0 { +define spir_func void @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %agg.result, ptr byval(%"class.sycl::_V1::group") align 8 %g) #1 { entry: %AllocatedMem = alloca ptr addrspace(3), align 8 %AllocatedMem.ascast = addrspacecast ptr %AllocatedMem to ptr addrspace(4) @@ -51,7 +51,7 @@ entry: ; CHECK-NOT: define {{.*}} @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_( ; Function Attrs: alwaysinline -define spir_func void @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %agg.result, ptr byval(%"class.sycl::_V1::group") align 8 %g) #0 { +define spir_func void @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %agg.result, ptr byval(%"class.sycl::_V1::group") align 8 %g) #1 { entry: %AllocatedMem = alloca ptr addrspace(3), align 8 %AllocatedMem.ascast = addrspacecast ptr %AllocatedMem to ptr addrspace(4) @@ -63,3 +63,4 @@ entry: declare spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 noundef, i64 noundef) attributes #0 = { alwaysinline } +attributes #1 = { "sycl_forceinline"="true" } diff --git a/sycl/include/sycl/ext/oneapi/group_local_memory.hpp b/sycl/include/sycl/ext/oneapi/group_local_memory.hpp index 6e65b9acffe8e..c687dcb708ea0 100644 --- a/sycl/include/sycl/ext/oneapi/group_local_memory.hpp +++ b/sycl/include/sycl/ext/oneapi/group_local_memory.hpp @@ -21,10 +21,13 @@ namespace sycl { inline namespace _V1 { namespace ext::oneapi { template +#ifdef __SYCL_DEVICE_ONLY__ +[[__sycl_detail__::add_ir_attributes_function("sycl_forceinline", true)]] +#endif std::enable_if_t< std::is_trivially_destructible_v && sycl::detail::is_group::value, multi_ptr> - __SYCL_ALWAYS_INLINE group_local_memory_for_overwrite(Group g) { +group_local_memory_for_overwrite(Group g) { (void)g; #ifdef __SYCL_DEVICE_ONLY__ __attribute__((opencl_local)) std::uint8_t *AllocatedMem = @@ -44,10 +47,13 @@ std::enable_if_t< } template +#ifdef __SYCL_DEVICE_ONLY__ +[[__sycl_detail__::add_ir_attributes_function("sycl_forceinline", true)]] +#endif std::enable_if_t< std::is_trivially_destructible_v && sycl::detail::is_group::value, multi_ptr> - __SYCL_ALWAYS_INLINE group_local_memory(Group g, Args &&...args) { +group_local_memory(Group g, Args &&...args) { #ifdef __SYCL_DEVICE_ONLY__ __attribute__((opencl_local)) std::uint8_t *AllocatedMem = __sycl_allocateLocalMemory(sizeof(T), alignof(T)); From d764d00043b3c61f62a711cac42b299f61918ce8 Mon Sep 17 00:00:00 2001 From: Wenju He Date: Tue, 17 Dec 2024 16:17:47 -0800 Subject: [PATCH 07/12] inline syclcompat::local_mem as well --- llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp | 37 +++++++++++-------- .../SYCLLowerIR/group_local_memory_inline.ll | 2 +- .../sycl/ext/oneapi/group_local_memory.hpp | 4 +- sycl/include/syclcompat/memory.hpp | 6 ++- .../extensions/group_local_memory.cpp | 10 ++--- .../syclcompat_local_mem.cpp | 27 ++++++++++++++ 6 files changed, 62 insertions(+), 24 deletions(-) create mode 100644 sycl/test/check_device_code/syclcompat_local_mem.cpp diff --git a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp index a272d077509d1..6f0b4fb5881ac 100644 --- a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp +++ b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "llvm/SYCLLowerIR/LowerWGLocalMemory.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/Demangle/Demangle.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -91,29 +92,35 @@ ModulePass *llvm::createSYCLLowerWGLocalMemoryLegacyPass() { // inlined first before each __sycl_allocateLocalMemory call can be lowered to a // unique global variable. Inlining them here so that this pass doesn't have // implicit dependency on AlwaysInlinerPass. +// +// syclcompat::local_mem, which represents a unique allocation, calls +// group_local_memory_for_overwrite. So local_mem should be inlined as well. static bool inlineGroupLocalMemoryFunc(Module &M) { Function *ALMFunc = M.getFunction(SYCL_ALLOCLOCALMEM_CALL); if (!ALMFunc || ALMFunc->use_empty()) return false; - bool Changed = false; - for (auto *U : ALMFunc->users()) { - auto *Caller = cast(U)->getFunction(); - if (!Caller->hasFnAttribute("sycl_forceinline")) { - // Already inlined. - continue; - } - for (auto *U2 : make_early_inc_range(Caller->users())) { - auto *CI = cast(U2); - InlineFunctionInfo IFI; - [[maybe_unused]] auto Result = InlineFunction(*CI, IFI); - assert(Result.isSuccess() && "inlining failed"); + SmallVector WorkList{ALMFunc}; + DenseSet Visited; + while (!WorkList.empty()) { + auto *F = WorkList.pop_back_val(); + for (auto *U : make_early_inc_range(F->users())) { + auto *CI = cast(U); + auto *Caller = CI->getFunction(); + if (Caller->hasFnAttribute("sycl-forceinline") && + Visited.insert(Caller).second) + WorkList.push_back(Caller); + if (F != ALMFunc) { + InlineFunctionInfo IFI; + [[maybe_unused]] auto Result = InlineFunction(*CI, IFI); + assert(Result.isSuccess() && "inlining failed"); + } } - Caller->eraseFromParent(); - Changed = true; } + for (auto *F : Visited) + F->eraseFromParent(); - return Changed; + return !Visited.empty(); } // TODO: It should be checked that __sycl_allocateLocalMemory (or its source diff --git a/llvm/test/SYCLLowerIR/group_local_memory_inline.ll b/llvm/test/SYCLLowerIR/group_local_memory_inline.ll index 29c5add1aef47..d5ba4d0d774a4 100644 --- a/llvm/test/SYCLLowerIR/group_local_memory_inline.ll +++ b/llvm/test/SYCLLowerIR/group_local_memory_inline.ll @@ -63,4 +63,4 @@ entry: declare spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 noundef, i64 noundef) attributes #0 = { alwaysinline } -attributes #1 = { "sycl_forceinline"="true" } +attributes #1 = { "sycl-forceinline"="true" } diff --git a/sycl/include/sycl/ext/oneapi/group_local_memory.hpp b/sycl/include/sycl/ext/oneapi/group_local_memory.hpp index c687dcb708ea0..99364c29a821a 100644 --- a/sycl/include/sycl/ext/oneapi/group_local_memory.hpp +++ b/sycl/include/sycl/ext/oneapi/group_local_memory.hpp @@ -22,7 +22,7 @@ inline namespace _V1 { namespace ext::oneapi { template #ifdef __SYCL_DEVICE_ONLY__ -[[__sycl_detail__::add_ir_attributes_function("sycl_forceinline", true)]] +[[__sycl_detail__::add_ir_attributes_function("sycl-forceinline", true)]] #endif std::enable_if_t< std::is_trivially_destructible_v && sycl::detail::is_group::value, @@ -48,7 +48,7 @@ group_local_memory_for_overwrite(Group g) { template #ifdef __SYCL_DEVICE_ONLY__ -[[__sycl_detail__::add_ir_attributes_function("sycl_forceinline", true)]] +[[__sycl_detail__::add_ir_attributes_function("sycl-forceinline", true)]] #endif std::enable_if_t< std::is_trivially_destructible_v && sycl::detail::is_group::value, diff --git a/sycl/include/syclcompat/memory.hpp b/sycl/include/syclcompat/memory.hpp index 30f4151f20960..bb469e6ad8340 100644 --- a/sycl/include/syclcompat/memory.hpp +++ b/sycl/include/syclcompat/memory.hpp @@ -68,7 +68,11 @@ namespace syclcompat { -template auto *local_mem() { +template +#ifdef __SYCL_DEVICE_ONLY__ +[[__sycl_detail__::add_ir_attributes_function("sycl-forceinline", true)]] +#endif +auto *local_mem() { sycl::multi_ptr As_multi_ptr = sycl::ext::oneapi::group_local_memory_for_overwrite( diff --git a/sycl/test/check_device_code/extensions/group_local_memory.cpp b/sycl/test/check_device_code/extensions/group_local_memory.cpp index 21222a96375ab..5fda89f8a25bf 100644 --- a/sycl/test/check_device_code/extensions/group_local_memory.cpp +++ b/sycl/test/check_device_code/extensions/group_local_memory.cpp @@ -5,6 +5,11 @@ // The test checks that multiple calls to the same template instantiation of a // group local memory function result in separate allocations. +// CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 +// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 +// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 +// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 + #include #include #include @@ -31,8 +36,3 @@ int main() { }); }); } - -// CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 -// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 -// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 -// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 diff --git a/sycl/test/check_device_code/syclcompat_local_mem.cpp b/sycl/test/check_device_code/syclcompat_local_mem.cpp new file mode 100644 index 0000000000000..bb25cd62c5d6a --- /dev/null +++ b/sycl/test/check_device_code/syclcompat_local_mem.cpp @@ -0,0 +1,27 @@ +// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -o - | FileCheck %s + +// The test checks that multiple calls to the same template instantiation of +// syclcompat local_mem function result in separate allocations. + +// CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 +// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 + +#include +#include + +using namespace sycl; + +int main() { + queue Q; + + int **Out = malloc_shared(4, Q); + + Q.submit([&](handler &Cgh) { + Cgh.parallel_for(nd_range<1>({1}, {1}), [=](nd_item<1> Item) { + auto Ptr0 = syclcompat::local_mem(); + auto Ptr1 = syclcompat::local_mem(); + Out[0] = Ptr0; + Out[1] = Ptr1; + }); + }); +} From a765d797ea9d9bf0b89750b185649675fa25251a Mon Sep 17 00:00:00 2001 From: Wenju He Date: Tue, 17 Dec 2024 17:34:44 -0800 Subject: [PATCH 08/12] unique -> distinct --- llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp index 6f0b4fb5881ac..e95ec490979bc 100644 --- a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp +++ b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp @@ -90,10 +90,10 @@ ModulePass *llvm::createSYCLLowerWGLocalMemoryLegacyPass() { // In sycl header __sycl_allocateLocalMemory builtin call is wrapped in // group_local_memory/group_local_memory_for_overwrite functions, which must be // inlined first before each __sycl_allocateLocalMemory call can be lowered to a -// unique global variable. Inlining them here so that this pass doesn't have +// distinct global variable. Inlining them here so that this pass doesn't have // implicit dependency on AlwaysInlinerPass. // -// syclcompat::local_mem, which represents a unique allocation, calls +// syclcompat::local_mem, which represents a distinct allocation, calls // group_local_memory_for_overwrite. So local_mem should be inlined as well. static bool inlineGroupLocalMemoryFunc(Module &M) { Function *ALMFunc = M.getFunction(SYCL_ALLOCLOCALMEM_CALL); From 0de9070b2b48f409c0f2ac30f9ab797c62f79222 Mon Sep 17 00:00:00 2001 From: Wenju He Date: Tue, 17 Dec 2024 19:59:49 -0800 Subject: [PATCH 09/12] change 4 to 2 in check_device_code/syclcompat_local_mem.cpp --- sycl/test/check_device_code/syclcompat_local_mem.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test/check_device_code/syclcompat_local_mem.cpp b/sycl/test/check_device_code/syclcompat_local_mem.cpp index bb25cd62c5d6a..b8fdd51cc8629 100644 --- a/sycl/test/check_device_code/syclcompat_local_mem.cpp +++ b/sycl/test/check_device_code/syclcompat_local_mem.cpp @@ -14,7 +14,7 @@ using namespace sycl; int main() { queue Q; - int **Out = malloc_shared(4, Q); + int **Out = malloc_shared(2, Q); Q.submit([&](handler &Cgh) { Cgh.parallel_for(nd_range<1>({1}, {1}), [=](nd_item<1> Item) { From db2ad4a2b6f5139d266e17210a69f7be1404251f Mon Sep 17 00:00:00 2001 From: Wenju He Date: Tue, 17 Dec 2024 20:21:58 -0800 Subject: [PATCH 10/12] deterministic order of erasing functions --- llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp index e95ec490979bc..ec28ade5b866c 100644 --- a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp +++ b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp @@ -116,9 +116,9 @@ static bool inlineGroupLocalMemoryFunc(Module &M) { assert(Result.isSuccess() && "inlining failed"); } } + if (F != ALMFunc) + F->eraseFromParent(); } - for (auto *F : Visited) - F->eraseFromParent(); return !Visited.empty(); } From 442fe98c75d30687a6e829385a7ff228f401f978 Mon Sep 17 00:00:00 2001 From: Wenju He Date: Thu, 19 Dec 2024 08:55:36 +0800 Subject: [PATCH 11/12] add back __SYCL_ALWAYS_INLINE --- sycl/include/sycl/ext/oneapi/group_local_memory.hpp | 4 ++-- sycl/include/syclcompat/memory.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/group_local_memory.hpp b/sycl/include/sycl/ext/oneapi/group_local_memory.hpp index 99364c29a821a..8b0b39c20fd39 100644 --- a/sycl/include/sycl/ext/oneapi/group_local_memory.hpp +++ b/sycl/include/sycl/ext/oneapi/group_local_memory.hpp @@ -27,7 +27,7 @@ template std::enable_if_t< std::is_trivially_destructible_v && sycl::detail::is_group::value, multi_ptr> -group_local_memory_for_overwrite(Group g) { + __SYCL_ALWAYS_INLINE group_local_memory_for_overwrite(Group g) { (void)g; #ifdef __SYCL_DEVICE_ONLY__ __attribute__((opencl_local)) std::uint8_t *AllocatedMem = @@ -53,7 +53,7 @@ template std::enable_if_t< std::is_trivially_destructible_v && sycl::detail::is_group::value, multi_ptr> -group_local_memory(Group g, Args &&...args) { + __SYCL_ALWAYS_INLINE group_local_memory(Group g, Args &&...args) { #ifdef __SYCL_DEVICE_ONLY__ __attribute__((opencl_local)) std::uint8_t *AllocatedMem = __sycl_allocateLocalMemory(sizeof(T), alignof(T)); diff --git a/sycl/include/syclcompat/memory.hpp b/sycl/include/syclcompat/memory.hpp index bb469e6ad8340..e2735a20b0e99 100644 --- a/sycl/include/syclcompat/memory.hpp +++ b/sycl/include/syclcompat/memory.hpp @@ -72,7 +72,7 @@ template #ifdef __SYCL_DEVICE_ONLY__ [[__sycl_detail__::add_ir_attributes_function("sycl-forceinline", true)]] #endif -auto *local_mem() { +__SYCL_ALWAYS_INLINE auto *local_mem() { sycl::multi_ptr As_multi_ptr = sycl::ext::oneapi::group_local_memory_for_overwrite( From 275d60d997234c312099e3a0c5e99834cafdaa9d Mon Sep 17 00:00:00 2001 From: Wenju He Date: Fri, 27 Dec 2024 12:48:38 +0800 Subject: [PATCH 12/12] remove #include llvm/Demangle/Demangle.h --- llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp index ec28ade5b866c..f366e0e61668b 100644 --- a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp +++ b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp @@ -10,7 +10,6 @@ #include "llvm/SYCLLowerIR/LowerWGLocalMemory.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/Demangle/Demangle.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h"