Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL][SYCLLowerWGLocalMemoryPass] Remove implicit dependency on AlwaysInlinerPass and move to PipelineStart #16356

Open
wants to merge 13 commits into
base: sycl
Choose a base branch
from
Open
6 changes: 2 additions & 4 deletions clang/lib/CodeGen/BackendUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1042,6 +1042,8 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
/*FP64ConvEmu=*/CodeGenOpts.FP64ConvEmu,
/*ExcludeAspects=*/{"fp64"}));
MPM.addPass(SYCLPropagateJointMatrixUsagePass());
// Lowers static/dynamic local memory builtin calls.
MPM.addPass(SYCLLowerWGLocalMemoryPass());
});
else if (LangOpts.SYCLIsHost && !LangOpts.SYCLESIMDBuildHostCode)
PB.registerPipelineStartEPCallback(
Expand Down Expand Up @@ -1191,10 +1193,6 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
MPM.addPass(SPIRITTAnnotationsPass());
}

// Allocate static local memory in SYCL kernel scope for each allocation
// call.
MPM.addPass(SYCLLowerWGLocalMemoryPass());

// Process properties and annotations
MPM.addPass(CompileTimePropertiesPass());

Expand Down
29 changes: 0 additions & 29 deletions clang/test/CodeGenSYCL/group-local-memory.cpp

This file was deleted.

31 changes: 20 additions & 11 deletions clang/test/CodeGenSYCL/kernel-early-optimization-pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,24 @@
// SYCL device target, and can be disabled with -fno-sycl-early-optimizations.
// New pass manager doesn't print all passes tree, only module level.
//
// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-NEWPM-EARLYOPT
// CHECK-NEWPM-EARLYOPT: ConstantMergePass
// CHECK-NEWPM-EARLYOPT: SYCLMutatePrintfAddrspacePass
// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -o /dev/null 2>&1 | FileCheck %s
// CHECK: SYCLVirtualFunctionsAnalysisPass
// CHECK: ESIMDVerifierPass
// CHECK: SYCLConditionalCallOnDevicePass
// CHECK: SYCLPropagateAspectsUsagePass
// CHECK: SYCLPropagateJointMatrixUsagePass
// CHECK: SYCLLowerWGLocalMemoryPass
// CHECK: InferFunctionAttrsPass
// CHECK: AlwaysInlinerPass
// CHECK: ModuleInlinerWrapperPass
// CHECK: ConstantMergePass
// CHECK: SYCLMutatePrintfAddrspacePass
// CHECK: SYCLPropagateAspectsUsagePass
// CHECK: SYCLAddOptLevelAttributePass
// CHECK: CompileTimePropertiesPass
// CHECK: RecordSYCLAspectNamesPass
// CHECK: CleanupSYCLMetadataPass
//
// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -fno-sycl-early-optimizations -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-NEWPM-NOEARLYOPT
// CHECK-NEWPM-NOEARLYOPT-NOT: ConstantMergePass
// CHECK-NEWPM-NOEARLYOPT: SYCLMutatePrintfAddrspacePass

// Checks that the compile time properties pass is added into the compilation pipeline
//
// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-COMPTIMEPROPS
// CHECK-COMPTIMEPROPS: Running pass: CompileTimePropertiesPass on [module]
// RUN: %clang_cc1 -O2 -fsycl-is-device -triple spir64-unknown-unknown %s -mdebug-pass Structure -emit-llvm -fno-sycl-early-optimizations -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-NOEARLYOPT
// CHECK-NOEARLYOPT-NOT: ConstantMergePass1
// CHECK-NOEARLYOPT: SYCLMutatePrintfAddrspacePass
49 changes: 43 additions & 6 deletions llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,17 @@
//===----------------------------------------------------------------------===//

#include "llvm/SYCLLowerIR/LowerWGLocalMemory.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/Pass.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/Cloning.h"

using namespace llvm;

#define DEBUG_TYPE "LowerWGLocalMemory"
#define DEBUG_TYPE "sycllowerwglocalmemory"

static constexpr char SYCL_ALLOCLOCALMEM_CALL[] = "__sycl_allocateLocalMemory";
static constexpr char SYCL_DYNAMIC_LOCALMEM_CALL[] =
Expand Down Expand Up @@ -84,6 +86,42 @@ ModulePass *llvm::createSYCLLowerWGLocalMemoryLegacyPass() {
return new SYCLLowerWGLocalMemoryLegacy();
}

// In sycl header __sycl_allocateLocalMemory builtin call is wrapped in
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why can we not rewrite the SYCL headers to 'inline' these calls? Is there a specific reason? Thanks

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why can we not rewrite the SYCL headers to 'inline' these calls? Is there a specific reason? Thanks

We can't ask users to call __sycl_allocateLocalMemory internal intrsinsic when documented interface is sycl::ext::something::something::group_local_memory<T>

// group_local_memory/group_local_memory_for_overwrite functions, which must be
// inlined first before each __sycl_allocateLocalMemory call can be lowered to a
// distinct global variable. Inlining them here so that this pass doesn't have
// implicit dependency on AlwaysInlinerPass.
Comment on lines +92 to +93
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, instead of having two building blocks (i.e. passes) where each does a specific thing, we now have one of them doing both things.

This doesn't sound good from the high-level design point of view. I understand that having an implicit dependency is probably not a good thing, but are there any reasons to remove the dependency completely?

Because many passes have dependencies on each other and there are mechanism to explicitly tell the pass manager about them: The AnalysisUsage::addRequired<> and AnalysisUsage::addRequiredTransitive<> methods. They are mostly used for requesting results of a certain analysis, but they can also be used to request specific transformations to be performed before a certain pass is run. You can find examples of that with LoopSimplify

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't sound good from the high-level design point of view. I understand that having an implicit dependency is probably not a good thing, but are there any reasons to remove the dependency completely?

I agree, but I think it is good to making this pass self-contained.

Because many passes have dependencies on each other and there are mechanism to explicitly tell the pass manager about them: The AnalysisUsage::addRequired<> and AnalysisUsage::addRequiredTransitive<> methods. They are mostly used for requesting results of a certain analysis, but they can also be used to request specific transformations to be performed before a certain pass is run. You can find examples of that with LoopSimplify

This would inline other functions that are not related to what we handle in this pass.

//
// syclcompat::local_mem, which represents a distinct allocation, calls
// group_local_memory_for_overwrite. So local_mem should be inlined as well.
static bool inlineGroupLocalMemoryFunc(Module &M) {
Function *ALMFunc = M.getFunction(SYCL_ALLOCLOCALMEM_CALL);
if (!ALMFunc || ALMFunc->use_empty())
return false;

SmallVector<Function *, 4> WorkList{ALMFunc};
DenseSet<Function *> Visited;
while (!WorkList.empty()) {
auto *F = WorkList.pop_back_val();
for (auto *U : make_early_inc_range(F->users())) {
auto *CI = cast<CallInst>(U);
auto *Caller = CI->getFunction();
if (Caller->hasFnAttribute("sycl-forceinline") &&
Visited.insert(Caller).second)
WorkList.push_back(Caller);
if (F != ALMFunc) {
InlineFunctionInfo IFI;
[[maybe_unused]] auto Result = InlineFunction(*CI, IFI);
assert(Result.isSuccess() && "inlining failed");
}
}
if (F != ALMFunc)
F->eraseFromParent();
}

return !Visited.empty();
}

// TODO: It should be checked that __sycl_allocateLocalMemory (or its source
// form - group_local_memory) does not occur:
// - in a function (other than user lambda/functor)
Expand Down Expand Up @@ -317,9 +355,8 @@ static bool dynamicWGLocalMemory(Module &M) {

PreservedAnalyses SYCLLowerWGLocalMemoryPass::run(Module &M,
ModuleAnalysisManager &) {
bool MadeChanges = allocaWGLocalMemory(M);
MadeChanges = dynamicWGLocalMemory(M) || MadeChanges;
if (MadeChanges)
return PreservedAnalyses::none();
return PreservedAnalyses::all();
bool Changed = inlineGroupLocalMemoryFunc(M);
Changed |= allocaWGLocalMemory(M);
Changed |= dynamicWGLocalMemory(M);
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
jsji marked this conversation as resolved.
Show resolved Hide resolved
}
66 changes: 66 additions & 0 deletions llvm/test/SYCLLowerIR/group_local_memory_inline.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
; RUN: opt < %s -passes=sycllowerwglocalmemory -S | FileCheck %s

; Check group_local_memory_for_overwrite and group_local_memory functions are inlined.
; Check __sycl_allocateLocalMemory calls are lowered to four separate allocations.

target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1"
target triple = "spir64-unknown-unknown"

%"class.sycl::_V1::multi_ptr" = type { ptr addrspace(3) }
%"class.sycl::_V1::group" = type { %"class.sycl::_V1::range", %"class.sycl::_V1::range", %"class.sycl::_V1::range", %"class.sycl::_V1::id" }
%"class.sycl::_V1::range" = type { %"class.sycl::_V1::detail::array" }
%"class.sycl::_V1::detail::array" = type { [1 x i64] }
%"class.sycl::_V1::id" = type { %"class.sycl::_V1::detail::array" }

; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4

; Function Attrs: alwaysinline
define internal spir_func void @_ZZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_ENKUlNS0_7nd_itemILi1EEEE_clES5_() #0 {
entry:
; CHECK: define internal spir_func void @_ZZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_ENKUlNS0_7nd_itemILi1EEEE_clES5_(
; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8
; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8
; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8
; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8

%Ptr = alloca %"class.sycl::_V1::multi_ptr", align 8
%agg = alloca %"class.sycl::_V1::group", align 8
%Ptr.ascast = addrspacecast ptr %Ptr to ptr addrspace(4)
call spir_func void @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg)
call spir_func void @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg)
call spir_func void @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg)
call spir_func void @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg)
ret void
}

; CHECK-NOT: define {{.*}} @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(

; Function Attrs: alwaysinline
define spir_func void @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %agg.result, ptr byval(%"class.sycl::_V1::group") align 8 %g) #1 {
entry:
%AllocatedMem = alloca ptr addrspace(3), align 8
%AllocatedMem.ascast = addrspacecast ptr %AllocatedMem to ptr addrspace(4)
%call = call spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 4, i64 4)
store ptr addrspace(3) %call, ptr addrspace(4) %AllocatedMem.ascast, align 8
ret void
}

; CHECK-NOT: define {{.*}} @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(

; Function Attrs: alwaysinline
define spir_func void @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %agg.result, ptr byval(%"class.sycl::_V1::group") align 8 %g) #1 {
entry:
%AllocatedMem = alloca ptr addrspace(3), align 8
%AllocatedMem.ascast = addrspacecast ptr %AllocatedMem to ptr addrspace(4)
%call = call spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 4, i64 4)
store ptr addrspace(3) %call, ptr addrspace(4) %AllocatedMem.ascast, align 8
ret void
}

declare spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 noundef, i64 noundef)

attributes #0 = { alwaysinline }
attributes #1 = { "sycl-forceinline"="true" }
6 changes: 6 additions & 0 deletions sycl/include/sycl/ext/oneapi/group_local_memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ namespace sycl {
inline namespace _V1 {
namespace ext::oneapi {
template <typename T, typename Group>
#ifdef __SYCL_DEVICE_ONLY__
[[__sycl_detail__::add_ir_attributes_function("sycl-forceinline", true)]]
#endif
std::enable_if_t<
std::is_trivially_destructible_v<T> && sycl::detail::is_group<Group>::value,
multi_ptr<T, access::address_space::local_space, access::decorated::legacy>>
Expand All @@ -44,6 +47,9 @@ std::enable_if_t<
}

template <typename T, typename Group, typename... Args>
#ifdef __SYCL_DEVICE_ONLY__
[[__sycl_detail__::add_ir_attributes_function("sycl-forceinline", true)]]
#endif
std::enable_if_t<
std::is_trivially_destructible_v<T> && sycl::detail::is_group<Group>::value,
multi_ptr<T, access::address_space::local_space, access::decorated::legacy>>
Expand Down
6 changes: 5 additions & 1 deletion sycl/include/syclcompat/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,11 @@

namespace syclcompat {

template <typename AllocT> auto *local_mem() {
template <typename AllocT>
#ifdef __SYCL_DEVICE_ONLY__
[[__sycl_detail__::add_ir_attributes_function("sycl-forceinline", true)]]
#endif
__SYCL_ALWAYS_INLINE auto *local_mem() {
sycl::multi_ptr<AllocT, sycl::access::address_space::local_space>
As_multi_ptr =
sycl::ext::oneapi::group_local_memory_for_overwrite<AllocT>(
Expand Down
38 changes: 38 additions & 0 deletions sycl/test/check_device_code/extensions/group_local_memory.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -o - | FileCheck %s
// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -fno-sycl-early-optimizations -o - | FileCheck %s
// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -O0 -o - | FileCheck %s

// The test checks that multiple calls to the same template instantiation of a
// group local memory function result in separate allocations.

// CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4

#include <sycl/detail/core.hpp>
#include <sycl/ext/oneapi/group_local_memory.hpp>
#include <sycl/usm.hpp>

using namespace sycl;

int main() {
queue Q;

int **Out = malloc_shared<int *>(4, Q);

Q.submit([&](handler &Cgh) {
Cgh.parallel_for(nd_range<1>({1}, {1}), [=](nd_item<1> Item) {
auto Ptr0 =
ext::oneapi::group_local_memory_for_overwrite<int>(Item.get_group());
auto Ptr1 =
ext::oneapi::group_local_memory_for_overwrite<int>(Item.get_group());
auto Ptr2 = ext::oneapi::group_local_memory<int>(Item.get_group());
auto Ptr3 = ext::oneapi::group_local_memory<int>(Item.get_group());
Out[0] = Ptr0;
Out[1] = Ptr1;
Out[2] = Ptr2;
Out[3] = Ptr3;
});
});
}
27 changes: 27 additions & 0 deletions sycl/test/check_device_code/syclcompat_local_mem.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -o - | FileCheck %s

// The test checks that multiple calls to the same template instantiation of
// syclcompat local_mem function result in separate allocations.

// CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4
// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4

#include <sycl/detail/core.hpp>
#include <syclcompat/memory.hpp>

using namespace sycl;

int main() {
queue Q;

int **Out = malloc_shared<int *>(2, Q);

Q.submit([&](handler &Cgh) {
Cgh.parallel_for(nd_range<1>({1}, {1}), [=](nd_item<1> Item) {
auto Ptr0 = syclcompat::local_mem<int[1]>();
auto Ptr1 = syclcompat::local_mem<int[1]>();
Out[0] = Ptr0;
Out[1] = Ptr1;
});
});
}
Loading