-
Notifications
You must be signed in to change notification settings - Fork 752
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SYCL][SYCLLowerWGLocalMemoryPass] Remove implicit dependency on AlwaysInlinerPass and move to PipelineStart #16356
base: sycl
Are you sure you want to change the base?
Changes from all commits
e6ce584
51b1ec9
a4f8382
b42fd22
44db66a
97add86
a4fe915
d764d00
a765d79
0de9070
db2ad4a
442fe98
275d60d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,15 +9,17 @@ | |
//===----------------------------------------------------------------------===// | ||
|
||
#include "llvm/SYCLLowerIR/LowerWGLocalMemory.h" | ||
#include "llvm/ADT/DenseSet.h" | ||
#include "llvm/IR/Function.h" | ||
#include "llvm/IR/IRBuilder.h" | ||
#include "llvm/IR/InstIterator.h" | ||
#include "llvm/Pass.h" | ||
#include "llvm/TargetParser/Triple.h" | ||
#include "llvm/Transforms/Utils/Cloning.h" | ||
|
||
using namespace llvm; | ||
|
||
#define DEBUG_TYPE "LowerWGLocalMemory" | ||
#define DEBUG_TYPE "sycllowerwglocalmemory" | ||
|
||
static constexpr char SYCL_ALLOCLOCALMEM_CALL[] = "__sycl_allocateLocalMemory"; | ||
static constexpr char SYCL_DYNAMIC_LOCALMEM_CALL[] = | ||
|
@@ -84,6 +86,42 @@ ModulePass *llvm::createSYCLLowerWGLocalMemoryLegacyPass() { | |
return new SYCLLowerWGLocalMemoryLegacy(); | ||
} | ||
|
||
// In sycl header __sycl_allocateLocalMemory builtin call is wrapped in | ||
// group_local_memory/group_local_memory_for_overwrite functions, which must be | ||
// inlined first before each __sycl_allocateLocalMemory call can be lowered to a | ||
// distinct global variable. Inlining them here so that this pass doesn't have | ||
// implicit dependency on AlwaysInlinerPass. | ||
Comment on lines
+92
to
+93
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So, instead of having two building blocks (i.e. passes) where each does a specific thing, we now have one of them doing both things. This doesn't sound good from the high-level design point of view. I understand that having an implicit dependency is probably not a good thing, but are there any reasons to remove the dependency completely? Because many passes have dependencies on each other and there are mechanism to explicitly tell the pass manager about them: The There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I agree, but I think it is good to making this pass self-contained.
This would inline other functions that are not related to what we handle in this pass. |
||
// | ||
// syclcompat::local_mem, which represents a distinct allocation, calls | ||
// group_local_memory_for_overwrite. So local_mem should be inlined as well. | ||
static bool inlineGroupLocalMemoryFunc(Module &M) { | ||
Function *ALMFunc = M.getFunction(SYCL_ALLOCLOCALMEM_CALL); | ||
if (!ALMFunc || ALMFunc->use_empty()) | ||
return false; | ||
|
||
SmallVector<Function *, 4> WorkList{ALMFunc}; | ||
DenseSet<Function *> Visited; | ||
while (!WorkList.empty()) { | ||
auto *F = WorkList.pop_back_val(); | ||
for (auto *U : make_early_inc_range(F->users())) { | ||
auto *CI = cast<CallInst>(U); | ||
auto *Caller = CI->getFunction(); | ||
if (Caller->hasFnAttribute("sycl-forceinline") && | ||
Visited.insert(Caller).second) | ||
WorkList.push_back(Caller); | ||
if (F != ALMFunc) { | ||
InlineFunctionInfo IFI; | ||
[[maybe_unused]] auto Result = InlineFunction(*CI, IFI); | ||
assert(Result.isSuccess() && "inlining failed"); | ||
} | ||
} | ||
if (F != ALMFunc) | ||
F->eraseFromParent(); | ||
} | ||
|
||
return !Visited.empty(); | ||
} | ||
|
||
// TODO: It should be checked that __sycl_allocateLocalMemory (or its source | ||
// form - group_local_memory) does not occur: | ||
// - in a function (other than user lambda/functor) | ||
|
@@ -317,9 +355,8 @@ static bool dynamicWGLocalMemory(Module &M) { | |
|
||
PreservedAnalyses SYCLLowerWGLocalMemoryPass::run(Module &M, | ||
ModuleAnalysisManager &) { | ||
bool MadeChanges = allocaWGLocalMemory(M); | ||
MadeChanges = dynamicWGLocalMemory(M) || MadeChanges; | ||
if (MadeChanges) | ||
return PreservedAnalyses::none(); | ||
return PreservedAnalyses::all(); | ||
bool Changed = inlineGroupLocalMemoryFunc(M); | ||
Changed |= allocaWGLocalMemory(M); | ||
Changed |= dynamicWGLocalMemory(M); | ||
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); | ||
jsji marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
; RUN: opt < %s -passes=sycllowerwglocalmemory -S | FileCheck %s | ||
|
||
; Check group_local_memory_for_overwrite and group_local_memory functions are inlined. | ||
; Check __sycl_allocateLocalMemory calls are lowered to four separate allocations. | ||
|
||
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" | ||
target triple = "spir64-unknown-unknown" | ||
|
||
%"class.sycl::_V1::multi_ptr" = type { ptr addrspace(3) } | ||
%"class.sycl::_V1::group" = type { %"class.sycl::_V1::range", %"class.sycl::_V1::range", %"class.sycl::_V1::range", %"class.sycl::_V1::id" } | ||
%"class.sycl::_V1::range" = type { %"class.sycl::_V1::detail::array" } | ||
%"class.sycl::_V1::detail::array" = type { [1 x i64] } | ||
%"class.sycl::_V1::id" = type { %"class.sycl::_V1::detail::array" } | ||
|
||
; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 | ||
; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 | ||
; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 | ||
; CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 | ||
|
||
; Function Attrs: alwaysinline | ||
define internal spir_func void @_ZZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_ENKUlNS0_7nd_itemILi1EEEE_clES5_() #0 { | ||
entry: | ||
; CHECK: define internal spir_func void @_ZZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_ENKUlNS0_7nd_itemILi1EEEE_clES5_( | ||
; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8 | ||
; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8 | ||
; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8 | ||
; CHECK: store ptr addrspace(3) @WGLocalMem{{.*}}, ptr addrspace(4) %AllocatedMem{{.*}}, align 8 | ||
|
||
%Ptr = alloca %"class.sycl::_V1::multi_ptr", align 8 | ||
%agg = alloca %"class.sycl::_V1::group", align 8 | ||
%Ptr.ascast = addrspacecast ptr %Ptr to ptr addrspace(4) | ||
call spir_func void @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg) | ||
call spir_func void @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg) | ||
call spir_func void @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg) | ||
call spir_func void @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %Ptr.ascast, ptr byval(%"class.sycl::_V1::group") align 8 %agg) | ||
ret void | ||
} | ||
|
||
; CHECK-NOT: define {{.*}} @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_( | ||
|
||
; Function Attrs: alwaysinline | ||
define spir_func void @_ZN4sycl3_V13ext6oneapi32group_local_memory_for_overwriteIiNS0_5groupILi1EEEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %agg.result, ptr byval(%"class.sycl::_V1::group") align 8 %g) #1 { | ||
entry: | ||
%AllocatedMem = alloca ptr addrspace(3), align 8 | ||
%AllocatedMem.ascast = addrspacecast ptr %AllocatedMem to ptr addrspace(4) | ||
%call = call spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 4, i64 4) | ||
store ptr addrspace(3) %call, ptr addrspace(4) %AllocatedMem.ascast, align 8 | ||
ret void | ||
} | ||
|
||
; CHECK-NOT: define {{.*}} @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_( | ||
|
||
; Function Attrs: alwaysinline | ||
define spir_func void @_ZN4sycl3_V13ext6oneapi18group_local_memoryIiNS0_5groupILi1EEEJEEENSt9enable_ifIXaasr3stdE27is_trivially_destructible_vIT_Esr4sycl6detail8is_groupIT0_EE5valueENS0_9multi_ptrIS7_LNS0_6access13address_spaceE3ELNSA_9decoratedE2EEEE4typeES8_DpOT1_(ptr addrspace(4) sret(%"class.sycl::_V1::multi_ptr") align 8 %agg.result, ptr byval(%"class.sycl::_V1::group") align 8 %g) #1 { | ||
entry: | ||
%AllocatedMem = alloca ptr addrspace(3), align 8 | ||
%AllocatedMem.ascast = addrspacecast ptr %AllocatedMem to ptr addrspace(4) | ||
%call = call spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 4, i64 4) | ||
store ptr addrspace(3) %call, ptr addrspace(4) %AllocatedMem.ascast, align 8 | ||
ret void | ||
} | ||
|
||
declare spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 noundef, i64 noundef) | ||
|
||
attributes #0 = { alwaysinline } | ||
attributes #1 = { "sycl-forceinline"="true" } |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -o - | FileCheck %s | ||
// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -fno-sycl-early-optimizations -o - | FileCheck %s | ||
// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -O0 -o - | FileCheck %s | ||
|
||
// The test checks that multiple calls to the same template instantiation of a | ||
// group local memory function result in separate allocations. | ||
|
||
// CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 | ||
// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 | ||
// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 | ||
// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 | ||
|
||
#include <sycl/detail/core.hpp> | ||
#include <sycl/ext/oneapi/group_local_memory.hpp> | ||
#include <sycl/usm.hpp> | ||
|
||
using namespace sycl; | ||
|
||
int main() { | ||
queue Q; | ||
|
||
int **Out = malloc_shared<int *>(4, Q); | ||
|
||
Q.submit([&](handler &Cgh) { | ||
Cgh.parallel_for(nd_range<1>({1}, {1}), [=](nd_item<1> Item) { | ||
auto Ptr0 = | ||
ext::oneapi::group_local_memory_for_overwrite<int>(Item.get_group()); | ||
auto Ptr1 = | ||
ext::oneapi::group_local_memory_for_overwrite<int>(Item.get_group()); | ||
auto Ptr2 = ext::oneapi::group_local_memory<int>(Item.get_group()); | ||
auto Ptr3 = ext::oneapi::group_local_memory<int>(Item.get_group()); | ||
Out[0] = Ptr0; | ||
Out[1] = Ptr1; | ||
Out[2] = Ptr2; | ||
Out[3] = Ptr3; | ||
}); | ||
}); | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -o - | FileCheck %s | ||
|
||
// The test checks that multiple calls to the same template instantiation of | ||
// syclcompat local_mem function result in separate allocations. | ||
|
||
// CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 | ||
// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] undef, align 4 | ||
|
||
#include <sycl/detail/core.hpp> | ||
#include <syclcompat/memory.hpp> | ||
|
||
using namespace sycl; | ||
|
||
int main() { | ||
queue Q; | ||
|
||
int **Out = malloc_shared<int *>(2, Q); | ||
|
||
Q.submit([&](handler &Cgh) { | ||
Cgh.parallel_for(nd_range<1>({1}, {1}), [=](nd_item<1> Item) { | ||
auto Ptr0 = syclcompat::local_mem<int[1]>(); | ||
auto Ptr1 = syclcompat::local_mem<int[1]>(); | ||
Out[0] = Ptr0; | ||
Out[1] = Ptr1; | ||
}); | ||
}); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why can we not rewrite the SYCL headers to 'inline' these calls? Is there a specific reason? Thanks
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can't ask users to call
__sycl_allocateLocalMemory
internal intrsinsic when documented interface issycl::ext::something::something::group_local_memory<T>