Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL] Materialize shadow local variables for byval arguments before use #2200

Merged
merged 1 commit into from
Jul 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 10 additions & 26 deletions llvm/lib/SYCLLowerIR/LowerWGScope.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -685,7 +685,7 @@ static void fixupPrivateMemoryPFWILambdaCaptures(CallInst *PFWICall) {

// Go through "byval" parameters which are passed as AS(0) pointers
// and: (1) create local shadows for them (2) and initialize them from the
// leader's copy and (3) replace usages with pointer to the shadow
// leader's copy and (3) materialize the value in the local variable before use
//
// Do the same for 'this' pointer which points to PFWG lamda object which is
// allocated in the caller. Caller is a kernel function which is generated by
Expand All @@ -707,7 +707,7 @@ static void sharePFWGPrivateObjects(Function &F, const Triple &TT) {
BasicBlock *LeaderBB = EntryBB->splitBasicBlock(SplitPoint, "leader");
BasicBlock *MergeBB = LeaderBB->splitBasicBlock(&LeaderBB->front(), "merge");

// 1) rewire the above basic blocks so that LeaderBB is executed only for the
// Rewire the above basic blocks so that LeaderBB is executed only for the
// leader workitem
guardBlockWithIsLeaderCheck(EntryBB, LeaderBB, MergeBB,
EntryBB->back().getDebugLoc(), TT);
Expand All @@ -719,50 +719,34 @@ static void sharePFWGPrivateObjects(Function &F, const Triple &TT) {
IRBuilder<> Builder(Ctx);
Builder.SetInsertPoint(&LeaderBB->front());

// 2) create the shared copy - "shadow" - for current arg
// Create the shared copy - "shadow" - for current arg
GlobalVariable *Shadow = nullptr;
Value *RepVal = nullptr;
if (Arg.hasByValAttr()) {
assert(Arg.getType()->getPointerAddressSpace() ==
asUInt(spirv::AddrSpace::Private));
T = Arg.getParamByValType();
Shadow = spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");
RepVal = Shadow;
if (TT.isNVPTX()) {
// For NVPTX target address space inference for kernel arguments and
// allocas is happening in the backend (NVPTXLowerArgs and
// NVPTXLowerAlloca passes). After the frontend these pointers are in
// LLVM default address space 0 which is the generic address space for
// NVPTX target.
assert(Arg.getType()->getPointerAddressSpace() == 0);

// Cast a pointer in the shared address space to the generic address
// space.
RepVal = ConstantExpr::getPointerBitCastOrAddrSpaceCast(Shadow,
Arg.getType());
}
}
// Process 'this' pointer which points to PFWG lambda object
else if (Arg.getArgNo() == 0) {
PointerType *PtrT = dyn_cast<PointerType>(Arg.getType());
assert(PtrT && "Expected this pointer as the first argument");
T = PtrT->getElementType();
Shadow = spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");
RepVal =
Builder.CreatePointerBitCastOrAddrSpaceCast(Shadow, Arg.getType());
}

if (!Shadow || !RepVal)
if (!Shadow)
continue;

// 3) replace argument with shadow in all uses
for (auto *U : Arg.users())
U->replaceUsesOfWith(&Arg, RepVal);

copyBetweenPrivateAndShadow(&Arg, Shadow, Builder,
true /*private->shadow*/);
// Materialize the value in the local variable before use
Builder.SetInsertPoint(&MergeBB->front());
copyBetweenPrivateAndShadow(&Arg, Shadow, Builder,
false /*shadow->private*/);
}
// 5) make sure workers use up-to-date shared values written by the leader
// Insert barrier to make sure workers use up-to-date shared values written by
// the leader
spirv::genWGBarrier(MergeBB->front(), TT);
}

Expand Down
2 changes: 2 additions & 0 deletions llvm/test/SYCLLowerIR/byval_arg.ll
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ define internal spir_func void @wibble(%struct.baz* byval(%struct.baz) %arg1) !w
; CHECK-NEXT: br label [[MERGE]]
; CHECK: merge:
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272)
; CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.baz* [[ARG1]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p3i8.i64(i8* [[TMP3]], i8 addrspace(3)* align 8 bitcast (%struct.baz addrspace(3)* @[[SHADOW]] to i8 addrspace(3)*), i64 8, i1 false)
; CHECK-NEXT: ret void
;
ret void
Expand Down
50 changes: 50 additions & 0 deletions llvm/test/SYCLLowerIR/byval_arg_cast.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -LowerWGScope -S | FileCheck %s

; Test to check that shadow local variable for byval argument is
; materialized before use. Otherwise invalid cast between address
; spaces is generated.

%struct.widget = type { %struct.baz, %struct.baz, %struct.baz, %struct.spam }
%struct.baz = type { %struct.snork }
%struct.snork = type { [1 x i64] }
%struct.spam = type { %struct.snork }


declare dso_local spir_func void @zot(i8*)

; CHECK: @[[SHADOW:[a-zA-Z0-9]+]] = internal unnamed_addr addrspace(3) global %struct.widget undef, align 16

; Function Attrs: inlinehint norecurse
define dso_local spir_func void @wombat(%struct.widget* byval(%struct.widget) align 8 %arg) align 2 !work_group_scope !1 {
; CHECK-LABEL: @wombat(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex, align 4
; CHECK-NEXT: [[CMPZ1:%.*]] = icmp eq i64 [[TMP0]], 0
; CHECK-NEXT: br i1 [[CMPZ1]], label [[LEADER:%.*]], label [[MERGE:%.*]]
; CHECK: leader:
; CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.widget* [[ARG:%.*]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 16 bitcast (%struct.widget addrspace(3)* @[[SHADOW]] to i8 addrspace(3)*), i8* align 8 [[TMP1]], i64 32, i1 false)
; CHECK-NEXT: br label [[MERGE]]
; CHECK: merge:
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #0
; CHECK-NEXT: [[TMP2:%.*]] = bitcast %struct.widget* [[ARG]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p3i8.i64(i8* align 8 [[TMP2]], i8 addrspace(3)* align 16 bitcast (%struct.widget addrspace(3)* @[[SHADOW]] to i8 addrspace(3)*), i64 32, i1 false)
; CHECK-NEXT: [[TMP3:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex, align 4
; CHECK-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[TMP3]], 0
; CHECK-NEXT: br i1 [[CMPZ]], label [[WG_LEADER:%.*]], label [[WG_CF:%.*]]
; CHECK: wg_leader:
; CHECK-NEXT: [[TMP:%.*]] = bitcast %struct.widget* [[ARG]] to i8*
; CHECK-NEXT: call void @zot(i8* [[TMP]])
; CHECK-NEXT: br label [[WG_CF]]
; CHECK: wg_cf:
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #0
; CHECK-NEXT: ret void
;
bb:
%tmp = bitcast %struct.widget* %arg to i8*
call void @zot(i8* %tmp)
ret void
}

!1 = !{}
5 changes: 4 additions & 1 deletion llvm/test/SYCLLowerIR/cast_shadow.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@ target triple = "nvptx64-nvidia-cuda-sycldevice"
define internal void @wobble(%struct.baz* %arg, %struct.spam* byval(%struct.spam) %arg1) !work_group_scope !0 {
; CHECK: [[TMP10:%.*]] = bitcast %struct.spam* [[ARG1:%.*]] to i8*
; CHECK: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 16 bitcast (%struct.spam addrspace(3)* @[[SHADOW]] to i8 addrspace(3)*), i8* [[TMP10]], i64 32, i1 false)
; CHECK: call void @widget(%struct.spam* addrspacecast (%struct.spam addrspace(3)* @[[SHADOW]] to %struct.spam*), %struct.quux* byval(%struct.quux) [[TMP2:%.*]])
; CHECK: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #0
; CHECK: [[TMP11:%.*]] = bitcast %struct.spam* %arg1 to i8*
; CHECK: call void @llvm.memcpy.p0i8.p3i8.i64(i8* [[TMP11:%.*]], i8 addrspace(3)* align 16 bitcast (%struct.spam addrspace(3)* @[[SHADOW]] to i8
; CHECK: call void @widget(%struct.spam* %arg1, %struct.quux* byval(%struct.quux) [[TMP2:%.*]])
;
bb:
%tmp = alloca %struct.baz*
Expand Down
12 changes: 8 additions & 4 deletions llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,17 @@ define internal spir_func void @wibble(%struct.bar addrspace(4)* %arg, %struct.z
; CHECK-NEXT: br label [[MERGE]]
; CHECK: merge:
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #0
; CHECK-NEXT: [[TMP3:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
; CHECK-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[TMP3]], 0
; CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.zot* [[ARG1]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p3i8.i64(i8* align 8 [[TMP3]], i8 addrspace(3)* align 16 bitcast (%struct.zot addrspace(3)* @[[GROUP_SHADOW]] to i8 addrspace(3)*), i64 96, i1 false)
; CHECK-NEXT: [[TMP4:%.*]] = bitcast [[STRUCT_BAR]] addrspace(4)* [[ARG]] to i8 addrspace(4)*
; CHECK-NEXT: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* align 8 [[TMP4]], i8 addrspace(3)* align 8 getelementptr inbounds (%struct.bar, [[STRUCT_BAR]] addrspace(3)* @[[PFWG_SHADOW]], i32 0, i32 0), i64 1, i1 false)
; CHECK-NEXT: [[TMP5:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
; CHECK-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[TMP5]], 0
; CHECK-NEXT: br i1 [[CMPZ]], label [[WG_LEADER:%.*]], label [[WG_CF:%.*]]
; CHECK: wg_leader:
; CHECK-NEXT: store [[STRUCT_BAR]] addrspace(4)* addrspacecast (%struct.bar addrspace(3)* @[[PFWG_SHADOW]] to [[STRUCT_BAR]] addrspace(4)*), [[STRUCT_BAR]] addrspace(4)** [[TMP]], align 8
; CHECK-NEXT: store [[STRUCT_BAR]] addrspace(4)* [[ARG]], [[STRUCT_BAR]] addrspace(4)** [[TMP]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_BAR]] addrspace(4)*, [[STRUCT_BAR]] addrspace(4)** [[TMP]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast [[STRUCT_ZOT:%.*]] addrspace(3)* @[[GROUP_SHADOW]] to [[STRUCT_ZOT]] addrspace(4)*
; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast %struct.zot* [[ARG1]] to [[STRUCT_ZOT:%.*]] addrspace(4)*
; CHECK-NEXT: store [[STRUCT_ZOT]] addrspace(4)* [[TMP4]], [[STRUCT_ZOT]] addrspace(4)* addrspace(3)* @[[GROUP_SHADOW_PTR]]
; CHECK-NEXT: br label [[WG_CF]]
; CHECK: wg_cf:
Expand Down