Skip to content

Commit

Permalink
[AMDGPU] Improve uniform argument handling in InstCombineIntrinsic (#…
Browse files Browse the repository at this point in the history
…105812)

Common up handling of intrinsics that are a no-op on uniform arguments.
This catches a couple of new cases:

readlane (readlane x, y), z -> readlane x, y
(for any z, does not have to equal y).

permlane64 (readfirstlane x) -> readfirstlane x
(and likewise for any other uniform argument to permlane64).
  • Loading branch information
jayfoad authored Aug 23, 2024
1 parent c9b6339 commit f142f8a
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 40 deletions.
57 changes: 19 additions & 38 deletions llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,21 @@ static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
SqrtOp->getType()->isHalfTy();
}

/// Return true if we can easily prove that use U is uniform.
static bool isTriviallyUniform(const Use &U) {
Value *V = U.get();
if (isa<Constant>(V))
return true;
if (const auto *II = dyn_cast<IntrinsicInst>(V)) {
if (!AMDGPU::isIntrinsicAlwaysUniform(II->getIntrinsicID()))
return false;
// If II and U are in different blocks then there is a possibility of
// temporal divergence.
return II->getParent() == cast<Instruction>(U.getUser())->getParent();
}
return false;
}

std::optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
Intrinsic::ID IID = II.getIntrinsicID();
Expand Down Expand Up @@ -1060,46 +1075,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
}
case Intrinsic::amdgcn_permlane64:
// A constant value is trivially uniform.
if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
return IC.replaceInstUsesWith(II, C);
}
break;
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane: {
// A constant value is trivially uniform.
if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
return IC.replaceInstUsesWith(II, C);
}

// The rest of these may not be safe if the exec may not be the same between
// the def and use.
Value *Src = II.getArgOperand(0);
Instruction *SrcInst = dyn_cast<Instruction>(Src);
if (SrcInst && SrcInst->getParent() != II.getParent())
break;

// readfirstlane (readfirstlane x) -> readfirstlane x
// readlane (readfirstlane x), y -> readfirstlane x
if (match(Src,
PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
return IC.replaceInstUsesWith(II, Src);
}

if (IID == Intrinsic::amdgcn_readfirstlane) {
// readfirstlane (readlane x, y) -> readlane x, y
if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
return IC.replaceInstUsesWith(II, Src);
}
} else {
// readlane (readlane x, y), y -> readlane x, y
if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
PatternMatch::m_Value(),
PatternMatch::m_Specific(II.getArgOperand(1))))) {
return IC.replaceInstUsesWith(II, Src);
}
}

// If the first argument is uniform these intrinsics return it unchanged.
const Use &Src = II.getArgOperandUse(0);
if (isTriviallyUniform(Src))
return IC.replaceInstUsesWith(II, Src.get());
break;
}
case Intrinsic::amdgcn_trig_preop: {
Expand Down
19 changes: 17 additions & 2 deletions llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2888,8 +2888,7 @@ define i32 @readlane_idempotent(i32 %arg, i32 %lane) {
define i32 @readlane_idempotent_different_lanes(i32 %arg, i32 %lane0, i32 %lane1) {
; CHECK-LABEL: @readlane_idempotent_different_lanes(
; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 [[LANE1:%.*]])
; CHECK-NEXT: ret i32 [[READ1]]
; CHECK-NEXT: ret i32 [[READ0]]
;
%read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane0)
%read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 %lane1)
Expand Down Expand Up @@ -3061,6 +3060,22 @@ define amdgpu_kernel void @permlanex16_fetch_invalid_bound_ctrl(ptr addrspace(1)
ret void
}

; --------------------------------------------------------------------
; llvm.amdgcn.permlane64
; --------------------------------------------------------------------

define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src0) {
; CHECK-LABEL: @permlane64_uniform(
; CHECK-NEXT: [[SRC1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0:%.*]])
; CHECK-NEXT: store i32 [[SRC1]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
%src1 = call i32 @llvm.amdgcn.readfirstlane(i32 %src0)
%res = call i32 @llvm.amdgcn.permlane64(i32 %src1)
store i32 %res, ptr addrspace(1) %out
ret void
}

; --------------------------------------------------------------------
; llvm.amdgcn.image.sample a16
; --------------------------------------------------------------------
Expand Down

0 comments on commit f142f8a

Please sign in to comment.