From f142f8afe21bceb00fb495468aa0b5043e98c419 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 23 Aug 2024 14:43:31 +0100 Subject: [PATCH] [AMDGPU] Improve uniform argument handling in InstCombineIntrinsic (#105812) Common up handling of intrinsics that are a no-op on uniform arguments. This catches a couple of new cases: readlane (readlane x, y), z -> readlane x, y (for any z, does not have to equal y). permlane64 (readfirstlane x) -> readfirstlane x (and likewise for any other uniform argument to permlane64). --- .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 57 +++++++------------ .../InstCombine/AMDGPU/amdgcn-intrinsics.ll | 19 ++++++- 2 files changed, 36 insertions(+), 40 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 9197404309663a..4da3618357c420 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -440,6 +440,21 @@ static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) { SqrtOp->getType()->isHalfTy(); } +/// Return true if we can easily prove that use U is uniform. +static bool isTriviallyUniform(const Use &U) { + Value *V = U.get(); + if (isa<Constant>(V)) + return true; + if (const auto *II = dyn_cast<IntrinsicInst>(V)) { + if (!AMDGPU::isIntrinsicAlwaysUniform(II->getIntrinsicID())) + return false; + // If II and U are in different blocks then there is a possibility of + // temporal divergence. + return II->getParent() == cast<Instruction>(U.getUser())->getParent(); + } + return false; +} + std::optional<Instruction *> GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { Intrinsic::ID IID = II.getIntrinsicID(); @@ -1060,46 +1075,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType())); } case Intrinsic::amdgcn_permlane64: - // A constant value is trivially uniform. 
- if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) { - return IC.replaceInstUsesWith(II, C); - } - break; case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_readlane: { - // A constant value is trivially uniform. - if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) { - return IC.replaceInstUsesWith(II, C); - } - - // The rest of these may not be safe if the exec may not be the same between - // the def and use. - Value *Src = II.getArgOperand(0); - Instruction *SrcInst = dyn_cast<Instruction>(Src); - if (SrcInst && SrcInst->getParent() != II.getParent()) - break; - - // readfirstlane (readfirstlane x) -> readfirstlane x - // readlane (readfirstlane x), y -> readfirstlane x - if (match(Src, - PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) { - return IC.replaceInstUsesWith(II, Src); - } - - if (IID == Intrinsic::amdgcn_readfirstlane) { - // readfirstlane (readlane x, y) -> readlane x, y - if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) { - return IC.replaceInstUsesWith(II, Src); - } - } else { - // readlane (readlane x, y), y -> readlane x, y - if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>( - PatternMatch::m_Value(), - PatternMatch::m_Specific(II.getArgOperand(1))))) { - return IC.replaceInstUsesWith(II, Src); - } - } - + // If the first argument is uniform these intrinsics return it unchanged. 
+ const Use &Src = II.getArgOperandUse(0); + if (isTriviallyUniform(Src)) + return IC.replaceInstUsesWith(II, Src.get()); break; } case Intrinsic::amdgcn_trig_preop: { diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll index 9cb79b26448658..f3a3b8c1dc5d8a 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll @@ -2888,8 +2888,7 @@ define i32 @readlane_idempotent(i32 %arg, i32 %lane) { define i32 @readlane_idempotent_different_lanes(i32 %arg, i32 %lane0, i32 %lane1) { ; CHECK-LABEL: @readlane_idempotent_different_lanes( ; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG:%.*]], i32 [[LANE0:%.*]]) -; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[READ0]], i32 [[LANE1:%.*]]) -; CHECK-NEXT: ret i32 [[READ1]] +; CHECK-NEXT: ret i32 [[READ0]] ; %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane0) %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 %lane1) @@ -3061,6 +3060,22 @@ define amdgpu_kernel void @permlanex16_fetch_invalid_bound_ctrl(ptr addrspace(1) ret void } +; -------------------------------------------------------------------- +; llvm.amdgcn.permlane64 +; -------------------------------------------------------------------- + +define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src0) { +; CHECK-LABEL: @permlane64_uniform( +; CHECK-NEXT: [[SRC1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0:%.*]]) +; CHECK-NEXT: store i32 [[SRC1]], ptr addrspace(1) [[OUT:%.*]], align 4 +; CHECK-NEXT: ret void +; + %src1 = call i32 @llvm.amdgcn.readfirstlane(i32 %src0) + %res = call i32 @llvm.amdgcn.permlane64(i32 %src1) + store i32 %res, ptr addrspace(1) %out + ret void +} + ; -------------------------------------------------------------------- ; llvm.amdgcn.image.sample a16 ; 
--------------------------------------------------------------------