Skip to content

Commit

Permalink
[AMDGPU] Fix negative immediate offset for unbuffered smem loads (#89165
Browse files Browse the repository at this point in the history
)

For unbuffered smem loads, it is illegal for the immediate offset to be
negative if the resulting IOFFSET + (SGPR[Offset] or M0 or zero) is
negative.

New PR of #79553.
  • Loading branch information
vangthao95 authored Jun 24, 2024
1 parent 5413a2b commit 3aef525
Show file tree
Hide file tree
Showing 12 changed files with 357 additions and 104 deletions.
56 changes: 45 additions & 11 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2004,28 +2004,50 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
return true;
}

// For unbuffered smem loads, it is illegal for the Immediate Offset to be
// negative if the resulting (Offset + (M0 or SOffset or zero) is negative.
// Handle the case where the Immediate Offset + SOffset is negative.
bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
bool Imm32Only,
bool IsBuffer,
int64_t ImmOffset) const {
if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
return false;
}

return true;
}

// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
// not null) offset. If Imm32Only is true, match only 32-bit immediate
// offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
SDValue *SOffset, SDValue *Offset,
bool Imm32Only, bool IsBuffer) const {
bool Imm32Only, bool IsBuffer,
bool HasSOffset,
int64_t ImmOffset) const {
assert((!SOffset || !Offset) &&
"Cannot match both soffset and offset at the same time!");

ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
if (!C) {
if (!SOffset)
return false;

if (ByteOffsetNode.getValueType().isScalarInteger() &&
ByteOffsetNode.getValueType().getSizeInBits() == 32) {
*SOffset = ByteOffsetNode;
return true;
return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
ImmOffset);
}
if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
*SOffset = ByteOffsetNode.getOperand(0);
return true;
return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
ImmOffset);
}
}
return false;
Expand All @@ -2036,8 +2058,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
// GFX9 and GFX10 have signed byte immediate offsets. The immediate
// offset for S_BUFFER instructions is unsigned.
int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
std::optional<int64_t> EncodedOffset =
AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, IsBuffer);
std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
*Subtarget, ByteOffset, IsBuffer, HasSOffset);
if (EncodedOffset && Offset && !Imm32Only) {
*Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
return true;
Expand Down Expand Up @@ -2096,13 +2118,22 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
// true, match only 32-bit immediate offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
SDValue *SOffset, SDValue *Offset,
bool Imm32Only,
bool IsBuffer) const {
bool Imm32Only, bool IsBuffer,
bool HasSOffset,
int64_t ImmOffset) const {
if (SOffset && Offset) {
assert(!Imm32Only && !IsBuffer);
SDValue B;
return SelectSMRDBaseOffset(Addr, B, nullptr, Offset) &&
SelectSMRDBaseOffset(B, SBase, SOffset, nullptr);

if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
return false;

int64_t ImmOff = 0;
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
ImmOff = C->getSExtValue();

return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
ImmOff);
}

// A 32-bit (address + offset) should not cause unsigned 32-bit integer
Expand All @@ -2121,11 +2152,14 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
}
if (!N0 || !N1)
return false;
if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer)) {

if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
ImmOffset)) {
SBase = N0;
return true;
}
if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer)) {
if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
ImmOffset)) {
SBase = N1;
return true;
}
Expand Down
10 changes: 8 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool isFlatScratchBaseLegal(SDValue Addr) const;
bool isFlatScratchBaseLegalSV(SDValue Addr) const;
bool isFlatScratchBaseLegalSVImm(SDValue Addr) const;
bool isSOffsetLegalWithImmOffset(SDValue *SOffset, bool Imm32Only,
bool IsBuffer, int64_t ImmOffset = 0) const;

bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
Expand Down Expand Up @@ -174,11 +176,13 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {

bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset,
SDValue *Offset, bool Imm32Only = false,
bool IsBuffer = false) const;
bool IsBuffer = false, bool HasSOffset = false,
int64_t ImmOffset = 0) const;
SDValue Expand32BitAddress(SDValue Addr) const;
bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset,
SDValue *Offset, bool Imm32Only = false,
bool IsBuffer = false) const;
bool IsBuffer = false, bool HasSOffset = false,
int64_t ImmOffset = 0) const;
bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset,
SDValue *Offset, bool Imm32Only = false) const;
bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
Expand All @@ -190,6 +194,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const;
bool SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
SDValue &Offset) const;
bool SelectSMRDPrefetchImm(SDValue Addr, SDValue &SBase,
SDValue &Offset) const;
bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;

bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods,
Expand Down
18 changes: 16 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4216,10 +4216,11 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
return false;

const GEPInfo &GEPI = AddrInfo[0];
std::optional<int64_t> EncodedImm =
AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, false);
std::optional<int64_t> EncodedImm;

if (SOffset && Offset) {
EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
/*HasSOffset=*/true);
if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
AddrInfo.size() > 1) {
const GEPInfo &GEPI2 = AddrInfo[1];
Expand All @@ -4229,13 +4230,26 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
Base = GEPI2.SgprParts[0];
*SOffset = OffsetReg;
*Offset = *EncodedImm;
if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
return true;

// For unbuffered smem loads, it is illegal for the Immediate Offset
// to be negative if the resulting (Offset + (M0 or SOffset or zero)
// is negative. Handle the case where the Immediate Offset + SOffset
// is negative.
auto SKnown = KB->getKnownBits(*SOffset);
if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
return false;

return true;
}
}
}
return false;
}

EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
/*HasSOffset=*/false);
if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
Base = GEPI.SgprParts[0];
*Offset = *EncodedImm;
Expand Down
19 changes: 14 additions & 5 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,12 @@ namespace llvm {

namespace AMDGPU {

/// \returns true if the target supports signed immediate offset for SMRD
/// instructions.
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) {
return isGFX9Plus(ST);
}

/// \returns True if \p STI is AMDHSA.
bool isHsaAbi(const MCSubtargetInfo &STI) {
return STI.getTargetTriple().getOS() == Triple::AMDHSA;
Expand Down Expand Up @@ -2819,10 +2825,6 @@ static bool hasSMEMByteOffset(const MCSubtargetInfo &ST) {
return isGCN3Encoding(ST) || isGFX10Plus(ST);
}

static bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) {
return isGFX9Plus(ST);
}

bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
int64_t EncodedOffset) {
if (isGFX12Plus(ST))
Expand Down Expand Up @@ -2857,7 +2859,14 @@ uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST,
}

std::optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST,
int64_t ByteOffset, bool IsBuffer) {
int64_t ByteOffset, bool IsBuffer,
bool HasSOffset) {
// For unbuffered smem loads, it is illegal for the Immediate Offset to be
// negative if the resulting (Offset + (M0 or SOffset or zero) is negative.
// Handle case where SOffset is not present.
if (!IsBuffer && !HasSOffset && ByteOffset < 0 && hasSMRDSignedImmOffset(ST))
return std::nullopt;

if (isGFX12Plus(ST)) // 24 bit signed offsets
return isInt<24>(ByteOffset) ? std::optional<int64_t>(ByteOffset)
: std::nullopt;
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1307,6 +1307,7 @@ bool hasVOPD(const MCSubtargetInfo &STI);
bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI);
int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR);
unsigned hasKernargPreload(const MCSubtargetInfo &STI);
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST);

/// Is Reg - scalar register
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
Expand Down Expand Up @@ -1479,7 +1480,8 @@ uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, uint64_t ByteOffset);
/// S_LOAD instructions have a signed offset, on other subtargets it is
/// unsigned. S_BUFFER has an unsigned offset for all subtargets.
std::optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST,
int64_t ByteOffset, bool IsBuffer);
int64_t ByteOffset, bool IsBuffer,
bool HasSOffset = false);

/// \return The encoding that can be used for a 32-bit literal offset in an SMRD
/// instruction. This is only useful on CI.s
Expand Down
20 changes: 18 additions & 2 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
Original file line number Diff line number Diff line change
Expand Up @@ -1234,7 +1234,15 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], -1, 0 :: (load (s32), addrspace 4)
; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4)
; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(s64) = G_CONSTANT i64 -1
Expand Down Expand Up @@ -1304,7 +1312,15 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], -524288, 0 :: (load (s32), addrspace 4)
; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -524288
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4)
; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(s64) = G_CONSTANT i64 -524288
Expand Down
6 changes: 4 additions & 2 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,13 @@ entry:
ret void
}

; GFX9_10 can use a signed immediate byte offset
; GFX9+ can use a signed immediate byte offset but not without sgpr[offset]
; GCN-LABEL: {{^}}smrd6:
; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4
; SICIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0
; GFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], -0x4
; GFX9_10: s_add_u32 s2, s2, -4
; GFX9_10: s_addc_u32 s3, s3, -1
; GFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0
define amdgpu_kernel void @smrd6(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
entry:
%tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 -1
Expand Down
12 changes: 9 additions & 3 deletions llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll
Original file line number Diff line number Diff line change
Expand Up @@ -297,20 +297,26 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr,
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: .LBB5_1: ; %loop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s3, s[0:1], -0x190
; GFX9-NEXT: s_add_i32 s2, s2, -1
; GFX9-NEXT: s_add_u32 s4, s0, 0xfffffe70
; GFX9-NEXT: s_addc_u32 s5, s1, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %end
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: test_sink_smem_offset_neg400:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_movk_i32 s4, 0xfe70
; GFX12-NEXT: s_mov_b32 s5, -1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: .LBB5_1: ; %loop
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b32 s3, s[0:1], -0x190
; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x0
; GFX12-NEXT: s_add_co_i32 s2, s2, -1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
Expand Down
Loading

0 comments on commit 3aef525

Please sign in to comment.