-
Notifications
You must be signed in to change notification settings - Fork 12.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Handle amdgpu.last.use metadata #83816
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Mirko Brkušanin (mbrkusanin) Changes: Convert !amdgpu.last.use metadata into Machine Memory Operand for last use. Patch is 1.85 MiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/83816.diff 28 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0df0b5cdf0f392..72c79cc58f8ea9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16181,9 +16181,12 @@ bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
MachineMemOperand::Flags
SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
// Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
+ MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
if (I.getMetadata("amdgpu.noclobber"))
- return MONoClobber;
- return MachineMemOperand::MONone;
+ Flags |= MONoClobber;
+ if (I.getMetadata("amdgpu.last.use"))
+ Flags |= MOLastUse;
+ return Flags;
}
bool SITargetLowering::checkForPhysRegDependency(
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 4069a368f68719..ff3e6bc9f8b08f 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -98,6 +98,7 @@ class SIMemOpInfo final {
bool IsCrossAddressSpaceOrdering = false;
bool IsVolatile = false;
bool IsNonTemporal = false;
+ bool IsLastUse = false;
SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
SIAtomicScope Scope = SIAtomicScope::SYSTEM,
@@ -107,13 +108,15 @@ class SIMemOpInfo final {
AtomicOrdering FailureOrdering =
AtomicOrdering::SequentiallyConsistent,
bool IsVolatile = false,
- bool IsNonTemporal = false)
+ bool IsNonTemporal = false,
+ bool IsLastUse = false)
: Ordering(Ordering), FailureOrdering(FailureOrdering),
Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
InstrAddrSpace(InstrAddrSpace),
IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
IsVolatile(IsVolatile),
- IsNonTemporal(IsNonTemporal) {
+ IsNonTemporal(IsNonTemporal),
+ IsLastUse(IsLastUse) {
if (Ordering == AtomicOrdering::NotAtomic) {
assert(Scope == SIAtomicScope::NONE &&
@@ -201,6 +204,12 @@ class SIMemOpInfo final {
return IsNonTemporal;
}
+ /// \returns True if memory access of the machine instruction used to
+ /// create this SIMemOpInfo is last use, false otherwise.
+ bool isLastUse() const {
+ return IsLastUse;
+ }
+
/// \returns True if ordering constraint of the machine instruction used to
/// create this SIMemOpInfo is unordered or higher, false otherwise.
bool isAtomic() const {
@@ -316,6 +325,12 @@ class SICacheControl {
return false;
};
+ /// Update \p MI memory instruction to indicate it is a last use. Return true
+ /// iff the instruction was modified.
+ virtual bool enableLastUse(MachineInstr &MI, bool IsLastUse) const {
+ return false;
+ }
+
/// Inserts any necessary instructions at position \p Pos relative
/// to instruction \p MI to ensure memory instructions before \p Pos of kind
/// \p Op associated with address spaces \p AddrSpace have completed. Used
@@ -592,6 +607,10 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
// MI. \returns Returns true if \p MI is modified, false otherwise.
bool setScope(const MachineBasicBlock::iterator MI,
AMDGPU::CPol::CPol Value) const;
+ // Checks if CPol operand is present in instruction \p MI and if current Scope
+ // policy is same as \p Value.
+ bool isScope(const MachineBasicBlock::iterator MI,
+ AMDGPU::CPol::CPol Value) const;
// Stores with system scope (SCOPE_SYS) need to wait for:
// - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
@@ -618,6 +637,9 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
bool IsNonTemporal) const override;
bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
+
+ bool enableLastUse(MachineInstr &MI,
+ bool IsLastUse) const override;
};
class SIMemoryLegalizer final : public MachineFunctionPass {
@@ -745,12 +767,14 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
bool IsNonTemporal = true;
bool IsVolatile = false;
+ bool IsLastUse = false;
// Validator should check whether or not MMOs cover the entire set of
// locations accessed by the memory instruction.
for (const auto &MMO : MI->memoperands()) {
IsNonTemporal &= MMO->isNonTemporal();
IsVolatile |= MMO->isVolatile();
+ IsLastUse |= MMO->getFlags() & MOLastUse;
InstrAddrSpace |=
toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
@@ -792,7 +816,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
}
return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
- IsNonTemporal);
+ IsNonTemporal, IsLastUse);
}
std::optional<SIMemOpInfo>
@@ -2209,6 +2233,15 @@ bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
return false;
}
+bool SIGfx12CacheControl::isScope(const MachineBasicBlock::iterator MI,
+ AMDGPU::CPol::CPol Value) const {
+ MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
+ if (!CPol)
+ return false;
+
+ return (CPol->getImm() & AMDGPU::CPol::SCOPE) == Value;
+}
+
bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
const MachineBasicBlock::iterator MI) const {
// TODO: implement flag for frontend to give us a hint not to insert waits.
@@ -2392,6 +2425,11 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
bool Changed = false;
+ if (IsNonTemporal) {
+ // Set non-temporal hint for all cache levels.
+ Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
+ }
+
if (IsVolatile) {
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
@@ -2407,14 +2445,19 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
Position::AFTER);
}
- if (IsNonTemporal) {
- // Set non-temporal hint for all cache levels.
- Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
- }
-
return Changed;
}
+bool SIGfx12CacheControl::enableLastUse(MachineInstr &MI,
+ bool IsLastUse) const {
+ assert(MI.mayLoad() && !MI.mayStore());
+
+ if (IsLastUse && !isScope(MI, AMDGPU::CPol::SCOPE_SYS))
+ return setTH(MI, AMDGPU::CPol::TH_LU);;
+
+ return false;
+}
+
bool SIGfx12CacheControl::expandSystemScopeStore(
MachineBasicBlock::iterator &MI) const {
MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
@@ -2471,12 +2514,19 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
return Changed;
}
+ // enableVolatileAndOrNonTemporal can insert instructions and advance iterator
+ // MI and we need original instruction for enabling last use.
+ MachineInstr &Inst = *MI;
+
// Atomic instructions already bypass caches to the scope specified by the
// SyncScope operand. Only non-atomic volatile and nontemporal instructions
// need additional treatment.
Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
SIMemOp::LOAD, MOI.isVolatile(),
MOI.isNonTemporal());
+
+ Changed |= CC->enableLastUse(Inst, MOI.isLastUse());
+
return Changed;
}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
index 77962fadcacfc6..e13542f61474e2 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
@@ -10,6 +10,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
define amdgpu_kernel void @singlethread_acquire_fence() {
; GFX6-LABEL: singlethread_acquire_fence:
@@ -55,6 +57,14 @@ define amdgpu_kernel void @singlethread_acquire_fence() {
; GFX11-CU-LABEL: singlethread_acquire_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: singlethread_acquire_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: singlethread_acquire_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread") acquire
ret void
@@ -104,6 +114,14 @@ define amdgpu_kernel void @singlethread_release_fence() {
; GFX11-CU-LABEL: singlethread_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: singlethread_release_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: singlethread_release_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread") release
ret void
@@ -153,6 +171,14 @@ define amdgpu_kernel void @singlethread_acq_rel_fence() {
; GFX11-CU-LABEL: singlethread_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: singlethread_acq_rel_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: singlethread_acq_rel_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread") acq_rel
ret void
@@ -202,6 +228,14 @@ define amdgpu_kernel void @singlethread_seq_cst_fence() {
; GFX11-CU-LABEL: singlethread_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: singlethread_seq_cst_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: singlethread_seq_cst_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread") seq_cst
ret void
@@ -251,6 +285,14 @@ define amdgpu_kernel void @singlethread_one_as_acquire_fence() {
; GFX11-CU-LABEL: singlethread_one_as_acquire_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: singlethread_one_as_acquire_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: singlethread_one_as_acquire_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread-one-as") acquire
ret void
@@ -300,6 +342,14 @@ define amdgpu_kernel void @singlethread_one_as_release_fence() {
; GFX11-CU-LABEL: singlethread_one_as_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: singlethread_one_as_release_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: singlethread_one_as_release_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread-one-as") release
ret void
@@ -349,6 +399,14 @@ define amdgpu_kernel void @singlethread_one_as_acq_rel_fence() {
; GFX11-CU-LABEL: singlethread_one_as_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: singlethread_one_as_acq_rel_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: singlethread_one_as_acq_rel_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread-one-as") acq_rel
ret void
@@ -398,6 +456,14 @@ define amdgpu_kernel void @singlethread_one_as_seq_cst_fence() {
; GFX11-CU-LABEL: singlethread_one_as_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: singlethread_one_as_seq_cst_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: singlethread_one_as_seq_cst_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread-one-as") seq_cst
ret void
@@ -447,6 +513,14 @@ define amdgpu_kernel void @wavefront_acquire_fence() {
; GFX11-CU-LABEL: wavefront_acquire_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: wavefront_acquire_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: wavefront_acquire_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront") acquire
ret void
@@ -496,6 +570,14 @@ define amdgpu_kernel void @wavefront_release_fence() {
; GFX11-CU-LABEL: wavefront_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: wavefront_release_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: wavefront_release_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront") release
ret void
@@ -545,6 +627,14 @@ define amdgpu_kernel void @wavefront_acq_rel_fence() {
; GFX11-CU-LABEL: wavefront_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: wavefront_acq_rel_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: wavefront_acq_rel_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront") acq_rel
ret void
@@ -594,6 +684,14 @@ define amdgpu_kernel void @wavefront_seq_cst_fence() {
; GFX11-CU-LABEL: wavefront_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: wavefront_seq_cst_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: wavefront_seq_cst_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront") seq_cst
ret void
@@ -643,6 +741,14 @@ define amdgpu_kernel void @wavefront_one_as_acquire_fence() {
; GFX11-CU-LABEL: wavefront_one_as_acquire_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: wavefront_one_as_acquire_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: wavefront_one_as_acquire_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront-one-as") acquire
ret void
@@ -692,6 +798,14 @@ define amdgpu_kernel void @wavefront_one_as_release_fence() {
; GFX11-CU-LABEL: wavefront_one_as_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: wavefront_one_as_release_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: wavefront_one_as_release_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront-one-as") release
ret void
@@ -741,6 +855,14 @@ define amdgpu_kernel void @wavefront_one_as_acq_rel_fence() {
; GFX11-CU-LABEL: wavefront_one_as_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: wavefront_one_as_acq_rel_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: wavefront_one_as_acq_rel_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront-one-as") acq_rel
ret void
@@ -790,6 +912,14 @@ define amdgpu_kernel void @wavefront_one_as_seq_cst_fence() {
; GFX11-CU-LABEL: wavefront_one_as_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: wavefront_one_as_seq_cst_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: wavefront_one_as_seq_cst_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront-one-as") seq_cst
ret void
@@ -843,6 +973,15 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
; GFX11-CU-LABEL: workgroup_acquire_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: workgroup_acquire_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: workgroup_acquire_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acquire
ret void
@@ -892,6 +1031,14 @@ define amdgpu_kernel void @workgroup_release_fence() {
; GFX11-CU-LABEL: workgroup_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: workgroup_release_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: workgroup_release_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") release
ret void
@@ -945,6 +1092,15 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
; GFX11-CU-LABEL: workgroup_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: workgroup_acq_rel_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: workgroup_acq_rel_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acq_rel
ret void
@@ -998,6 +1154,15 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
; GFX11-CU-LABEL: workgroup_seq_cst_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: workgroup_seq_cst_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: workgroup_seq_cst_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") seq_cst
ret void
@@ -1051,6 +1216,15 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
; GFX11-CU-LABEL: workgroup_one_as_acquire_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: workgroup_one_as_acquire_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: workgroup_one_as_acquire_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acquire
ret void
@@ -1100,6 +1274,14 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
; GFX11-CU-LABEL: workgroup_one_as_release_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: workgroup_one_as_release_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: workgroup_one_as_release_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") release
ret void
@@ -1153,6 +1335,15 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acq_rel
ret void
@@ -1206,6 +1397,15 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
; GFX1...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
39b5411
to
f0f3ffa
Compare
bool IsLastUse) const { | ||
assert(MI.mayLoad() && !MI.mayStore()); | ||
|
||
if (IsLastUse && !isScope(MI, AMDGPU::CPol::SCOPE_SYS)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What's the reason for not doing this for SCOPE_SYS?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TH_LOAD_LU is not available in SYS scope. th=3, scope=3 is TH_LOAD_BYPASS.
Unless that is also what we want. Docs describe similar behavior but I am confused as to why they are different policies.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For simplicity I think we should probably set th=3 regardless of scope.
If I understand correctly "last use" does not really mean anything useful for a system scope load. Normally "last use" means you can discard dirty data from any cache at the same or higher level than the scope. But for system scope, there is no cache at that scope, so there is nothing to do. But it is harmless to set th=3 anyway. @Pierre-vh
f0f3ffa
to
4a77ea0
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks good overall. I think we should document amdgpu.last.use
in AMDGPUUsage, in a new "LLVM IR Metadata" section like the existing "LLVM IR Intrinsics" documentation.
// Atomic instructions already bypass caches to the scope specified by the | ||
// SyncScope operand. Only non-atomic volatile and nontemporal instructions | ||
// need additional treatment. | ||
Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(), | ||
SIMemOp::LOAD, MOI.isVolatile(), | ||
MOI.isNonTemporal()); | ||
|
||
Changed |= CC->enableLastUse(Inst, MOI.isLastUse()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Move this before enableVolatileAndOrNonTemporal since it no longer depends on scope?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It could still be overridden by nontemporal.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agreed, lastuse should take priority over nontemporal. (Do you have any tests for that?) I think this would be more obvious if you moved the lastuse logic into enableVolatileAndOrNonTemporal, instead of having a separate enableLastUse, but I won't insist.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I moved it to enableVolatileAndOrNonTemporal but had to update methods in all subclasses as well.
Convert !amdgpu.last.use metadata into Machine Memory Operand for last use and handle it in SIMemoryLegalizer similarly to nontemporal and volatile.
// Atomic instructions already bypass caches to the scope specified by the | ||
// SyncScope operand. Only non-atomic volatile and nontemporal instructions | ||
// need additional treatment. | ||
Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(), | ||
SIMemOp::LOAD, MOI.isVolatile(), | ||
MOI.isNonTemporal()); | ||
|
||
Changed |= CC->enableLastUse(Inst, MOI.isLastUse()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agreed, lastuse should take priority over nontemporal. (Do you have any tests for that?) I think this would be more obvious if you moved the lastuse logic into enableVolatileAndOrNonTemporal, instead of having a separate enableLastUse, but I won't insist.
4a77ea0
to
af0c8df
Compare
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks. Please wait until tomorrow for any other comments.
Convert !amdgpu.last.use metadata into MachineMemOperand for last use
and handle it in SIMemoryLegalizer similar to nontemporal and volatile.