-
Notifications
You must be signed in to change notification settings - Fork 12.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64] Avoid NEON dot product in streaming[-compatible] functions #101677
Merged
sdesmalen-arm
merged 2 commits into
llvm:main
from
sdesmalen-arm:sme-dont-emit-neon-dot-in-streaming-mode
Aug 2, 2024
Merged
[AArch64] Avoid NEON dot product in streaming[-compatible] functions #101677
sdesmalen-arm
merged 2 commits into
llvm:main
from
sdesmalen-arm:sme-dont-emit-neon-dot-in-streaming-mode
Aug 2, 2024
Conversation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
A follow-up patch will improve codegen for these functions.
@llvm/pr-subscribers-backend-aarch64 — Author: Sander de Smalen (sdesmalen-arm). Changes: The NEON dot product is not valid in streaming mode. Full diff: https://github.com/llvm/llvm-project/pull/101677.diff — 2 files affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 01079a95b4746..2e869f11b8431 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17866,6 +17866,9 @@ static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
const AArch64Subtarget *ST) {
+ if (!ST->isNeonAvailable())
+ return SDValue();
+
if (!ST->hasDotProd())
return performVecReduceAddCombineWithUADDLP(N, DAG);
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
new file mode 100644
index 0000000000000..00a15f4bcd639
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mattr=+dotprod,+sve < %s | FileCheck %s -check-prefix=DOT
+; RUN: llc -mattr=+dotprod,+sve -force-streaming-compatible < %s | FileCheck %s --check-prefix=STREAMING-SVE
+; RUN: llc -mattr=+dotprod,+sme -force-streaming < %s | FileCheck %s --check-prefix=STREAMING-SVE
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define i32 @reduce_uaddv_v16i8(<32 x i8> %a) {
+; CHECK-LABEL: reduce_uaddv_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll2 v2.8h, v1.16b, #0
+; CHECK-NEXT: ushll2 v3.8h, v0.16b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: uaddl2 v4.4s, v3.8h, v2.8h
+; CHECK-NEXT: uaddl v2.4s, v3.4h, v2.4h
+; CHECK-NEXT: uaddl2 v5.4s, v0.8h, v1.8h
+; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h
+; CHECK-NEXT: add v1.4s, v5.4s, v4.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+;
+; DOT-LABEL: reduce_uaddv_v16i8:
+; DOT: // %bb.0:
+; DOT-NEXT: movi v2.16b, #1
+; DOT-NEXT: movi v3.2d, #0000000000000000
+; DOT-NEXT: udot v3.4s, v1.16b, v2.16b
+; DOT-NEXT: udot v3.4s, v0.16b, v2.16b
+; DOT-NEXT: addv s0, v3.4s
+; DOT-NEXT: fmov w0, s0
+; DOT-NEXT: ret
+;
+; STREAMING-SVE-LABEL: reduce_uaddv_v16i8:
+; STREAMING-SVE: // %bb.0:
+; STREAMING-SVE-NEXT: // kill: def $q1 killed $q1 def $z1
+; STREAMING-SVE-NEXT: uunpklo z2.h, z1.b
+; STREAMING-SVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; STREAMING-SVE-NEXT: uunpklo z3.h, z0.b
+; STREAMING-SVE-NEXT: ptrue p0.s, vl4
+; STREAMING-SVE-NEXT: ext z1.b, z1.b, z1.b, #8
+; STREAMING-SVE-NEXT: ext z0.b, z0.b, z0.b, #8
+; STREAMING-SVE-NEXT: uunpklo z1.h, z1.b
+; STREAMING-SVE-NEXT: uunpklo z0.h, z0.b
+; STREAMING-SVE-NEXT: uunpklo z4.s, z2.h
+; STREAMING-SVE-NEXT: ext z2.b, z2.b, z2.b, #8
+; STREAMING-SVE-NEXT: uunpklo z6.s, z3.h
+; STREAMING-SVE-NEXT: ext z3.b, z3.b, z3.b, #8
+; STREAMING-SVE-NEXT: mov z5.d, z1.d
+; STREAMING-SVE-NEXT: uunpklo z7.s, z0.h
+; STREAMING-SVE-NEXT: ext z0.b, z0.b, z0.b, #8
+; STREAMING-SVE-NEXT: uunpklo z2.s, z2.h
+; STREAMING-SVE-NEXT: uunpklo z3.s, z3.h
+; STREAMING-SVE-NEXT: add z4.s, z6.s, z4.s
+; STREAMING-SVE-NEXT: ext z5.b, z5.b, z1.b, #8
+; STREAMING-SVE-NEXT: uunpklo z1.s, z1.h
+; STREAMING-SVE-NEXT: uunpklo z0.s, z0.h
+; STREAMING-SVE-NEXT: add z2.s, z3.s, z2.s
+; STREAMING-SVE-NEXT: uunpklo z5.s, z5.h
+; STREAMING-SVE-NEXT: add z1.s, z7.s, z1.s
+; STREAMING-SVE-NEXT: add z0.s, z0.s, z5.s
+; STREAMING-SVE-NEXT: add z1.s, z4.s, z1.s
+; STREAMING-SVE-NEXT: add z0.s, z2.s, z0.s
+; STREAMING-SVE-NEXT: add z0.s, z1.s, z0.s
+; STREAMING-SVE-NEXT: uaddv d0, p0, z0.s
+; STREAMING-SVE-NEXT: fmov x0, d0
+; STREAMING-SVE-NEXT: // kill: def $w0 killed $w0 killed $x0
+; STREAMING-SVE-NEXT: ret
+ %1 = zext <32 x i8> %a to <32 x i32>
+ %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
+ ret i32 %2
+}
+
+define i32 @reduce_saddv_v16i8(<32 x i8> %a) {
+; CHECK-LABEL: reduce_saddv_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sshll2 v2.8h, v1.16b, #0
+; CHECK-NEXT: sshll2 v3.8h, v0.16b, #0
+; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: saddl2 v4.4s, v3.8h, v2.8h
+; CHECK-NEXT: saddl v2.4s, v3.4h, v2.4h
+; CHECK-NEXT: saddl2 v5.4s, v0.8h, v1.8h
+; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h
+; CHECK-NEXT: add v1.4s, v5.4s, v4.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+;
+; DOT-LABEL: reduce_saddv_v16i8:
+; DOT: // %bb.0:
+; DOT-NEXT: movi v2.16b, #1
+; DOT-NEXT: movi v3.2d, #0000000000000000
+; DOT-NEXT: sdot v3.4s, v1.16b, v2.16b
+; DOT-NEXT: sdot v3.4s, v0.16b, v2.16b
+; DOT-NEXT: addv s0, v3.4s
+; DOT-NEXT: fmov w0, s0
+; DOT-NEXT: ret
+;
+; STREAMING-SVE-LABEL: reduce_saddv_v16i8:
+; STREAMING-SVE: // %bb.0:
+; STREAMING-SVE-NEXT: // kill: def $q1 killed $q1 def $z1
+; STREAMING-SVE-NEXT: sunpklo z2.h, z1.b
+; STREAMING-SVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; STREAMING-SVE-NEXT: sunpklo z3.h, z0.b
+; STREAMING-SVE-NEXT: ptrue p0.s, vl4
+; STREAMING-SVE-NEXT: ext z1.b, z1.b, z1.b, #8
+; STREAMING-SVE-NEXT: ext z0.b, z0.b, z0.b, #8
+; STREAMING-SVE-NEXT: sunpklo z1.h, z1.b
+; STREAMING-SVE-NEXT: sunpklo z0.h, z0.b
+; STREAMING-SVE-NEXT: sunpklo z4.s, z2.h
+; STREAMING-SVE-NEXT: ext z2.b, z2.b, z2.b, #8
+; STREAMING-SVE-NEXT: sunpklo z6.s, z3.h
+; STREAMING-SVE-NEXT: ext z3.b, z3.b, z3.b, #8
+; STREAMING-SVE-NEXT: mov z5.d, z1.d
+; STREAMING-SVE-NEXT: sunpklo z7.s, z0.h
+; STREAMING-SVE-NEXT: ext z0.b, z0.b, z0.b, #8
+; STREAMING-SVE-NEXT: sunpklo z2.s, z2.h
+; STREAMING-SVE-NEXT: sunpklo z3.s, z3.h
+; STREAMING-SVE-NEXT: add z4.s, z6.s, z4.s
+; STREAMING-SVE-NEXT: ext z5.b, z5.b, z1.b, #8
+; STREAMING-SVE-NEXT: sunpklo z1.s, z1.h
+; STREAMING-SVE-NEXT: sunpklo z0.s, z0.h
+; STREAMING-SVE-NEXT: add z2.s, z3.s, z2.s
+; STREAMING-SVE-NEXT: sunpklo z5.s, z5.h
+; STREAMING-SVE-NEXT: add z1.s, z7.s, z1.s
+; STREAMING-SVE-NEXT: add z0.s, z0.s, z5.s
+; STREAMING-SVE-NEXT: add z1.s, z4.s, z1.s
+; STREAMING-SVE-NEXT: add z0.s, z2.s, z0.s
+; STREAMING-SVE-NEXT: add z0.s, z1.s, z0.s
+; STREAMING-SVE-NEXT: uaddv d0, p0, z0.s
+; STREAMING-SVE-NEXT: fmov x0, d0
+; STREAMING-SVE-NEXT: // kill: def $w0 killed $w0 killed $x0
+; STREAMING-SVE-NEXT: ret
+ %1 = sext <32 x i8> %a to <32 x i32>
+ %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
+ ret i32 %2
+}
|
jroelofs
approved these changes
Aug 2, 2024
/cherry-pick 12937b1 |
llvmbot
pushed a commit
to llvmbot/llvm-project
that referenced
this pull request
Aug 5, 2024
[AArch64] Avoid NEON dot product in streaming[-compatible] functions (llvm#101677) The NEON dot product is not valid in streaming mode. A follow-up patch will improve codegen for these operations. (cherry picked from commit 12937b1)
/pull-request #101933 |
banach-space
pushed a commit
to banach-space/llvm-project
that referenced
this pull request
Aug 7, 2024
[AArch64] Avoid NEON dot product in streaming[-compatible] functions (llvm#101677) The NEON dot product is not valid in streaming mode. A follow-up patch will improve codegen for these operations.
tru
pushed a commit
to llvmbot/llvm-project
that referenced
this pull request
Aug 10, 2024
[AArch64] Avoid NEON dot product in streaming[-compatible] functions (llvm#101677) The NEON dot product is not valid in streaming mode. A follow-up patch will improve codegen for these operations. (cherry picked from commit 12937b1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
The NEON dot product is not valid in streaming mode.
A follow-up patch will improve codegen for these operations.